/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 81 by nigel, Sat Feb 24 21:40:59 2007 UTC | revision 178 by ph10, Wed Jun 13 08:44:34 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
53    /* When DEBUG is defined, we need the pcre_printint() function, which is also
54    used by pcretest. DEBUG is not defined when building a production library. */
55    
56    #ifdef DEBUG
57    #include "pcre_printint.src"
58    #endif
59    
60    
61    /* Macro for setting individual bits in class bitmaps. */
62    
63    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
64    
65    
66  /*************************************************  /*************************************************
67  *      Code parameters and static tables         *  *      Code parameters and static tables         *
68  *************************************************/  *************************************************/
69    
70  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
71  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
72  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
73  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
74  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
75    so this number is very generous.
76    
77    The same workspace is used during the second, actual compile phase for
78    remembering forward references to groups so that they can be filled in at the
79    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
80    is 4 there is plenty of room. */
81    
82  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
83    
84    
85  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 87  are simple data values; negative values
87  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
88  is invalid. */  is invalid. */
89    
90  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
91  static const short int escapes[] = {  static const short int escapes[] = {
92       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
93       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
94     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
95       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
96  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
97  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
98     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
99       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
100  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
101       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
102  };  };
103    
104  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
105  static const short int escapes[] = {  static const short int escapes[] = {
106  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
107  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 87  static const short int escapes[] = { Line 111  static const short int escapes[] = {
111  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
112  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
113  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
114  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
115  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
116  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
117  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
118  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
119  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
120  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
121  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
122  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
123  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
124  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
125  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
126  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
127  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
128  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 107  static const short int escapes[] = { Line 131  static const short int escapes[] = {
131    
132    
133  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
134  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
135  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
136    
137  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 118  static const char *const posix_names[] = Line 142  static const char *const posix_names[] =
142  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
143    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
144    
145  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
146  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
147  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
148    characters are removed, and for [:alpha:] and [:alnum:] the underscore
149    character is removed. The triples in the table consist of the base map offset,
150    second map offset or -1 if no second map, and a non-negative value for map
151    addition or a negative value for map subtraction (if there are two maps). The
152    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
153    remove vertical space characters, 2 => remove underscore. */
154    
155  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
156    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
157    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
158    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
159    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
160    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
161    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
162    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
163    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
164    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
165    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
166    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
167    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
168    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
169    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
170  };  };
171    
172    
173    #define STRING(a)  # a
174    #define XSTRING(s) STRING(s)
175    
176  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
177  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
178    they are documented. Always add a new error instead. Messages marked DEAD below
179    are no longer used. */
180    
181  static const char *error_texts[] = {  static const char *error_texts[] = {
182    "no error",    "no error",
# Line 156  static const char *error_texts[] = { Line 191  static const char *error_texts[] = {
191    "range out of order in character class",    "range out of order in character class",
192    "nothing to repeat",    "nothing to repeat",
193    /* 10 */    /* 10 */
194    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
195    "internal error: unexpected repeat",    "internal error: unexpected repeat",
196    "unrecognized character after (?",    "unrecognized character after (?",
197    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 166  static const char *error_texts[] = { Line 201  static const char *error_texts[] = {
201    "erroffset passed as NULL",    "erroffset passed as NULL",
202    "unknown option bit(s) set",    "unknown option bit(s) set",
203    "missing ) after comment",    "missing ) after comment",
204    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
205    /* 20 */    /* 20 */
206    "regular expression too large",    "regular expression too large",
207    "failed to get memory",    "failed to get memory",
# Line 175  static const char *error_texts[] = { Line 210  static const char *error_texts[] = {
210    "unrecognized character after (?<",    "unrecognized character after (?<",
211    /* 25 */    /* 25 */
212    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
213    "malformed number after (?(",    "malformed number or name after (?(",
214    "conditional group contains more than two branches",    "conditional group contains more than two branches",
215    "assertion expected after (?(",    "assertion expected after (?(",
216    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
217    /* 30 */    /* 30 */
218    "unknown POSIX class name",    "unknown POSIX class name",
219    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
220    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
221    "spare error",    "spare error",  /** DEAD **/
222    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
223    /* 35 */    /* 35 */
224    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 194  static const char *error_texts[] = { Line 229  static const char *error_texts[] = {
229    /* 40 */    /* 40 */
230    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
231    "unrecognized character after (?P",    "unrecognized character after (?P",
232    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
233    "two named groups have the same name",    "two named subpatterns have the same name",
234    "invalid UTF-8 string",    "invalid UTF-8 string",
235    /* 45 */    /* 45 */
236    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
237    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
238    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
239      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
240      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
241      /* 50 */
242      "repeated subpattern is too long",
243      "octal value is greater than \\377 (not in UTF-8 mode)",
244      "internal error: overran compiling workspace",
245      "internal error: previously-checked referenced subpattern not found",
246      "DEFINE group contains more than one branch",
247      /* 55 */
248      "repeating a DEFINE group is not allowed",
249      "inconsistent NEWLINE options",
250      "\\g is not followed by a braced name or an optionally braced non-zero number",
251      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
252  };  };
253    
254    
# Line 220  For convenience, we use the same bit def Line 268  For convenience, we use the same bit def
268    
269  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
270    
271  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
272  static const unsigned char digitab[] =  static const unsigned char digitab[] =
273    {    {
274    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 304  static const unsigned char digitab[] =
304    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
306    
307  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
308  static const unsigned char digitab[] =  static const unsigned char digitab[] =
309    {    {
310    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 318  static const unsigned char digitab[] =
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
320    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
321    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
322    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
323    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
324    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 352  static const unsigned char ebcdic_charta
352    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
353    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
354    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
355    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
356    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
357    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 379  static const unsigned char ebcdic_charta
379  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
380    
381  static BOOL  static BOOL
382    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
383      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
384    
385    
386    
# Line 342  static BOOL Line 390  static BOOL
390    
391  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
392  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
393  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
394  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
395  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
396    ptr is pointing at the \. On exit, it is on the final character of the escape
397    sequence.
398    
399  Arguments:  Arguments:
400    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 362  static int Line 412  static int
412  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
413    int options, BOOL isclass)    int options, BOOL isclass)
414  {  {
415  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
416    const uschar *ptr = *ptrptr + 1;
417  int c, i;  int c, i;
418    
419    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
420    ptr--;                            /* Set pointer back to the last byte */
421    
422  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
423    
 c = *(++ptr);  
424  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
425    
426  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
427  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
428  Otherwise further processing may be required. */  Otherwise further processing may be required. */
429    
430  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
431  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
432  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
433    
434  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
435  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
436  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
437  #endif  #endif
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 441  else if ((i = escapes[c - 0x48]) != 0)
441  else  else
442    {    {
443    const uschar *oldptr;    const uschar *oldptr;
444      BOOL braced, negated;
445    
446    switch (c)    switch (c)
447      {      {
448      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 456  else
456      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
457      break;      break;
458    
459        /* \g must be followed by a number, either plain or braced. If positive, it
460        is an absolute backreference. If negative, it is a relative backreference.
461        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
462        reference to a named group. This is part of Perl's movement towards a
463        unified syntax for back references. As this is synonymous with \k{name}, we
464        fudge it up by pretending it really was \k. */
465    
466        case 'g':
467        if (ptr[1] == '{')
468          {
469          const uschar *p;
470          for (p = ptr+2; *p != 0 && *p != '}'; p++)
471            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
472          if (*p != 0 && *p != '}')
473            {
474            c = -ESC_k;
475            break;
476            }
477          braced = TRUE;
478          ptr++;
479          }
480        else braced = FALSE;
481    
482        if (ptr[1] == '-')
483          {
484          negated = TRUE;
485          ptr++;
486          }
487        else negated = FALSE;
488    
489        c = 0;
490        while ((digitab[ptr[1]] & ctype_digit) != 0)
491          c = c * 10 + *(++ptr) - '0';
492    
493        if (c == 0 || (braced && *(++ptr) != '}'))
494          {
495          *errorcodeptr = ERR57;
496          return 0;
497          }
498    
499        if (negated)
500          {
501          if (c > bracount)
502            {
503            *errorcodeptr = ERR15;
504            return 0;
505            }
506          c = bracount - (c - 1);
507          }
508    
509        c = -(ESC_REF + c);
510        break;
511    
512      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
513      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
514      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 442  else Line 550  else
550        }        }
551    
552      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
553      larger first octal digit. */      larger first octal digit. The original code used just to take the least
554        significant 8 bits of octal numbers (I think this is what early Perls used
555        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
556        than 3 octal digits. */
557    
558      case '0':      case '0':
559      c -= '0';      c -= '0';
560      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
561          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
562      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
563      break;      break;
564    
565      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
566      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
567        treated as a data character. */
568    
569      case 'x':      case 'x':
570  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
571        {        {
572        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
573        register int count = 0;        int count = 0;
574    
575        c = 0;        c = 0;
576        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
577          {          {
578          int cc = *pt++;          register int cc = *pt++;
579            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
580          count++;          count++;
581  #if !EBCDIC    /* ASCII coding */  
582    #ifndef EBCDIC  /* ASCII coding */
583          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
584          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
585  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
586          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
587          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
588  #endif  #endif
589          }          }
590    
591        if (*pt == '}')        if (*pt == '}')
592          {          {
593          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
594          ptr = pt;          ptr = pt;
595          break;          break;
596          }          }
597    
598        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
599        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
600        }        }
 #endif  
601    
602      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
603    
604      c = 0;      c = 0;
605      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
606        {        {
607        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
608        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
609  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
610        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
611        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
612  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
613        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
614        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
615  #endif  #endif
616        }        }
617      break;      break;
618    
619      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
620        This coding is ASCII-specific, but then the whole concept of \cx is
621        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
622    
623      case 'c':      case 'c':
624      c = *(++ptr);      c = *(++ptr);
# Line 511  else Line 628  else
628        return 0;        return 0;
629        }        }
630    
631      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
632      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
633      c ^= 0x40;      c ^= 0x40;
634  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
635      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
636      c ^= 0xC0;      c ^= 0xC0;
637  #endif  #endif
# Line 560  escape sequence. Line 673  escape sequence.
673  Argument:  Argument:
674    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
675    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
676      dptr           points to an int that is set to the detailed property value
677    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
678    
679  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
680  */  */
681    
682  static int  static int
683  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
684  {  {
685  int c, i, bot, top;  int c, i, bot, top;
686  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
687  char name[4];  char name[32];
688    
689  c = *(++ptr);  c = *(++ptr);
690  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
691    
692  *negptr = FALSE;  *negptr = FALSE;
693    
694  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
695  preceded by ^ for negation. */  negation. */
696    
697  if (c == '{')  if (c == '{')
698    {    {
# Line 587  if (c == '{') Line 701  if (c == '{')
701      *negptr = TRUE;      *negptr = TRUE;
702      ptr++;      ptr++;
703      }      }
704    for (i = 0; i <= 2; i++)    for (i = 0; i < sizeof(name) - 1; i++)
705      {      {
706      c = *(++ptr);      c = *(++ptr);
707      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
708      if (c == '}') break;      if (c == '}') break;
709      name[i] = c;      name[i] = c;
710      }      }
711    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
712    name[i] = 0;    name[i] = 0;
713    }    }
714    
# Line 619  top = _pcre_utt_size; Line 729  top = _pcre_utt_size;
729    
730  while (bot < top)  while (bot < top)
731    {    {
732    i = (bot + top)/2;    i = (bot + top) >> 1;
733    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
734    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
735        {
736        *dptr = _pcre_utt[i].value;
737        return _pcre_utt[i].type;
738        }
739    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
740    }    }
741    
 UNKNOWN_RETURN:  
742  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
743  *ptrptr = ptr;  *ptrptr = ptr;
744  return -1;  return -1;
# Line 741  return p; Line 854  return p;
854    
855    
856  /*************************************************  /*************************************************
857    *       Find forward referenced subpattern       *
858    *************************************************/
859    
860    /* This function scans along a pattern's text looking for capturing
861    subpatterns, and counting them. If it finds a named pattern that matches the
862    name it is given, it returns its number. Alternatively, if the name is NULL, it
863    returns when it reaches a given numbered subpattern. This is used for forward
864    references to subpatterns. We know that if (?P< is encountered, the name will
865    be terminated by '>' because that is checked in the first pass.
866    
867    Arguments:
868      ptr          current position in the pattern
869      count        current count of capturing parens so far encountered
870      name         name to seek, or NULL if seeking a numbered subpattern
871      lorn         name length, or subpattern number if name is NULL
872      xmode        TRUE if we are in /x mode
873    
874    Returns:       the number of the subpattern sought, or -1 if not found
875    */
876    
877    static int
878    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
879      BOOL xmode)
880    {
881    const uschar *thisname;
882    
883    for (; *ptr != 0; ptr++)
884      {
885      int term;
886    
887      /* Skip over backslashed characters and also entire \Q...\E */
888    
889      if (*ptr == '\\')
890        {
891        if (*(++ptr) == 0) return -1;
892        if (*ptr == 'Q') for (;;)
893          {
894          while (*(++ptr) != 0 && *ptr != '\\');
895          if (*ptr == 0) return -1;
896          if (*(++ptr) == 'E') break;
897          }
898        continue;
899        }
900    
901      /* Skip over character classes */
902    
903      if (*ptr == '[')
904        {
905        while (*(++ptr) != ']')
906          {
907          if (*ptr == '\\')
908            {
909            if (*(++ptr) == 0) return -1;
910            if (*ptr == 'Q') for (;;)
911              {
912              while (*(++ptr) != 0 && *ptr != '\\');
913              if (*ptr == 0) return -1;
914              if (*(++ptr) == 'E') break;
915              }
916            continue;
917            }
918          }
919        continue;
920        }
921    
922      /* Skip comments in /x mode */
923    
924      if (xmode && *ptr == '#')
925        {
926        while (*(++ptr) != 0 && *ptr != '\n');
927        if (*ptr == 0) return -1;
928        continue;
929        }
930    
931      /* An opening parens must now be a real metacharacter */
932    
933      if (*ptr != '(') continue;
934      if (ptr[1] != '?')
935        {
936        count++;
937        if (name == NULL && count == lorn) return count;
938        continue;
939        }
940    
941      ptr += 2;
942      if (*ptr == 'P') ptr++;                      /* Allow optional P */
943    
944      /* We have to disambiguate (?<! and (?<= from (?<name> */
945    
946      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
947           *ptr != '\'')
948        continue;
949    
950      count++;
951    
952      if (name == NULL && count == lorn) return count;
953      term = *ptr++;
954      if (term == '<') term = '>';
955      thisname = ptr;
956      while (*ptr != term) ptr++;
957      if (name != NULL && lorn == ptr - thisname &&
958          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
959        return count;
960      }
961    
962    return -1;
963    }
964    
965    
966    
967    /*************************************************
968  *      Find first significant op code            *  *      Find first significant op code            *
969  *************************************************/  *************************************************/
970    
# Line 789  for (;;) Line 1013  for (;;)
1013    
1014      case OP_CALLOUT:      case OP_CALLOUT:
1015      case OP_CREF:      case OP_CREF:
1016      case OP_BRANUMBER:      case OP_RREF:
1017        case OP_DEF:
1018      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1019      break;      break;
1020    
# Line 834  for (;;) Line 1059  for (;;)
1059    {    {
1060    int d;    int d;
1061    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1062    
1063    switch (op)    switch (op)
1064      {      {
1065        case OP_CBRA:
1066      case OP_BRA:      case OP_BRA:
1067      case OP_ONCE:      case OP_ONCE:
1068      case OP_COND:      case OP_COND:
1069      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1070      if (d < 0) return d;      if (d < 0) return d;
1071      branchlength += d;      branchlength += d;
1072      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 876  for (;;) Line 1101  for (;;)
1101      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1102    
1103      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1104      case OP_CREF:      case OP_CREF:
1105        case OP_RREF:
1106        case OP_DEF:
1107      case OP_OPT:      case OP_OPT:
1108      case OP_CALLOUT:      case OP_CALLOUT:
1109      case OP_SOD:      case OP_SOD:
# Line 895  for (;;) Line 1121  for (;;)
1121    
1122      case OP_CHAR:      case OP_CHAR:
1123      case OP_CHARNC:      case OP_CHARNC:
1124        case OP_NOT:
1125      branchlength++;      branchlength++;
1126      cc += 2;      cc += 2;
1127  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 928  for (;;) Line 1155  for (;;)
1155    
1156      case OP_PROP:      case OP_PROP:
1157      case OP_NOTPROP:      case OP_NOTPROP:
1158      cc++;      cc += 2;
1159      /* Fall through */      /* Fall through */
1160    
1161      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 1009  Returns:      pointer to the opcode for Line 1236  Returns:      pointer to the opcode for
1236  static const uschar *  static const uschar *
1237  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1238  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1239  for (;;)  for (;;)
1240    {    {
1241    register int c = *code;    register int c = *code;
1242    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1243    else if (c > OP_BRA)  
1244      /* XCLASS is used for classes that cannot be represented just by a bit
1245      map. This includes negated single high-valued characters. The length in
1246      the table is zero; the actual length is stored in the compiled code. */
1247    
1248      if (c == OP_XCLASS) code += GET(code, 1);
1249    
1250      /* Handle capturing bracket */
1251    
1252      else if (c == OP_CBRA)
1253      {      {
1254      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1255      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1256      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1257      }      }
1258    
1259      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1260      a multi-byte character. The length in the table is a minimum, so we have to
1261      arrange to skip the extra bytes. */
1262    
1263    else    else
1264      {      {
1265      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1266  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1267      if (utf8) switch(c)      if (utf8) switch(c)
1268        {        {
1269        case OP_CHAR:        case OP_CHAR:
# Line 1042  for (;;) Line 1271  for (;;)
1271        case OP_EXACT:        case OP_EXACT:
1272        case OP_UPTO:        case OP_UPTO:
1273        case OP_MINUPTO:        case OP_MINUPTO:
1274          case OP_POSUPTO:
1275        case OP_STAR:        case OP_STAR:
1276        case OP_MINSTAR:        case OP_MINSTAR:
1277          case OP_POSSTAR:
1278        case OP_PLUS:        case OP_PLUS:
1279        case OP_MINPLUS:        case OP_MINPLUS:
1280          case OP_POSPLUS:
1281        case OP_QUERY:        case OP_QUERY:
1282        case OP_MINQUERY:        case OP_MINQUERY:
1283        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1284        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1285        break;        break;
1286        }        }
1287  #endif  #endif
# Line 1083  Returns:      pointer to the opcode for Line 1308  Returns:      pointer to the opcode for
1308  static const uschar *  static const uschar *
1309  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1310  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1311  for (;;)  for (;;)
1312    {    {
1313    register int c = *code;    register int c = *code;
1314    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1315    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1316    else if (c > OP_BRA)  
1317      {    /* XCLASS is used for classes that cannot be represented just by a bit
1318      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1319      }    the table is zero; the actual length is stored in the compiled code. */
1320    
1321      if (c == OP_XCLASS) code += GET(code, 1);
1322    
1323      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1324      that are followed by a character may be followed by a multi-byte character.
1325      The length in the table is a minimum, so we have to arrange to skip the extra
1326      bytes. */
1327    
1328    else    else
1329      {      {
1330      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1331  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1332      if (utf8) switch(c)      if (utf8) switch(c)
1333        {        {
1334        case OP_CHAR:        case OP_CHAR:
# Line 1114  for (;;) Line 1336  for (;;)
1336        case OP_EXACT:        case OP_EXACT:
1337        case OP_UPTO:        case OP_UPTO:
1338        case OP_MINUPTO:        case OP_MINUPTO:
1339          case OP_POSUPTO:
1340        case OP_STAR:        case OP_STAR:
1341        case OP_MINSTAR:        case OP_MINSTAR:
1342          case OP_POSSTAR:
1343        case OP_PLUS:        case OP_PLUS:
1344        case OP_MINPLUS:        case OP_MINPLUS:
1345          case OP_POSPLUS:
1346        case OP_QUERY:        case OP_QUERY:
1347        case OP_MINQUERY:        case OP_MINQUERY:
1348        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1349        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1350        break;        break;
1351        }        }
1352  #endif  #endif
# Line 1143  for (;;) Line 1361  for (;;)
1361  *************************************************/  *************************************************/
1362    
1363  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1364  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1365  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1366  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1367  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1368    struck an inner bracket whose current branch will already have been scanned.
1369    
1370  Arguments:  Arguments:
1371    code        points to start of search    code        points to start of search
# Line 1160  static BOOL Line 1379  static BOOL
1379  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1380  {  {
1381  register int c;  register int c;
1382  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1383       code < endcode;       code < endcode;
1384       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1385    {    {
# Line 1168  for (code = first_significant_code(code Line 1387  for (code = first_significant_code(code
1387    
1388    c = *code;    c = *code;
1389    
1390    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1391    
1392      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1393        {
1394        code += _pcre_OP_lengths[c];
1395        do code += GET(code, 1); while (*code == OP_ALT);
1396        c = *code;
1397        continue;
1398        }
1399    
1400      /* For other groups, scan the branches. */
1401    
1402      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1403      {      {
1404      BOOL empty_branch;      BOOL empty_branch;
1405      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1184  for (code = first_significant_code(code Line 1415  for (code = first_significant_code(code
1415        }        }
1416      while (*code == OP_ALT);      while (*code == OP_ALT);
1417      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1418      c = *code;      c = *code;
1419        continue;
1420      }      }
1421    
1422    else switch (c)    /* Handle the other opcodes */
1423    
1424      switch (c)
1425      {      {
1426      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1427    
# Line 1244  for (code = first_significant_code(code Line 1477  for (code = first_significant_code(code
1477      case OP_NOT:      case OP_NOT:
1478      case OP_PLUS:      case OP_PLUS:
1479      case OP_MINPLUS:      case OP_MINPLUS:
1480        case OP_POSPLUS:
1481      case OP_EXACT:      case OP_EXACT:
1482      case OP_NOTPLUS:      case OP_NOTPLUS:
1483      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1484        case OP_NOTPOSPLUS:
1485      case OP_NOTEXACT:      case OP_NOTEXACT:
1486      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1487      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1488        case OP_TYPEPOSPLUS:
1489      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1490      return FALSE;      return FALSE;
1491    
# Line 1261  for (code = first_significant_code(code Line 1497  for (code = first_significant_code(code
1497      case OP_ALT:      case OP_ALT:
1498      return TRUE;      return TRUE;
1499    
1500      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1501      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1502    
1503  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1504      case OP_STAR:      case OP_STAR:
1505      case OP_MINSTAR:      case OP_MINSTAR:
1506        case OP_POSSTAR:
1507      case OP_QUERY:      case OP_QUERY:
1508      case OP_MINQUERY:      case OP_MINQUERY:
1509        case OP_POSQUERY:
1510      case OP_UPTO:      case OP_UPTO:
1511      case OP_MINUPTO:      case OP_MINUPTO:
1512        case OP_POSUPTO:
1513      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1514      break;      break;
1515  #endif  #endif
# Line 1388  earlier groups that are outside the curr Line 1627  earlier groups that are outside the curr
1627  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1628  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1629  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1630  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1631  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1632    
1633    This function has been extended with the possibility of forward references for
1634    recursions and subroutine calls. It must also check the list of such references
1635    for the group we are dealing with. If it finds that one of the recursions in
1636    the current group is on this list, it adjusts the offset in the list, not the
1637    value in the reference (which is a group number).
1638    
1639  Arguments:  Arguments:
1640    group      points to the start of the group    group      points to the start of the group
1641    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1642    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1643    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1644      save_hwm   the hwm forward reference pointer at the start of the group
1645    
1646  Returns:     nothing  Returns:     nothing
1647  */  */
1648    
1649  static void  static void
1650  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1651      uschar *save_hwm)
1652  {  {
1653  uschar *ptr = group;  uschar *ptr = group;
1654  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1655    {    {
1656    int offset = GET(ptr, 1);    int offset;
1657    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1658    
1659      /* See if this recursion is on the forward reference list. If so, adjust the
1660      reference. */
1661    
1662      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1663        {
1664        offset = GET(hc, 0);
1665        if (cd->start_code + offset == ptr + 1)
1666          {
1667          PUT(hc, 0, offset + adjust);
1668          break;
1669          }
1670        }
1671    
1672      /* Otherwise, adjust the recursion offset if it's after the start of this
1673      group. */
1674    
1675      if (hc >= cd->hwm)
1676        {
1677        offset = GET(ptr, 1);
1678        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1679        }
1680    
1681    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1682    }    }
1683  }  }
# Line 1486  Yield:        TRUE when range returned; Line 1756  Yield:        TRUE when range returned;
1756  */  */
1757    
1758  static BOOL  static BOOL
1759  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1760      unsigned int *odptr)
1761  {  {
1762  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1763    
1764  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1765    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1766    
1767  if (c > d) return FALSE;  if (c > d) return FALSE;
1768    
# Line 1503  next = othercase + 1; Line 1771  next = othercase + 1;
1771    
1772  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1773    {    {
1774    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1775    next++;    next++;
1776    }    }
1777    
# Line 1517  return TRUE; Line 1783  return TRUE;
1783  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1784    
1785    
1786    
1787  /*************************************************  /*************************************************
1788  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1789  *************************************************/  *************************************************/
1790    
1791  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1792  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1793  bits.  sense to automatically possessify the repeated item.
1794    
1795  Arguments:  Arguments:
1796    optionsptr     pointer to the option bits    op_code       the repeated op code
1797    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1798    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1799    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1800    errorcodeptr   points to error code variable    ptr           next character in pattern
1801    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1802    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1803    
1804  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1805  */  */
1806    
1807  static BOOL  static BOOL
1808  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1809    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1810  {  {
1811  int repeat_type, op_type;  int next;
1812  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
1813  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
1814  int greedy_default, greedy_non_default;  
1815  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
1816  int zeroreqbyte, zerofirstbyte;    {
1817  int req_caseopt, reqvary, tempreqvary;    for (;;)
1818  int condcount = 0;      {
1819  int options = *optionsptr;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1820  int after_manual_callout = 0;      if (*ptr == '#')
1821  register int c;        {
1822  register uschar *code = *codeptr;        while (*(++ptr) != 0)
1823  uschar *tempcode;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1824  BOOL inescq = FALSE;        }
1825  BOOL groupsetfirstbyte = FALSE;      else break;
1826  const uschar *ptr = *ptrptr;      }
1827  const uschar *tempptr;    }
1828  uschar *previous = NULL;  
1829  uschar *previous_callout = NULL;  /* If the next item is one that we can handle, get its value. A non-negative
1830  uschar classbits[32];  value is a character, a negative value is an escape value. */
1831    
1832    if (*ptr == '\\')
1833      {
1834      int temperrorcode = 0;
1835      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1836      if (temperrorcode != 0) return FALSE;
1837      ptr++;    /* Point after the escape sequence */
1838      }
1839    
1840    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1841      {
1842  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1843  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1844  #endif  #endif
1845      next = *ptr++;
1846      }
1847    
1848  /* Set up the default and non-default settings for greediness */  else return FALSE;
1849    
1850  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
1851    
1852  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1853  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1854  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1855  find one.      {
1856        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1857        if (*ptr == '#')
1858          {
1859          while (*(++ptr) != 0)
1860            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1861          }
1862        else break;
1863        }
1864      }
1865    
1866  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1867    
1868  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1869      return FALSE;
1870    
1871  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
1872  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
1873  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
1874  case status of the value. This is used only for ASCII characters. */  utf8_char. */
1875    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
1876    
1877  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
1878    
1879  for (;; ptr++)  if (next >= 0) switch(op_code)
1880    {    {
1881    BOOL negate_class;    case OP_CHAR:
1882    BOOL possessive_quantifier;  #ifdef SUPPORT_UTF8
1883    BOOL is_quantifier;    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1884    int class_charcount;  #endif
1885    int class_lastchar;    return item != next;
   int newoptions;  
   int recno;  
   int skipbytes;  
   int subreqbyte;  
   int subfirstbyte;  
   int mclength;  
   uschar mcbuffer[8];  
1886    
1887    /* Next byte in the pattern */    /* For CHARNC (caseless character) we must check the other case. If we have
1888      Unicode property support, we can use it to test the other case of
1889      high-valued characters. */
1890    
1891      case OP_CHARNC:
1892    #ifdef SUPPORT_UTF8
1893      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894    #endif
1895      if (item == next) return FALSE;
1896    #ifdef SUPPORT_UTF8
1897      if (utf8)
1898        {
1899        unsigned int othercase;
1900        if (next < 128) othercase = cd->fcc[next]; else
1901    #ifdef SUPPORT_UCP
1902        othercase = _pcre_ucp_othercase((unsigned int)next);
1903    #else
1904        othercase = NOTACHAR;
1905    #endif
1906        return (unsigned int)item != othercase;
1907        }
1908      else
1909    #endif  /* SUPPORT_UTF8 */
1910      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1911    
1912      /* For OP_NOT, "item" must be a single-byte character. */
1913    
1914      case OP_NOT:
1915      if (next < 0) return FALSE;  /* Not a character */
1916      if (item == next) return TRUE;
1917      if ((options & PCRE_CASELESS) == 0) return FALSE;
1918    #ifdef SUPPORT_UTF8
1919      if (utf8)
1920        {
1921        unsigned int othercase;
1922        if (next < 128) othercase = cd->fcc[next]; else
1923    #ifdef SUPPORT_UCP
1924        othercase = _pcre_ucp_othercase(next);
1925    #else
1926        othercase = NOTACHAR;
1927    #endif
1928        return (unsigned int)item == othercase;
1929        }
1930      else
1931    #endif  /* SUPPORT_UTF8 */
1932      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1933    
1934      case OP_DIGIT:
1935      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1936    
1937      case OP_NOT_DIGIT:
1938      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1939    
1940      case OP_WHITESPACE:
1941      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1942    
1943      case OP_NOT_WHITESPACE:
1944      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1945    
1946      case OP_WORDCHAR:
1947      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1948    
1949      case OP_NOT_WORDCHAR:
1950      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1951    
1952      default:
1953      return FALSE;
1954      }
1955    
1956    
1957    /* Handle the case when the next item is \d, \s, etc. */
1958    
1959    switch(op_code)
1960      {
1961      case OP_CHAR:
1962      case OP_CHARNC:
1963    #ifdef SUPPORT_UTF8
1964      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1965    #endif
1966      switch(-next)
1967        {
1968        case ESC_d:
1969        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1970    
1971        case ESC_D:
1972        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1973    
1974        case ESC_s:
1975        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1976    
1977        case ESC_S:
1978        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1979    
1980        case ESC_w:
1981        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1982    
1983        case ESC_W:
1984        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1985    
1986        default:
1987        return FALSE;
1988        }
1989    
1990      case OP_DIGIT:
1991      return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1992    
1993      case OP_NOT_DIGIT:
1994      return next == -ESC_d;
1995    
1996      case OP_WHITESPACE:
1997      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1998    
1999      case OP_NOT_WHITESPACE:
2000      return next == -ESC_s;
2001    
2002      case OP_WORDCHAR:
2003      return next == -ESC_W || next == -ESC_s;
2004    
2005      case OP_NOT_WORDCHAR:
2006      return next == -ESC_w || next == -ESC_d;
2007    
2008      default:
2009      return FALSE;
2010      }
2011    
2012    /* Control does not reach here */
2013    }
2014    
2015    
2016    
2017    /*************************************************
2018    *           Compile one branch                   *
2019    *************************************************/
2020    
2021    /* Scan the pattern, compiling it into a vector. If the options are
2022    changed during the branch, the pointer is used to change the external options
2023    bits. This function is used during the pre-compile phase when we are trying
2024    to find out the amount of memory needed, as well as during the real compile
2025    phase. The value of lengthptr distinguishes the two phases.
2026    
2027    Arguments:
2028      optionsptr     pointer to the option bits
2029      codeptr        points to the pointer to the current code point
2030      ptrptr         points to the current pattern pointer
2031      errorcodeptr   points to error code variable
2032      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2033      reqbyteptr     set to the last literal character required, else < 0
2034      bcptr          points to current branch chain
2035      cd             contains pointers to tables etc.
2036      lengthptr      NULL during the real compile phase
2037                     points to length accumulator during pre-compile phase
2038    
2039    Returns:         TRUE on success
2040                     FALSE, with *errorcodeptr set non-zero on error
2041    */
2042    
2043    static BOOL
2044    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2045      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2046      compile_data *cd, int *lengthptr)
2047    {
2048    int repeat_type, op_type;
2049    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2050    int bravalue = 0;
2051    int greedy_default, greedy_non_default;
2052    int firstbyte, reqbyte;
2053    int zeroreqbyte, zerofirstbyte;
2054    int req_caseopt, reqvary, tempreqvary;
2055    int options = *optionsptr;
2056    int after_manual_callout = 0;
2057    int length_prevgroup = 0;
2058    register int c;
2059    register uschar *code = *codeptr;
2060    uschar *last_code = code;
2061    uschar *orig_code = code;
2062    uschar *tempcode;
2063    BOOL inescq = FALSE;
2064    BOOL groupsetfirstbyte = FALSE;
2065    const uschar *ptr = *ptrptr;
2066    const uschar *tempptr;
2067    uschar *previous = NULL;
2068    uschar *previous_callout = NULL;
2069    uschar *save_hwm = NULL;
2070    uschar classbits[32];
2071    
2072    #ifdef SUPPORT_UTF8
2073    BOOL class_utf8;
2074    BOOL utf8 = (options & PCRE_UTF8) != 0;
2075    uschar *class_utf8data;
2076    uschar utf8_char[6];
2077    #else
2078    BOOL utf8 = FALSE;
2079    uschar *utf8_char = NULL;
2080    #endif
2081    
2082    #ifdef DEBUG
2083    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2084    #endif
2085    
2086    /* Set up the default and non-default settings for greediness */
2087    
2088    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2089    greedy_non_default = greedy_default ^ 1;
2090    
2091    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2092    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2093    matches a non-fixed first char; reqbyte just remains unset if we never
2094    find one.
2095    
2096    When we hit a repeat whose minimum is zero, we may have to adjust these values
2097    to take the zero repeat into account. This is implemented by setting them to
2098    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2099    item types that can be repeated set these backoff variables appropriately. */
2100    
2101    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2102    
2103    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2104    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2105    value > 255. It is added into the firstbyte or reqbyte variables to record the
2106    case status of the value. This is used only for ASCII characters. */
2107    
2108    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2109    
2110    /* Switch on next character until the end of the branch */
2111    
2112    for (;; ptr++)
2113      {
2114      BOOL negate_class;
2115      BOOL possessive_quantifier;
2116      BOOL is_quantifier;
2117      BOOL is_recurse;
2118      BOOL reset_bracount;
2119      int class_charcount;
2120      int class_lastchar;
2121      int newoptions;
2122      int recno;
2123      int refsign;
2124      int skipbytes;
2125      int subreqbyte;
2126      int subfirstbyte;
2127      int terminator;
2128      int mclength;
2129      uschar mcbuffer[8];
2130    
2131      /* Get next byte in the pattern */
2132    
2133    c = *ptr;    c = *ptr;
2134    
2135      /* If we are in the pre-compile phase, accumulate the length used for the
2136      previous cycle of this loop. */
2137    
2138      if (lengthptr != NULL)
2139        {
2140    #ifdef DEBUG
2141        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2142    #endif
2143        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2144          {
2145          *errorcodeptr = ERR52;
2146          goto FAILED;
2147          }
2148    
2149        /* There is at least one situation where code goes backwards: this is the
2150        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2151        the class is simply eliminated. However, it is created first, so we have to
2152        allow memory for it. Therefore, don't ever reduce the length at this point.
2153        */
2154    
2155        if (code < last_code) code = last_code;
2156        *lengthptr += code - last_code;
2157        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2158    
2159        /* If "previous" is set and it is not at the start of the work space, move
2160        it back to there, in order to avoid filling up the work space. Otherwise,
2161        if "previous" is NULL, reset the current code pointer to the start. */
2162    
2163        if (previous != NULL)
2164          {
2165          if (previous > orig_code)
2166            {
2167            memmove(orig_code, previous, code - previous);
2168            code -= previous - orig_code;
2169            previous = orig_code;
2170            }
2171          }
2172        else code = orig_code;
2173    
2174        /* Remember where this code item starts so we can pick up the length
2175        next time round. */
2176    
2177        last_code = code;
2178        }
2179    
2180      /* In the real compile phase, just check the workspace used by the forward
2181      reference list. */
2182    
2183      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2184        {
2185        *errorcodeptr = ERR52;
2186        goto FAILED;
2187        }
2188    
2189    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2190    
2191    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1634  for (;; ptr++) Line 2200  for (;; ptr++)
2200        {        {
2201        if (previous_callout != NULL)        if (previous_callout != NULL)
2202          {          {
2203          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2204              complete_callout(previous_callout, ptr, cd);
2205          previous_callout = NULL;          previous_callout = NULL;
2206          }          }
2207        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1655  for (;; ptr++) Line 2222  for (;; ptr++)
2222    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2223         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2224      {      {
2225      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2226          complete_callout(previous_callout, ptr, cd);
2227      previous_callout = NULL;      previous_callout = NULL;
2228      }      }
2229    
# Line 1666  for (;; ptr++) Line 2234  for (;; ptr++)
2234      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2235      if (c == '#')      if (c == '#')
2236        {        {
2237        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2238        on the Macintosh. */          {
2239        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2240        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2241          if (*ptr != 0) continue;
2242    
2243          /* Else fall through to handle end of string */
2244          c = 0;
2245        }        }
2246      }      }
2247    
# Line 1683  for (;; ptr++) Line 2255  for (;; ptr++)
2255    
2256    switch(c)    switch(c)
2257      {      {
2258      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2259        case 0:                        /* The branch terminates at string end */
2260      case 0:      case '|':                      /* or | or ) */
     case '|':  
2261      case ')':      case ')':
2262      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2263      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2264      *codeptr = code;      *codeptr = code;
2265      *ptrptr = ptr;      *ptrptr = ptr;
2266        if (lengthptr != NULL)
2267          {
2268          *lengthptr += code - last_code;   /* To include callout length */
2269          DPRINTF((">> end branch\n"));
2270          }
2271      return TRUE;      return TRUE;
2272    
2273    
2274        /* ===================================================================*/
2275      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2276      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2277    
# Line 1722  for (;; ptr++) Line 2300  for (;; ptr++)
2300      *code++ = OP_ANY;      *code++ = OP_ANY;
2301      break;      break;
2302    
2303      /* Character classes. If the included characters are all < 255 in value, we  
2304      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2305      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2306      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2307      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2308        map as usual, then invert it at the end. However, we use a different opcode
2309        so that data characters > 255 can be handled correctly.
2310    
2311      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2312      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1760  for (;; ptr++) Line 2340  for (;; ptr++)
2340        }        }
2341    
2342      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2343      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2344      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2345    
2346      class_charcount = 0;      class_charcount = 0;
2347      class_lastchar = -1;      class_lastchar = -1;
2348    
2349        /* Initialize the 32-char bit map to all zeros. We build the map in a
2350        temporary bit of memory, in case the class contains only 1 character (less
2351        than 256), because in that case the compiled code doesn't use the bit map.
2352        */
2353    
2354        memset(classbits, 0, 32 * sizeof(uschar));
2355    
2356  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2357      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2358      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2359  #endif  #endif
2360    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2361      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2362      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2363      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2364    
2365      do      if (c != 0) do
2366        {        {
2367          const uschar *oldptr;
2368    
2369  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2370        if (utf8 && c > 127)        if (utf8 && c > 127)
2371          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1797  for (;; ptr++) Line 2377  for (;; ptr++)
2377    
2378        if (inescq)        if (inescq)
2379          {          {
2380          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2381            {            {
2382            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2383            ptr++;            ptr++;                            /* Skip the 'E' */
2384            continue;            continue;                         /* Carry on with next */
2385            }            }
2386          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2387          }          }
2388    
2389        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1817  for (;; ptr++) Line 2397  for (;; ptr++)
2397            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2398          {          {
2399          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2400          int posix_class, i;          int posix_class, taboffset, tabopt;
2401          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2402            uschar pbits[32];
2403    
2404          if (ptr[1] != ':')          if (ptr[1] != ':')
2405            {            {
# Line 1847  for (;; ptr++) Line 2428  for (;; ptr++)
2428          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2429            posix_class = 0;            posix_class = 0;
2430    
2431          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2432          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2433          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2434          white space chars afterwards. */          result into the bit map that is being built. */
2435    
2436          posix_class *= 3;          posix_class *= 3;
2437          for (i = 0; i < 3; i++)  
2438            /* Copy in the first table (always present) */
2439    
2440            memcpy(pbits, cbits + posix_class_maps[posix_class],
2441              32 * sizeof(uschar));
2442    
2443            /* If there is a second table, add or remove it as required. */
2444    
2445            taboffset = posix_class_maps[posix_class + 1];
2446            tabopt = posix_class_maps[posix_class + 2];
2447    
2448            if (taboffset >= 0)
2449            {            {
2450            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2451            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2452            else            else
2453              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2454            }            }
2455    
2456            /* Now see if we need to remove any special characters. An option
2457            value of 1 removes vertical space and 2 removes underscore. */
2458    
2459            if (tabopt < 0) tabopt = -tabopt;
2460            if (tabopt == 1) pbits[1] &= ~0x3c;
2461              else if (tabopt == 2) pbits[11] &= 0x7f;
2462    
2463            /* Add the POSIX table or its complement into the main table that is
2464            being built and we are done. */
2465    
2466            if (local_negate)
2467              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2468            else
2469              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2470    
2471          ptr = tempptr + 1;          ptr = tempptr + 1;
2472          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2473          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2474          }          }
2475    
2476        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2477        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2478        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2479        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2480        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2481        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2482    
2483        if (c == '\\')        if (c == '\\')
2484          {          {
2485          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2486            if (*errorcodeptr != 0) goto FAILED;
2487    
2488          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2489          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2490            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2491          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2492            {            {
2493            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1906  for (;; ptr++) Line 2502  for (;; ptr++)
2502            {            {
2503            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2504            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2505            switch (-c)  
2506              /* Save time by not doing this in the pre-compile phase. */
2507    
2508              if (lengthptr == NULL) switch (-c)
2509              {              {
2510              case ESC_d:              case ESC_d:
2511              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1934  for (;; ptr++) Line 2533  for (;; ptr++)
2533              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2534              continue;              continue;
2535    
2536  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
2537              case ESC_p:              continue;
2538              case ESC_P:  
2539                default:    /* Not recognized; fall through */
2540                break;      /* Need "default" setting to stop compiler warning. */
2541                }
2542    
2543              /* In the pre-compile phase, just do the recognition. */
2544    
2545              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2546                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2547    
2548              /* We need to deal with \H, \h, \V, and \v in both phases because
2549              they use extra memory. */
2550    
2551              if (-c == ESC_h)
2552                {
2553                SETBIT(classbits, 0x09); /* HT */
2554                SETBIT(classbits, 0x20); /* SPACE */
2555                SETBIT(classbits, 0xa0); /* NBSP */
2556    #ifdef SUPPORT_UTF8
2557                if (utf8)
2558                  {
2559                  class_utf8 = TRUE;
2560                  *class_utf8data++ = XCL_SINGLE;
2561                  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2562                  *class_utf8data++ = XCL_SINGLE;
2563                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2564                  *class_utf8data++ = XCL_RANGE;
2565                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2566                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2567                  *class_utf8data++ = XCL_SINGLE;
2568                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2569                  *class_utf8data++ = XCL_SINGLE;
2570                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2571                  *class_utf8data++ = XCL_SINGLE;
2572                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2573                  }
2574    #endif
2575                continue;
2576                }
2577    
2578              if (-c == ESC_H)
2579                {
2580                for (c = 0; c < 32; c++)
2581                  {
2582                  int x = 0xff;
2583                  switch (c)
2584                    {
2585                    case 0x09/8: x ^= 1 << (0x09%8); break;
2586                    case 0x20/8: x ^= 1 << (0x20%8); break;
2587                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2588                    default: break;
2589                    }
2590                  classbits[c] |= x;
2591                  }
2592    
2593    #ifdef SUPPORT_UTF8
2594                if (utf8)
2595                  {
2596                  class_utf8 = TRUE;
2597                  *class_utf8data++ = XCL_RANGE;
2598                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2599                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2600                  *class_utf8data++ = XCL_RANGE;
2601                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2602                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2603                  *class_utf8data++ = XCL_RANGE;
2604                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2605                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2606                  *class_utf8data++ = XCL_RANGE;
2607                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2608                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2609                  *class_utf8data++ = XCL_RANGE;
2610                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2611                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2612                  *class_utf8data++ = XCL_RANGE;
2613                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2614                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2615                  *class_utf8data++ = XCL_RANGE;
2616                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2617                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2618                  }
2619    #endif
2620                continue;
2621                }
2622    
2623              if (-c == ESC_v)
2624                {
2625                SETBIT(classbits, 0x0a); /* LF */
2626                SETBIT(classbits, 0x0b); /* VT */
2627                SETBIT(classbits, 0x0c); /* FF */
2628                SETBIT(classbits, 0x0d); /* CR */
2629                SETBIT(classbits, 0x85); /* NEL */
2630    #ifdef SUPPORT_UTF8
2631                if (utf8)
2632                  {
2633                  class_utf8 = TRUE;
2634                  *class_utf8data++ = XCL_RANGE;
2635                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2636                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2637                  }
2638    #endif
2639                continue;
2640                }
2641    
2642              if (-c == ESC_V)
2643                {
2644                for (c = 0; c < 32; c++)
2645                {                {
2646                BOOL negated;                int x = 0xff;
2647                int property = get_ucp(&ptr, &negated, errorcodeptr);                switch (c)
2648                if (property < 0) goto FAILED;                  {
2649                    case 0x0a/8: x ^= 1 << (0x0a%8);
2650                                 x ^= 1 << (0x0b%8);
2651                                 x ^= 1 << (0x0c%8);
2652                                 x ^= 1 << (0x0d%8);
2653                                 break;
2654                    case 0x85/8: x ^= 1 << (0x85%8); break;
2655                    default: break;
2656                    }
2657                  classbits[c] |= x;
2658                  }
2659    
2660    #ifdef SUPPORT_UTF8
2661                if (utf8)
2662                  {
2663                class_utf8 = TRUE;                class_utf8 = TRUE;
2664                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_RANGE;
2665                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2666                *class_utf8data++ = property;                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2667                class_charcount -= 2;   /* Not a < 256 character */                *class_utf8data++ = XCL_RANGE;
2668                }                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2669                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2670                  }
2671    #endif
2672                continue;
2673                }
2674    
2675              /* We need to deal with \P and \p in both phases. */
2676    
2677    #ifdef SUPPORT_UCP
2678              if (-c == ESC_p || -c == ESC_P)
2679                {
2680                BOOL negated;
2681                int pdata;
2682                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2683                if (ptype < 0) goto FAILED;
2684                class_utf8 = TRUE;
2685                *class_utf8data++ = ((-c == ESC_p) != negated)?
2686                  XCL_PROP : XCL_NOTPROP;
2687                *class_utf8data++ = ptype;
2688                *class_utf8data++ = pdata;
2689                class_charcount -= 2;   /* Not a < 256 character */
2690              continue;              continue;
2691                }
2692  #endif  #endif
2693              /* Unrecognized escapes are faulted if PCRE is running in its
2694              strict mode. By default, for compatibility with Perl, they are
2695              treated as literals. */
2696    
2697              /* Unrecognized escapes are faulted if PCRE is running in its            if ((options & PCRE_EXTRA) != 0)
2698              strict mode. By default, for compatibility with Perl, they are              {
2699              treated as literals. */              *errorcodeptr = ERR7;
2700                goto FAILED;
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2701              }              }
2702    
2703              class_charcount -= 2;  /* Undo the default count from above */
2704              c = *ptr;              /* Get the final character and fall through */
2705            }            }
2706    
2707          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
2708          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
2709    
2710          }   /* End of backslash handling */          }   /* End of backslash handling */
2711    
2712        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2713        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2714        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2715          entirely. The code for handling \Q and \E is messy. */
2716    
2717        if (ptr[1] == '-' && ptr[2] != ']')        CHECK_RANGE:
2718          while (ptr[1] == '\\' && ptr[2] == 'E')
2719          {          {
2720          int d;          inescq = FALSE;
2721          ptr += 2;          ptr += 2;
2722            }
2723    
2724  #ifdef SUPPORT_UTF8        oldptr = ptr;
2725    
2726          if (!inescq && ptr[1] == '-')
2727            {
2728            int d;
2729            ptr += 2;
2730            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2731    
2732            /* If we hit \Q (not followed by \E) at this point, go into escaped
2733            mode. */
2734    
2735            while (*ptr == '\\' && ptr[1] == 'Q')
2736              {
2737              ptr += 2;
2738              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2739              inescq = TRUE;
2740              break;
2741              }
2742    
2743            if (*ptr == 0 || (!inescq && *ptr == ']'))
2744              {
2745              ptr = oldptr;
2746              goto LONE_SINGLE_CHARACTER;
2747              }
2748    
2749    #ifdef SUPPORT_UTF8
2750          if (utf8)          if (utf8)
2751            {                           /* Braces are required because the */            {                           /* Braces are required because the */
2752            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
# Line 1992  for (;; ptr++) Line 2759  for (;; ptr++)
2759          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2760          in such circumstances. */          in such circumstances. */
2761    
2762          if (d == '\\')          if (!inescq && d == '\\')
2763            {            {
2764            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2765            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2766    
2767            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2768            was literal */            special means the '-' was literal */
2769    
2770            if (d < 0)            if (d < 0)
2771              {              {
2772              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2773              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2774                else if (d == -ESC_R) d = 'R'; else
2775                {                {
2776                ptr = oldptr - 2;                ptr = oldptr;
2777                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2778                }                }
2779              }              }
2780            }            }
2781    
2782          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2783          the pre-pass. Optimize one-character ranges */          one-character ranges */
2784    
2785            if (d < c)
2786              {
2787              *errorcodeptr = ERR8;
2788              goto FAILED;
2789              }
2790    
2791          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2792    
# Line 2033  for (;; ptr++) Line 2807  for (;; ptr++)
2807  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2808            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2809              {              {
2810              int occ, ocd;              unsigned int occ, ocd;
2811              int cc = c;              unsigned int cc = c;
2812              int origd = d;              unsigned int origd = d;
2813              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2814                {                {
2815                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2816                      ocd <= (unsigned int)d)
2817                    continue;                          /* Skip embedded ranges */
2818    
2819                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2820                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2821                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2822                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2823                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2824                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2825                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2826                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2827                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2828                  d = ocd;                  d = ocd;
2829                  continue;                  continue;
# Line 2093  for (;; ptr++) Line 2871  for (;; ptr++)
2871          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2872          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2873    
2874          for (; c <= d; c++)          class_charcount += d - c + 1;
2875            class_lastchar = d;
2876    
2877            /* We can save a bit of time by skipping this in the pre-compile. */
2878    
2879            if (lengthptr == NULL) for (; c <= d; c++)
2880            {            {
2881            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2882            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2101  for (;; ptr++) Line 2884  for (;; ptr++)
2884              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2885              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2886              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2887            }            }
2888    
2889          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2126  for (;; ptr++) Line 2907  for (;; ptr++)
2907  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2908          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2909            {            {
2910            int chartype;            unsigned int othercase;
2911            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
2912              {              {
2913              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
2914              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2154  for (;; ptr++) Line 2933  for (;; ptr++)
2933          }          }
2934        }        }
2935    
2936      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
2937    
2938      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2939    
2940        if (c == 0)                          /* Missing terminating ']' */
2941          {
2942          *errorcodeptr = ERR6;
2943          goto FAILED;
2944          }
2945    
2946      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
2947      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2221  for (;; ptr++) Line 3005  for (;; ptr++)
3005    
3006      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3007      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3008      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3009    
3010  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3011      if (class_utf8)      if (class_utf8)
# Line 2231  for (;; ptr++) Line 3015  for (;; ptr++)
3015        code += LINK_SIZE;        code += LINK_SIZE;
3016        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3017    
3018        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3019        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3020    
3021        if (class_charcount > 0)        if (class_charcount > 0)
3022          {          {
3023          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3024            memmove(code + 32, code, class_utf8data - code);
3025          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3026          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3027          }          }
3028          else code = class_utf8data;
3029    
3030        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3031    
# Line 2265  for (;; ptr++) Line 3042  for (;; ptr++)
3042      if (negate_class)      if (negate_class)
3043        {        {
3044        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3045        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3046            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3047        }        }
3048      else      else
3049        {        {
# Line 2275  for (;; ptr++) Line 3053  for (;; ptr++)
3053      code += 32;      code += 32;
3054      break;      break;
3055    
3056    
3057        /* ===================================================================*/
3058      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3059      has been tested above. */      has been tested above. */
3060    
# Line 2342  for (;; ptr++) Line 3122  for (;; ptr++)
3122        }        }
3123      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3124    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3125      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3126      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3127      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2389  for (;; ptr++) Line 3155  for (;; ptr++)
3155          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3156          }          }
3157    
3158          /* If the repetition is unlimited, it pays to see if the next thing on
3159          the line is something that cannot possibly match this character. If so,
3160          automatically possessifying this item gains some performance in the case
3161          where the match fails. */
3162    
3163          if (!possessive_quantifier &&
3164              repeat_max < 0 &&
3165              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3166                options, cd))
3167            {
3168            repeat_type = 0;    /* Force greedy */
3169            possessive_quantifier = TRUE;
3170            }
3171    
3172        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3173        }        }
3174    
3175      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3176      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3177      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3178      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3179        currently used only for single-byte chars. */
3180    
3181      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3182        {        {
3183        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3184        c = previous[1];        c = previous[1];
3185          if (!possessive_quantifier &&
3186              repeat_max < 0 &&
3187              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3188            {
3189            repeat_type = 0;    /* Force greedy */
3190            possessive_quantifier = TRUE;
3191            }
3192        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3193        }        }
3194    
# Line 2414  for (;; ptr++) Line 3202  for (;; ptr++)
3202      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3203        {        {
3204        uschar *oldcode;        uschar *oldcode;
3205        int prop_type;        int prop_type, prop_value;
3206        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3207        c = *previous;        c = *previous;
3208    
3209          if (!possessive_quantifier &&
3210              repeat_max < 0 &&
3211              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3212            {
3213            repeat_type = 0;    /* Force greedy */
3214            possessive_quantifier = TRUE;
3215            }
3216    
3217        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3218        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3219          previous[1] : -1;          {
3220            prop_type = previous[1];
3221            prop_value = previous[2];
3222            }
3223          else prop_type = prop_value = -1;
3224    
3225        oldcode = code;        oldcode = code;
3226        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2454  for (;; ptr++) Line 3254  for (;; ptr++)
3254          }          }
3255    
3256        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3257        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3258        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3259        one less than the maximum. */        one less than the maximum. */
3260    
# Line 2481  for (;; ptr++) Line 3281  for (;; ptr++)
3281    
3282          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3283          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3284          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3285          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3286          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3287    
# Line 2497  for (;; ptr++) Line 3297  for (;; ptr++)
3297  #endif  #endif
3298              {              {
3299              *code++ = c;              *code++ = c;
3300              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3301                  {
3302                  *code++ = prop_type;
3303                  *code++ = prop_value;
3304                  }
3305              }              }
3306            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3307            }            }
3308    
3309          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3310          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3311            UPTO is just for 1 instance, we can use QUERY instead. */
3312    
3313          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3314            {            {
# Line 2516  for (;; ptr++) Line 3321  for (;; ptr++)
3321            else            else
3322  #endif  #endif
3323            *code++ = c;            *code++ = c;
3324            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3325                {
3326                *code++ = prop_type;
3327                *code++ = prop_value;
3328                }
3329            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3330            *code++ = OP_UPTO + repeat_type;  
3331            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3332                {
3333                *code++ = OP_QUERY + repeat_type;
3334                }
3335              else
3336                {
3337                *code++ = OP_UPTO + repeat_type;
3338                PUT2INC(code, 0, repeat_max);
3339                }
3340            }            }
3341          }          }
3342    
# Line 2535  for (;; ptr++) Line 3352  for (;; ptr++)
3352  #endif  #endif
3353        *code++ = c;        *code++ = c;
3354    
3355        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3356        defines the required property. */        define the required property. */
3357    
3358  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3359        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3360            {
3361            *code++ = prop_type;
3362            *code++ = prop_value;
3363            }
3364  #endif  #endif
3365        }        }
3366    
# Line 2582  for (;; ptr++) Line 3403  for (;; ptr++)
3403      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3404      cases. */      cases. */
3405    
3406      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3407               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3408        {        {
3409        register int i;        register int i;
3410        int ketoffset = 0;        int ketoffset = 0;
3411        int len = code - previous;        int len = code - previous;
3412        uschar *bralink = NULL;        uschar *bralink = NULL;
3413    
3414          /* Repeating a DEFINE group is pointless */
3415    
3416          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3417            {
3418            *errorcodeptr = ERR55;
3419            goto FAILED;
3420            }
3421    
3422          /* This is a paranoid check to stop integer overflow later on */
3423    
3424          if (len > MAX_DUPLENGTH)
3425            {
3426            *errorcodeptr = ERR50;
3427            goto FAILED;
3428            }
3429    
3430        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3431        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3432        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2624  for (;; ptr++) Line 3461  for (;; ptr++)
3461          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3462          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3463          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3464          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3465          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3466            doing this. */
3467    
3468          if (repeat_max <= 1)          if (repeat_max <= 1)
3469            {            {
3470            *code = OP_END;            *code = OP_END;
3471            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3472            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3473            code++;            code++;
3474            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2648  for (;; ptr++) Line 3486  for (;; ptr++)
3486            {            {
3487            int offset;            int offset;
3488            *code = OP_END;            *code = OP_END;
3489            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3490            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3491            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3492            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2668  for (;; ptr++) Line 3506  for (;; ptr++)
3506        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3507        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3508        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3509        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3510          forward reference subroutine calls in the group, there will be entries on
3511          the workspace list; replicate these with an appropriate increment. */
3512    
3513        else        else
3514          {          {
3515          if (repeat_min > 1)          if (repeat_min > 1)
3516            {            {
3517            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3518            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3519    
3520              if (lengthptr != NULL)
3521                *lengthptr += (repeat_min - 1)*length_prevgroup;
3522    
3523              /* This is compiling for real */
3524    
3525              else
3526              {              {
3527              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3528              code += len;              for (i = 1; i < repeat_min; i++)
3529                  {
3530                  uschar *hc;
3531                  uschar *this_hwm = cd->hwm;
3532                  memcpy(code, previous, len);
3533                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3534                    {
3535                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3536                    cd->hwm += LINK_SIZE;
3537                    }
3538                  save_hwm = this_hwm;
3539                  code += len;
3540                  }
3541              }              }
3542            }            }
3543    
3544          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3545          }          }
3546    
# Line 2688  for (;; ptr++) Line 3548  for (;; ptr++)
3548        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3549        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3550        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3551        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3552          replicate entries on the forward reference list. */
3553    
3554        if (repeat_max >= 0)        if (repeat_max >= 0)
3555          {          {
3556          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3557            just adjust the length as if we had. For each repetition we must add 1
3558            to the length for BRAZERO and for all but the last repetition we must
3559            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3560    
3561            if (lengthptr != NULL && repeat_max > 0)
3562              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3563                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3564    
3565            /* This is compiling for real */
3566    
3567            else for (i = repeat_max - 1; i >= 0; i--)
3568            {            {
3569              uschar *hc;
3570              uschar *this_hwm = cd->hwm;
3571    
3572            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3573    
3574            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2709  for (;; ptr++) Line 3584  for (;; ptr++)
3584              }              }
3585    
3586            memcpy(code, previous, len);            memcpy(code, previous, len);
3587              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3588                {
3589                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3590                cd->hwm += LINK_SIZE;
3591                }
3592              save_hwm = this_hwm;
3593            code += len;            code += len;
3594            }            }
3595    
# Line 2731  for (;; ptr++) Line 3612  for (;; ptr++)
3612        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3613        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3614        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3615        correct offset was computed above. */        correct offset was computed above.
3616    
3617        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
3618          this group is a non-atomic one that could match an empty string. If so,
3619          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3620          that runtime checking can be done. [This check is also applied to
3621          atomic groups at runtime, but in a different way.] */
3622    
3623          else
3624            {
3625            uschar *ketcode = code - ketoffset;
3626            uschar *bracode = ketcode - GET(ketcode, 1);
3627            *ketcode = OP_KETRMAX + repeat_type;
3628            if (lengthptr == NULL && *bracode != OP_ONCE)
3629              {
3630              uschar *scode = bracode;
3631              do
3632                {
3633                if (could_be_empty_branch(scode, ketcode, utf8))
3634                  {
3635                  *bracode += OP_SBRA - OP_BRA;
3636                  break;
3637                  }
3638                scode += GET(scode, 1);
3639                }
3640              while (*scode == OP_ALT);
3641              }
3642            }
3643        }        }
3644    
3645      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2744  for (;; ptr++) Line 3650  for (;; ptr++)
3650        goto FAILED;        goto FAILED;
3651        }        }
3652    
3653      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3654      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3655      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3656      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3657      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3658        but the special opcodes can optimize it a bit. The repeated item starts at
3659        tempcode, not at previous, which might be the first part of a string whose
3660        (former) last char we repeated.
3661    
3662        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3663        an 'upto' may follow. We skip over an 'exact' item, and then test the
3664        length of what remains before proceeding. */
3665    
3666      if (possessive_quantifier)      if (possessive_quantifier)
3667        {        {
3668        int len = code - tempcode;        int len;
3669        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3670        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3671        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3672        tempcode[0] = OP_ONCE;        len = code - tempcode;
3673        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3674        PUTINC(code, 0, len);          {
3675        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3676            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3677            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3678            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3679    
3680            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3681            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3682            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3683            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3684    
3685            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3686            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3687            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3688            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3689    
3690            default:
3691            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3692            code += 1 + LINK_SIZE;
3693            len += 1 + LINK_SIZE;
3694            tempcode[0] = OP_ONCE;
3695            *code++ = OP_KET;
3696            PUTINC(code, 0, len);
3697            PUT(tempcode, 1, len);
3698            break;
3699            }
3700        }        }
3701    
3702      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2772  for (;; ptr++) Line 3709  for (;; ptr++)
3709      break;      break;
3710    
3711    
3712      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3713      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3714      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3715      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3716      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3717      check for syntax errors here.  */      group. */
3718    
3719      case '(':      case '(':
3720      newoptions = options;      newoptions = options;
3721      skipbytes = 0;      skipbytes = 0;
3722        bravalue = OP_CBRA;
3723        save_hwm = cd->hwm;
3724        reset_bracount = FALSE;
3725    
3726      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3727        {        {
3728        int set, unset;        int i, set, unset, namelen;
3729        int *optset;        int *optset;
3730          const uschar *name;
3731          uschar *slot;
3732    
3733        switch (*(++ptr))        switch (*(++ptr))
3734          {          {
3735          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3736          ptr++;          ptr++;
3737          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3738            if (*ptr == 0)
3739              {
3740              *errorcodeptr = ERR18;
3741              goto FAILED;
3742              }
3743          continue;          continue;
3744    
3745          case ':':                 /* Non-extracting bracket */  
3746            /* ------------------------------------------------------------ */
3747            case '|':                 /* Reset capture count for each branch */
3748            reset_bracount = TRUE;
3749            /* Fall through */
3750    
3751            /* ------------------------------------------------------------ */
3752            case ':':                 /* Non-capturing bracket */
3753          bravalue = OP_BRA;          bravalue = OP_BRA;
3754          ptr++;          ptr++;
3755          break;          break;
3756    
3757    
3758            /* ------------------------------------------------------------ */
3759          case '(':          case '(':
3760          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3761    
3762          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3763            group), a name (referring to a named group), or 'R', referring to
3764            recursion. R<digits> and R&name are also permitted for recursion tests.
3765    
3766            There are several syntaxes for testing a named group: (?(name)) is used
3767            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3768    
3769            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3770            be the recursive thing or the name 'R' (and similarly for 'R' followed
3771            by digits), and (b) a number could be a name that consists of digits.
3772            In both cases, we look for a name first; if not found, we try the other
3773            cases. */
3774    
3775            /* For conditions that are assertions, check the syntax, and then exit
3776            the switch. This will take control down to where bracketed groups,
3777            including assertions, are processed. */
3778    
3779            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3780              break;
3781    
3782            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3783            below), and all need to skip 3 bytes at the start of the group. */
3784    
3785            code[1+LINK_SIZE] = OP_CREF;
3786            skipbytes = 3;
3787            refsign = -1;
3788    
3789            /* Check for a test for recursion in a named group. */
3790    
3791            if (ptr[1] == 'R' && ptr[2] == '&')
3792              {
3793              terminator = -1;
3794              ptr += 2;
3795              code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3796              }
3797    
3798            /* Check for a test for a named group's having been set, using the Perl
3799            syntax (?(<name>) or (?('name') */
3800    
3801            else if (ptr[1] == '<')
3802              {
3803              terminator = '>';
3804              ptr++;
3805              }
3806            else if (ptr[1] == '\'')
3807              {
3808              terminator = '\'';
3809              ptr++;
3810              }
3811            else
3812              {
3813              terminator = 0;
3814              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3815              }
3816    
3817            /* We now expect to read a name; anything else is an error */
3818    
3819            if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3820              {
3821              ptr += 1;  /* To get the right offset */
3822              *errorcodeptr = ERR28;
3823              goto FAILED;
3824              }
3825    
3826            /* Read the name, but also get it as a number if it's all digits */
3827    
3828          if (ptr[1] == 'R')          recno = 0;
3829            name = ++ptr;
3830            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3831              {
3832              if (recno >= 0)
3833                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3834                  recno * 10 + *ptr - '0' : -1;
3835              ptr++;
3836              }
3837            namelen = ptr - name;
3838    
3839            if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3840            {            {
3841            code[1+LINK_SIZE] = OP_CREF;            ptr--;      /* Error offset */
3842            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            *errorcodeptr = ERR26;
3843            skipbytes = 3;            goto FAILED;
           ptr += 3;  
3844            }            }
3845    
3846          /* Condition to test for a numbered subpattern match. We know that          /* Do no further checking in the pre-compile phase. */
         if a digit follows ( then there will just be digits until ) because  
         the syntax was checked in the first pass. */  
3847    
3848          else if ((digitab[ptr[1]] && ctype_digit) != 0)          if (lengthptr != NULL) break;
3849    
3850            /* In the real compile we do the work of looking for the actual
3851            reference. If the string started with "+" or "-" we require the rest to
3852            be digits, in which case recno will be set. */
3853    
3854            if (refsign > 0)
3855            {            {
3856            int condref;                 /* Don't amalgamate; some compilers */            if (recno <= 0)
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
3857              {              {
3858              *errorcodeptr = ERR35;              *errorcodeptr = ERR58;
3859              goto FAILED;              goto FAILED;
3860              }              }
3861            ptr++;            if (refsign == '-')
3862            code[1+LINK_SIZE] = OP_CREF;              {
3863            PUT2(code, 2+LINK_SIZE, condref);              recno = cd->bracount - recno + 1;
3864            skipbytes = 3;              if (recno <= 0)
3865                  {
3866                  *errorcodeptr = ERR15;
3867                  goto FAILED;
3868                  }
3869                }
3870              else recno += cd->bracount;
3871              PUT2(code, 2+LINK_SIZE, recno);
3872              break;
3873              }
3874    
3875            /* Otherwise (did not start with "+" or "-"), start by looking for the
3876            name. */
3877    
3878            slot = cd->name_table;
3879            for (i = 0; i < cd->names_found; i++)
3880              {
3881              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3882              slot += cd->name_entry_size;
3883              }
3884    
3885            /* Found a previous named subpattern */
3886    
3887            if (i < cd->names_found)
3888              {
3889              recno = GET2(slot, 0);
3890              PUT2(code, 2+LINK_SIZE, recno);
3891              }
3892    
3893            /* Search the pattern for a forward reference */
3894    
3895            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3896                            (options & PCRE_EXTENDED) != 0)) > 0)
3897              {
3898              PUT2(code, 2+LINK_SIZE, i);
3899              }
3900    
3901            /* If terminator == 0 it means that the name followed directly after
3902            the opening parenthesis [e.g. (?(abc)...] and in this case there are
3903            some further alternatives to try. For the cases where terminator != 0
3904            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3905            now checked all the possibilities, so give an error. */
3906    
3907            else if (terminator != 0)
3908              {
3909              *errorcodeptr = ERR15;
3910              goto FAILED;
3911              }
3912    
3913            /* Check for (?(R) for recursion. Allow digits after R to specify a
3914            specific group number. */
3915    
3916            else if (*name == 'R')
3917              {
3918              recno = 0;
3919              for (i = 1; i < namelen; i++)
3920                {
3921                if ((digitab[name[i]] & ctype_digit) == 0)
3922                  {
3923                  *errorcodeptr = ERR15;
3924                  goto FAILED;
3925                  }
3926                recno = recno * 10 + name[i] - '0';
3927                }
3928              if (recno == 0) recno = RREF_ANY;
3929              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3930              PUT2(code, 2+LINK_SIZE, recno);
3931              }
3932    
3933            /* Similarly, check for the (?(DEFINE) "condition", which is always
3934            false. */
3935    
3936            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3937              {
3938              code[1+LINK_SIZE] = OP_DEF;
3939              skipbytes = 1;
3940              }
3941    
3942            /* Check for the "name" actually being a subpattern number. */
3943    
3944            else if (recno > 0)
3945              {
3946              PUT2(code, 2+LINK_SIZE, recno);
3947              }
3948    
3949            /* Either an unidentified subpattern, or a reference to (?(0) */
3950    
3951            else
3952              {
3953              *errorcodeptr = (recno == 0)? ERR35: ERR15;
3954              goto FAILED;
3955            }            }
         /* For conditions that are assertions, we just fall through, having  
         set bravalue above. */  
3956          break;          break;
3957    
3958    
3959            /* ------------------------------------------------------------ */
3960          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
3961          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
3962          ptr++;          ptr++;
3963          break;          break;
3964    
3965    
3966            /* ------------------------------------------------------------ */
3967          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
3968          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
3969          ptr++;          ptr++;
3970          break;          break;
3971    
3972          case '<':                 /* Lookbehinds */  
3973          switch (*(++ptr))          /* ------------------------------------------------------------ */
3974            case '<':                 /* Lookbehind or named define */
3975            switch (ptr[1])
3976            {            {
3977            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
3978            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
3979            ptr++;            ptr += 2;
3980            break;            break;
3981    
3982            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
3983            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
3984            ptr++;            ptr += 2;
3985            break;            break;
3986    
3987              default:                /* Could be name define, else bad */
3988              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3989              ptr++;                  /* Correct offset for error */
3990              *errorcodeptr = ERR24;
3991              goto FAILED;
3992            }            }
3993          break;          break;
3994    
3995    
3996            /* ------------------------------------------------------------ */
3997          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
3998          bravalue = OP_ONCE;          bravalue = OP_ONCE;
3999          ptr++;          ptr++;
4000          break;          break;
4001    
4002    
4003            /* ------------------------------------------------------------ */
4004          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
4005          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
4006          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
4007          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
4008            {                       /* closing parenthesis is present. */            {
4009            int n = 0;            int n = 0;
4010            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
4011              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
4012              if (*ptr != ')')
4013                {
4014                *errorcodeptr = ERR39;
4015                goto FAILED;
4016                }
4017            if (n > 255)            if (n > 255)
4018              {              {
4019              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 2887  for (;; ptr++) Line 4027  for (;; ptr++)
4027          previous = NULL;          previous = NULL;
4028          continue;          continue;
4029    
4030          case 'P':                 /* Named subpattern handling */  
4031          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4032            case 'P':                 /* Python-style named subpattern handling */
4033            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4034              {
4035              is_recurse = *ptr == '>';
4036              terminator = ')';
4037              goto NAMED_REF_OR_RECURSE;
4038              }
4039            else if (*ptr != '<')    /* Test for Python-style definition */
4040              {
4041              *errorcodeptr = ERR41;
4042              goto FAILED;
4043              }
4044            /* Fall through to handle (?P< as (?< is handled */
4045    
4046    
4047            /* ------------------------------------------------------------ */
4048            DEFINE_NAME:    /* Come here from (?< handling */
4049            case '\'':
4050            {            {
4051            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
4052            uschar *slot = cd->name_table;            name = ++ptr;
4053            const uschar *name;     /* Don't amalgamate; some compilers */  
4054            name = ++ptr;           /* grumble at autoincrement in declaration */            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4055              namelen = ptr - name;
4056    
4057            while (*ptr++ != '>');            /* In the pre-compile phase, just do a syntax check. */
           namelen = ptr - name - 1;  
4058    
4059            for (i = 0; i < cd->names_found; i++)            if (lengthptr != NULL)
4060              {              {
4061              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
             if (crc == 0)  
4062                {                {
4063                if (slot[2+namelen] == 0)                *errorcodeptr = ERR42;
4064                  goto FAILED;
4065                  }
4066                if (cd->names_found >= MAX_NAME_COUNT)
4067                  {
4068                  *errorcodeptr = ERR49;
4069                  goto FAILED;
4070                  }
4071                if (namelen + 3 > cd->name_entry_size)
4072                  {
4073                  cd->name_entry_size = namelen + 3;
4074                  if (namelen > MAX_NAME_SIZE)
4075                  {                  {
4076                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
4077                  goto FAILED;                  goto FAILED;
4078                  }                  }
               crc = -1;             /* Current name is substring */  
               }  
             if (crc < 0)  
               {  
               memmove(slot + cd->name_entry_size, slot,  
                 (cd->names_found - i) * cd->name_entry_size);  
               break;  
4079                }                }
             slot += cd->name_entry_size;  
4080              }              }
4081    
4082            PUT2(slot, 0, *brackets + 1);            /* In the real compile, create the entry in the table */
           memcpy(slot + 2, name, namelen);  
           slot[2+namelen] = 0;  
           cd->names_found++;  
           goto NUMBERED_GROUP;  
           }  
   
         if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */  
           {  
           int i, namelen;  
           int type = *ptr++;  
           const uschar *name = ptr;  
           uschar *slot = cd->name_table;  
   
           while (*ptr != ')') ptr++;  
           namelen = ptr - name;  
4083    
4084            for (i = 0; i < cd->names_found; i++)            else
             {  
             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;  
             slot += cd->name_entry_size;  
             }  
           if (i >= cd->names_found)  
4085              {              {
4086              *errorcodeptr = ERR15;              slot = cd->name_table;
4087              goto FAILED;              for (i = 0; i < cd->names_found; i++)
4088                  {
4089                  int crc = memcmp(name, slot+2, namelen);
4090                  if (crc == 0)
4091                    {
4092                    if (slot[2+namelen] == 0)
4093                      {
4094                      if ((options & PCRE_DUPNAMES) == 0)
4095                        {
4096                        *errorcodeptr = ERR43;
4097                        goto FAILED;
4098                        }
4099                      }
4100                    else crc = -1;      /* Current name is substring */
4101                    }
4102                  if (crc < 0)
4103                    {
4104                    memmove(slot + cd->name_entry_size, slot,
4105                      (cd->names_found - i) * cd->name_entry_size);
4106                    break;
4107                    }
4108                  slot += cd->name_entry_size;
4109                  }
4110    
4111                PUT2(slot, 0, cd->bracount + 1);
4112                memcpy(slot + 2, name, namelen);
4113                slot[2+namelen] = 0;
4114              }              }
4115              }
4116    
4117            recno = GET2(slot, 0);          /* In both cases, count the number of names we've encountered. */
4118    
4119            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */          ptr++;                    /* Move past > or ' */
4120            cd->names_found++;
4121            goto NUMBERED_GROUP;
4122    
           /* Back reference */  
4123    
4124            previous = code;          /* ------------------------------------------------------------ */
4125            *code++ = OP_REF;          case '&':                 /* Perl recursion/subroutine syntax */
4126            PUT2INC(code, 0, recno);          terminator = ')';
4127            cd->backref_map |= (recno < 32)? (1 << recno) : 1;          is_recurse = TRUE;
4128            if (recno > cd->top_backref) cd->top_backref = recno;          /* Fall through */
           continue;  
           }  
4129    
4130          /* Should never happen */          /* We come here from the Python syntax above that handles both
4131          break;          references (?P=name) and recursion (?P>name), as well as falling
4132            through from the Perl recursion syntax (?&name). */
4133    
4134          case 'R':                 /* Pattern recursion */          NAMED_REF_OR_RECURSE:
4135          ptr++;                    /* Same as (?0)      */          name = ++ptr;
4136          /* Fall through */          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4137            namelen = ptr - name;
4138    
4139          /* Recursion or "subroutine" call */          /* In the pre-compile phase, do a syntax check and set a dummy
4140            reference number. */
4141    
4142          case '0': case '1': case '2': case '3': case '4':          if (lengthptr != NULL)
4143          case '5': case '6': case '7': case '8': case '9':            {
4144              if (*ptr != terminator)
4145                {
4146                *errorcodeptr = ERR42;
4147                goto FAILED;
4148                }
4149              if (namelen > MAX_NAME_SIZE)
4150                {
4151                *errorcodeptr = ERR48;
4152                goto FAILED;
4153                }
4154              recno = 0;
4155              }
4156    
4157            /* In the real compile, seek the name in the table */
4158    
4159            else
4160              {
4161              slot = cd->name_table;
4162              for (i = 0; i < cd->names_found; i++)
4163                {
4164                if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4165                slot += cd->name_entry_size;
4166                }
4167    
4168              if (i < cd->names_found)         /* Back reference */
4169                {
4170                recno = GET2(slot, 0);
4171                }
4172              else if ((recno =                /* Forward back reference */
4173                        find_parens(ptr, cd->bracount, name, namelen,
4174                          (options & PCRE_EXTENDED) != 0)) <= 0)
4175                {
4176                *errorcodeptr = ERR15;
4177                goto FAILED;
4178                }
4179              }
4180    
4181            /* In both phases, we can now go to the code that handles numerical
4182            recursion or backreferences. */
4183    
4184            if (is_recurse) goto HANDLE_RECURSION;
4185              else goto HANDLE_REFERENCE;
4186    
4187    
4188            /* ------------------------------------------------------------ */
4189            case 'R':                 /* Recursion */
4190            ptr++;                    /* Same as (?0)      */
4191            /* Fall through */
4192    
4193    
4194            /* ------------------------------------------------------------ */
4195            case '-': case '+':
4196            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4197            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4198            {            {
4199            const uschar *called;            const uschar *called;
4200    
4201              if ((refsign = *ptr) == '+') ptr++;
4202              else if (refsign == '-')
4203                {
4204                if ((digitab[ptr[1]] & ctype_digit) == 0)
4205                  goto OTHER_CHAR_AFTER_QUERY;
4206                ptr++;
4207                }
4208    
4209            recno = 0;            recno = 0;
4210            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4211              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4212    
4213              if (*ptr != ')')
4214                {
4215                *errorcodeptr = ERR29;
4216                goto FAILED;
4217                }
4218    
4219              if (refsign == '-')
4220                {
4221                if (recno == 0)
4222                  {
4223                  *errorcodeptr = ERR58;
4224                  goto FAILED;
4225                  }
4226                recno = cd->bracount - recno + 1;
4227                if (recno <= 0)
4228                  {
4229                  *errorcodeptr = ERR15;
4230                  goto FAILED;
4231                  }
4232                }
4233              else if (refsign == '+')
4234                {
4235                if (recno == 0)
4236                  {
4237                  *errorcodeptr = ERR58;
4238                  goto FAILED;
4239                  }
4240                recno += cd->bracount;
4241                }
4242    
4243            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4244    
4245            HANDLE_RECURSION:            HANDLE_RECURSION:
4246    
4247            previous = code;            previous = code;
4248              called = cd->start_code;
4249    
4250            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4251            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4252              this point. If we end up with a forward reference, first check that
4253            *code = OP_END;            the bracket does occur later so we can give the error (and position)
4254            called = (recno == 0)?            now. Then remember this forward reference in the workspace so it can
4255              cd->start_code : find_bracket(cd->start_code, utf8, recno);            be filled in at the end. */
4256    
4257            if (called == NULL)            if (lengthptr == NULL)
4258              {              {
4259              *errorcodeptr = ERR15;              *code = OP_END;
4260              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4261    
4262            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4263    
4264            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4265              {                {
4266              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4267              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4268                    {
4269                    *errorcodeptr = ERR15;
4270                    goto FAILED;
4271                    }
4272                  called = cd->start_code + recno;
4273                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4274                  }
4275    
4276                /* If not a forward reference, and the subpattern is still open,
4277                this is a recursive call. We check to see if this is a left
4278                recursion that could loop for ever, and diagnose that case. */
4279    
4280                else if (GET(called, 1) == 0 &&
4281                         could_be_empty(called, code, bcptr, utf8))
4282                  {
4283                  *errorcodeptr = ERR40;
4284                  goto FAILED;
4285                  }
4286              }              }
4287    
4288            /* Insert the recursion/subroutine item */            /* Insert the recursion/subroutine item, automatically wrapped inside
4289              "once" brackets. Set up a "previous group" length so that a
4290              subsequent quantifier will work. */
4291    
4292              *code = OP_ONCE;
4293              PUT(code, 1, 2 + 2*LINK_SIZE);
4294              code += 1 + LINK_SIZE;
4295    
4296            *code = OP_RECURSE;            *code = OP_RECURSE;
4297            PUT(code, 1, called - cd->start_code);            PUT(code, 1, called - cd->start_code);
4298            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4299    
4300              *code = OP_KET;
4301              PUT(code, 1, 2 + 2*LINK_SIZE);
4302              code += 1 + LINK_SIZE;
4303    
4304              length_prevgroup = 3 + 3*LINK_SIZE;
4305            }            }
4306    
4307            /* Can't determine a first byte now */
4308    
4309            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4310          continue;          continue;
4311    
         /* Character after (? not specially recognized */  
4312    
4313          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4314            default:              /* Other characters: check option setting */
4315            OTHER_CHAR_AFTER_QUERY:
4316          set = unset = 0;          set = unset = 0;
4317          optset = &set;          optset = &set;
4318    
# Line 3027  for (;; ptr++) Line 4322  for (;; ptr++)
4322              {              {
4323              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4324    
4325                case 'J':    /* Record that it changed in the external options */
4326                *optset |= PCRE_DUPNAMES;
4327                cd->external_options |= PCRE_JCHANGED;
4328                break;
4329    
4330              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4331              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4332              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4333              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4334              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4335              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4336    
4337                default:  *errorcodeptr = ERR12;
4338                          ptr--;    /* Correct the offset */
4339                          goto FAILED;
4340              }              }
4341            }            }
4342    
# Line 3041  for (;; ptr++) Line 4345  for (;; ptr++)
4345          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4346    
4347          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4348          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4349          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4350          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4351          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4352          a group), a resetting item can be compiled.          caseless checking of required bytes.
4353    
4354          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4355          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4356          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4357            that value after the start, because it gets reset as code is discarded
4358            during the pre-compile. However, this can happen only at top level - if
4359            we are within parentheses, the starting BRA will still be present. At
4360            any parenthesis level, the length value can be used to test if anything
4361            has been compiled at that level. Thus, a test for both these conditions
4362            is necessary to ensure we correctly detect the start of the pattern in
4363            both phases.
4364    
4365            If we are not at the pattern start, compile code to change the ims
4366            options if this setting actually changes any of them. We also pass the
4367            new setting back so that it can be put at the start of any following
4368            branches, and when this group ends (if we are in a group), a resetting
4369            item can be compiled. */
4370    
4371          if (*ptr == ')')          if (*ptr == ')')
4372            {            {
4373            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4374                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4375              {              {
4376              *code++ = OP_OPT;              cd->external_options = newoptions;
4377              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4378              }              }
4379             else
4380                {
4381                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4382                  {
4383                  *code++ = OP_OPT;
4384                  *code++ = newoptions & PCRE_IMS;
4385                  }
4386    
4387            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4388            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4389            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4390    
4391            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4392            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4393            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4394            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4395                }
4396    
4397            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4398            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3079  for (;; ptr++) Line 4405  for (;; ptr++)
4405    
4406          bravalue = OP_BRA;          bravalue = OP_BRA;
4407          ptr++;          ptr++;
4408          }          }     /* End of switch for character following (? */
4409        }        }       /* End of (? handling */
4410    
4411      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4412      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4413        brackets. */
4414    
4415      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4416        {        {
4417        bravalue = OP_BRA;        bravalue = OP_BRA;
4418        }        }
4419    
4420      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4421    
4422      else      else
4423        {        {
4424        NUMBERED_GROUP:        NUMBERED_GROUP:
4425        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4426          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4427          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4428        }        }
4429    
4430      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4431      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4432      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4433      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4434        they have changed. */
4435    
4436      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4437      *code = bravalue;      *code = bravalue;
4438      tempcode = code;      tempcode = code;
4439      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4440        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4441    
4442      if (!compile_regex(      if (!compile_regex(
4443           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4444           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4445           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4446           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4447           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4448           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4449            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4450           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           reset_bracount,               /* True if (?| group */
4451             skipbytes,                    /* Skip over bracket number */
4452           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4453           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4454           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4455           cd))                          /* Tables block */           cd,                           /* Tables block */
4456             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4457               &length_prevgroup           /* Pre-compile phase */
4458             ))
4459        goto FAILED;        goto FAILED;
4460    
4461      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3139  for (;; ptr++) Line 4464  for (;; ptr++)
4464      is on the bracket. */      is on the bracket. */
4465    
4466      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4467      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4468        in the real compile phase, not in the pre-pass, where the whole group may
4469        not be available. */
4470    
4471      else if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4472        {        {
4473        uschar *tc = code;        uschar *tc = code;
4474        condcount = 0;        int condcount = 0;
4475    
4476        do {        do {
4477           condcount++;           condcount++;
# Line 3152  for (;; ptr++) Line 4479  for (;; ptr++)
4479           }           }
4480        while (*tc != OP_KET);        while (*tc != OP_KET);
4481    
4482        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4483          false). It must have only one branch. */
4484    
4485          if (code[LINK_SIZE+1] == OP_DEF)
4486          {          {
4487          *errorcodeptr = ERR27;          if (condcount > 1)
4488          goto FAILED;            {
4489              *errorcodeptr = ERR54;
4490              goto FAILED;
4491              }
4492            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4493            }
4494    
4495          /* A "normal" conditional group. If there is just one branch, we must not
4496          make use of its firstbyte or reqbyte, because this is equivalent to an
4497          empty second branch. */
4498    
4499          else
4500            {
4501            if (condcount > 2)
4502              {
4503              *errorcodeptr = ERR27;
4504              goto FAILED;
4505              }
4506            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4507          }          }
4508          }
4509    
4510        /* If there is just one branch, we must not make use of its firstbyte or      /* Error if hit end of pattern */
4511        reqbyte, because this is equivalent to an empty second branch. */  
4512        if (*ptr != ')')
4513          {
4514          *errorcodeptr = ERR14;
4515          goto FAILED;
4516          }
4517    
4518        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4519        group, less the brackets at either end. Then reduce the compiled code to
4520        just the brackets so that it doesn't use much memory if it is duplicated by
4521        a quantifier. */
4522    
4523        if (lengthptr != NULL)
4524          {
4525          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4526          code++;
4527          PUTINC(code, 0, 1 + LINK_SIZE);
4528          *code++ = OP_KET;
4529          PUTINC(code, 0, 1 + LINK_SIZE);
4530        }        }
4531    
4532      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4533      brackets of all kinds, and conditions with two branches (see code above).  
4534      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4535      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4536      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4537        relevant. */
4538    
4539        if (bravalue == OP_DEF) break;
4540    
4541        /* Handle updating of the required and first characters for other types of
4542        group. Update for normal brackets of all kinds, and conditions with two
4543        branches (see code above). If the bracket is followed by a quantifier with
4544        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4545        zerofirstbyte outside the main loop so that they can be accessed for the
4546        back off. */
4547    
4548      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4549      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4550      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4551    
4552      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4553        {        {
4554        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4555        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3215  for (;; ptr++) Line 4590  for (;; ptr++)
4590      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4591    
4592      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4593        break;     /* End of processing '(' */
4594    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
   
     /* Error if hit end of pattern */  
   
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
   
     case '\\':  
     tempptr = ptr;  
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
4595    
4596      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* ===================================================================*/
4597        /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4598      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4599      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4600      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4601      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4602      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4603    
4604        case '\\':
4605        tempptr = ptr;
4606        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4607        if (*errorcodeptr != 0) goto FAILED;
4608    
4609      if (c < 0)      if (c < 0)
4610        {        {
4611        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3253  for (;; ptr++) Line 4615  for (;; ptr++)
4615          continue;          continue;
4616          }          }
4617    
4618          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4619    
4620        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4621        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4622    
# Line 3264  for (;; ptr++) Line 4628  for (;; ptr++)
4628        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4629        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4630    
4631        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4632          We also support \k{name} (.NET syntax) */
4633    
4634          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4635            {
4636            is_recurse = FALSE;
4637            terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4638            goto NAMED_REF_OR_RECURSE;
4639            }
4640    
4641          /* Back references are handled specially; must disable firstbyte if
4642          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4643          ':' later. */
4644    
4645        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4646          {          {
4647          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4648    
4649            HANDLE_REFERENCE:    /* Come here from named backref handling */
4650            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4651          previous = code;          previous = code;
4652          *code++ = OP_REF;          *code++ = OP_REF;
4653          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4654            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4655            if (recno > cd->top_backref) cd->top_backref = recno;
4656          }          }
4657    
4658        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4659    
4660  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4661        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
4662          {          {
4663          BOOL negated;          BOOL negated;
4664          int value = get_ucp(&ptr, &negated, errorcodeptr);          int pdata;
4665            int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4666            if (ptype < 0) goto FAILED;
4667          previous = code;          previous = code;
4668          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4669          *code++ = value;          *code++ = ptype;
4670            *code++ = pdata;
4671            }
4672    #else
4673    
4674          /* If Unicode properties are not supported, \X, \P, and \p are not
4675          allowed. */
4676    
4677          else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4678            {
4679            *errorcodeptr = ERR45;
4680            goto FAILED;
4681          }          }
4682  #endif  #endif
4683    
4684        /* For the rest, we can obtain the OP value by negating the escape        /* For the rest (including \X when Unicode properties are supported), we
4685        value */        can obtain the OP value by negating the escape value. */
4686    
4687        else        else
4688          {          {
# Line 3313  for (;; ptr++) Line 4706  for (;; ptr++)
4706       mcbuffer[0] = c;       mcbuffer[0] = c;
4707       mclength = 1;       mclength = 1;
4708       }       }
   
4709      goto ONE_CHAR;      goto ONE_CHAR;
4710    
4711    
4712        /* ===================================================================*/
4713      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
4714      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in UTF-8 mode, it may be a
4715      multi-byte literal character. */      multi-byte literal character. */
# Line 3326  for (;; ptr++) Line 4720  for (;; ptr++)
4720      mcbuffer[0] = c;      mcbuffer[0] = c;
4721    
4722  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4723      if (utf8 && (c & 0xc0) == 0xc0)      if (utf8 && c >= 0xc0)
4724        {        {
4725        while ((ptr[1] & 0xc0) == 0x80)        while ((ptr[1] & 0xc0) == 0x80)
4726          mcbuffer[mclength++] = *(++ptr);          mcbuffer[mclength++] = *(++ptr);
# Line 3377  for (;; ptr++) Line 4771  for (;; ptr++)
4771      }      }
4772    }                   /* end of big loop */    }                   /* end of big loop */
4773    
4774    
4775  /* Control never reaches here by falling through, only by a goto for all the  /* Control never reaches here by falling through, only by a goto for all the
4776  error states. Pass back the position in the pattern so that it can be displayed  error states. Pass back the position in the pattern so that it can be displayed
4777  to the user for diagnosing the error. */  to the user for diagnosing the error. */