/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 79 by nigel, Sat Feb 24 21:40:52 2007 UTC | revision 175 by ph10, Mon Jun 11 13:38:38 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
53    /* When DEBUG is defined, we need the pcre_printint() function, which is also
54    used by pcretest. DEBUG is not defined when building a production library. */
55    
56    #ifdef DEBUG
57    #include "pcre_printint.src"
58    #endif
59    
60    
61  /*************************************************  /*************************************************
62  *      Code parameters and static tables         *  *      Code parameters and static tables         *
63  *************************************************/  *************************************************/
64    
65  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
66  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
67  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
68  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
69  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
70    so this number is very generous.
71    
72    The same workspace is used during the second, actual compile phase for
73    remembering forward references to groups so that they can be filled in at the
74    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75    is 4 there is plenty of room. */
76    
77  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
78    
79    
80  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 82  are simple data values; negative values
82  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
83  is invalid. */  is invalid. */
84    
85  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
86  static const short int escapes[] = {  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
94       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
95  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
96       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
97  };  };
98    
99  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
100  static const short int escapes[] = {  static const short int escapes[] = {
101  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
102  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 88  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
108  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
109  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
110  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
111  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
112  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
113  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
# Line 97  static const short int escapes[] = { Line 116  static const short int escapes[] = {
116  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
117  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
118  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
119  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
120  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
121  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
122  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
# Line 107  static const short int escapes[] = { Line 126  static const short int escapes[] = {
126    
127    
128  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
129  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
130  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
131    
132  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 118  static const char *const posix_names[] = Line 137  static const char *const posix_names[] =
137  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
138    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139    
140  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
141  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
142  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
143    characters are removed, and for [:alpha:] and [:alnum:] the underscore
144    character is removed. The triples in the table consist of the base map offset,
145    second map offset or -1 if no second map, and a non-negative value for map
146    addition or a negative value for map subtraction (if there are two maps). The
147    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148    remove vertical space characters, 2 => remove underscore. */
149    
150  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
151    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
152    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
153    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
154    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
155    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
156    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
157    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
158    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
159    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
160    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
161    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
162    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
163    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
164    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
165  };  };
166    
167    
168    #define STRING(a)  # a
169    #define XSTRING(s) STRING(s)
170    
171  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
172  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
173    they are documented. Always add a new error instead. Messages marked DEAD below
174    are no longer used. */
175    
176  static const char *error_texts[] = {  static const char *error_texts[] = {
177    "no error",    "no error",
# Line 156  static const char *error_texts[] = { Line 186  static const char *error_texts[] = {
186    "range out of order in character class",    "range out of order in character class",
187    "nothing to repeat",    "nothing to repeat",
188    /* 10 */    /* 10 */
189    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
190    "internal error: unexpected repeat",    "internal error: unexpected repeat",
191    "unrecognized character after (?",    "unrecognized character after (?",
192    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 166  static const char *error_texts[] = { Line 196  static const char *error_texts[] = {
196    "erroffset passed as NULL",    "erroffset passed as NULL",
197    "unknown option bit(s) set",    "unknown option bit(s) set",
198    "missing ) after comment",    "missing ) after comment",
199    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
200    /* 20 */    /* 20 */
201    "regular expression too large",    "regular expression too large",
202    "failed to get memory",    "failed to get memory",
# Line 175  static const char *error_texts[] = { Line 205  static const char *error_texts[] = {
205    "unrecognized character after (?<",    "unrecognized character after (?<",
206    /* 25 */    /* 25 */
207    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
208    "malformed number after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
212    /* 30 */    /* 30 */
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
215    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
216    "spare error",    "spare error",  /** DEAD **/
217    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
218    /* 35 */    /* 35 */
219    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 194  static const char *error_texts[] = { Line 224  static const char *error_texts[] = {
224    /* 40 */    /* 40 */
225    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
226    "unrecognized character after (?P",    "unrecognized character after (?P",
227    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
228    "two named groups have the same name",    "two named subpatterns have the same name",
229    "invalid UTF-8 string",    "invalid UTF-8 string",
230    /* 45 */    /* 45 */
231    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
232    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
233    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
234      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236      /* 50 */
237      "repeated subpattern is too long",
238      "octal value is greater than \\377 (not in UTF-8 mode)",
239      "internal error: overran compiling workspace",
240      "internal error: previously-checked referenced subpattern not found",
241      "DEFINE group contains more than one branch",
242      /* 55 */
243      "repeating a DEFINE group is not allowed",
244      "inconsistent NEWLINE options",
245      "\\g is not followed by a braced name or an optionally braced non-zero number",
246      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 220  For convenience, we use the same bit def Line 263  For convenience, we use the same bit def
263    
264  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
265    
266  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
267  static const unsigned char digitab[] =  static const unsigned char digitab[] =
268    {    {
269    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 299  static const unsigned char digitab[] =
299    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
300    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301    
302  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
303  static const unsigned char digitab[] =  static const unsigned char digitab[] =
304    {    {
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 313  static const unsigned char digitab[] =
313    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
316    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
317    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 347  static const unsigned char ebcdic_charta
347    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
348    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
349    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
350    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
351    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
352    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 374  static const unsigned char ebcdic_charta
374  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
375    
376  static BOOL  static BOOL
377    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
378      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
379    
380    
381    
# Line 342  static BOOL Line 385  static BOOL
385    
386  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
387  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
388  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
389  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391    ptr is pointing at the \. On exit, it is on the final character of the escape
392    sequence.
393    
394  Arguments:  Arguments:
395    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 362  static int Line 407  static int
407  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
408    int options, BOOL isclass)    int options, BOOL isclass)
409  {  {
410  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
411    const uschar *ptr = *ptrptr + 1;
412  int c, i;  int c, i;
413    
414    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
415    ptr--;                            /* Set pointer back to the last byte */
416    
417  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
418    
 c = *(++ptr);  
419  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
420    
421  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
422  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
423  Otherwise further processing may be required. */  Otherwise further processing may be required. */
424    
425  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
426  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
427  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
428    
429  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
430  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
431  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
432  #endif  #endif
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 436  else if ((i = escapes[c - 0x48]) != 0)
436  else  else
437    {    {
438    const uschar *oldptr;    const uschar *oldptr;
439      BOOL braced, negated;
440    
441    switch (c)    switch (c)
442      {      {
443      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 451  else
451      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
452      break;      break;
453    
454        /* \g must be followed by a number, either plain or braced. If positive, it
455        is an absolute backreference. If negative, it is a relative backreference.
456        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457        reference to a named group. This is part of Perl's movement towards a
458        unified syntax for back references. As this is synonymous with \k{name}, we
459        fudge it up by pretending it really was \k. */
460    
461        case 'g':
462        if (ptr[1] == '{')
463          {
464          const uschar *p;
465          for (p = ptr+2; *p != 0 && *p != '}'; p++)
466            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467          if (*p != 0 && *p != '}')
468            {
469            c = -ESC_k;
470            break;
471            }
472          braced = TRUE;
473          ptr++;
474          }
475        else braced = FALSE;
476    
477        if (ptr[1] == '-')
478          {
479          negated = TRUE;
480          ptr++;
481          }
482        else negated = FALSE;
483    
484        c = 0;
485        while ((digitab[ptr[1]] & ctype_digit) != 0)
486          c = c * 10 + *(++ptr) - '0';
487    
488        if (c == 0 || (braced && *(++ptr) != '}'))
489          {
490          *errorcodeptr = ERR57;
491          return 0;
492          }
493    
494        if (negated)
495          {
496          if (c > bracount)
497            {
498            *errorcodeptr = ERR15;
499            return 0;
500            }
501          c = bracount - (c - 1);
502          }
503    
504        c = -(ESC_REF + c);
505        break;
506    
507      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
508      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
509      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 442  else Line 545  else
545        }        }
546    
547      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
548      larger first octal digit. */      larger first octal digit. The original code used just to take the least
549        significant 8 bits of octal numbers (I think this is what early Perls used
550        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
551        than 3 octal digits. */
552    
553      case '0':      case '0':
554      c -= '0';      c -= '0';
555      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
556          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
557      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
558      break;      break;
559    
560      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
561      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
562        treated as a data character. */
563    
564      case 'x':      case 'x':
565  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
566        {        {
567        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
568        register int count = 0;        int count = 0;
569    
570        c = 0;        c = 0;
571        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
572          {          {
573          int cc = *pt++;          register int cc = *pt++;
574            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
575          count++;          count++;
576  #if !EBCDIC    /* ASCII coding */  
577    #ifndef EBCDIC  /* ASCII coding */
578          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
579          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
581          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
582          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583  #endif  #endif
584          }          }
585    
586        if (*pt == '}')        if (*pt == '}')
587          {          {
588          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
589          ptr = pt;          ptr = pt;
590          break;          break;
591          }          }
592    
593        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
594        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
595        }        }
 #endif  
596    
597      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
598    
599      c = 0;      c = 0;
600      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
601        {        {
602        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
603        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
604  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
605        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
606        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
608        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
609        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610  #endif  #endif
611        }        }
612      break;      break;
613    
614      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
615        This coding is ASCII-specific, but then the whole concept of \cx is
616        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
617    
618      case 'c':      case 'c':
619      c = *(++ptr);      c = *(++ptr);
# Line 511  else Line 623  else
623        return 0;        return 0;
624        }        }
625    
626      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
627      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
628      c ^= 0x40;      c ^= 0x40;
629  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
630      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
631      c ^= 0xC0;      c ^= 0xC0;
632  #endif  #endif
# Line 560  escape sequence. Line 668  escape sequence.
668  Argument:  Argument:
669    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
670    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
671      dptr           points to an int that is set to the detailed property value
672    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
673    
674  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
675  */  */
676    
677  static int  static int
678  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
679  {  {
680  int c, i, bot, top;  int c, i, bot, top;
681  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
682  char name[4];  char name[32];
683    
684  c = *(++ptr);  c = *(++ptr);
685  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
686    
687  *negptr = FALSE;  *negptr = FALSE;
688    
689  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
690  preceded by ^ for negation. */  negation. */
691    
692  if (c == '{')  if (c == '{')
693    {    {
# Line 587  if (c == '{') Line 696  if (c == '{')
696      *negptr = TRUE;      *negptr = TRUE;
697      ptr++;      ptr++;
698      }      }
699    for (i = 0; i <= 2; i++)    for (i = 0; i < sizeof(name) - 1; i++)
700      {      {
701      c = *(++ptr);      c = *(++ptr);
702      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
703      if (c == '}') break;      if (c == '}') break;
704      name[i] = c;      name[i] = c;
705      }      }
706    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
707    name[i] = 0;    name[i] = 0;
708    }    }
709    
# Line 619  top = _pcre_utt_size; Line 724  top = _pcre_utt_size;
724    
725  while (bot < top)  while (bot < top)
726    {    {
727    i = (bot + top)/2;    i = (bot + top) >> 1;
728    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
729    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
730        {
731        *dptr = _pcre_utt[i].value;
732        return _pcre_utt[i].type;
733        }
734    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
735    }    }
736    
 UNKNOWN_RETURN:  
737  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
738  *ptrptr = ptr;  *ptrptr = ptr;
739  return -1;  return -1;
# Line 698  read_repeat_counts(const uschar *p, int Line 806  read_repeat_counts(const uschar *p, int
806  int min = 0;  int min = 0;
807  int max = -1;  int max = -1;
808    
809    /* Read the minimum value and do a paranoid check: a negative value indicates
810    an integer overflow. */
811    
812  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
813    if (min < 0 || min > 65535)
814      {
815      *errorcodeptr = ERR5;
816      return p;
817      }
818    
819    /* Read the maximum value if there is one, and again do a paranoid check on its size.
820    Also, max must not be less than min. */
821    
822  if (*p == '}') max = min; else  if (*p == '}') max = min; else
823    {    {
# Line 706  if (*p == '}') max = min; else Line 825  if (*p == '}') max = min; else
825      {      {
826      max = 0;      max = 0;
827      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
828        if (max < 0 || max > 65535)
829          {
830          *errorcodeptr = ERR5;
831          return p;
832          }
833      if (max < min)      if (max < min)
834        {        {
835        *errorcodeptr = ERR4;        *errorcodeptr = ERR4;
# Line 714  if (*p == '}') max = min; else Line 838  if (*p == '}') max = min; else
838      }      }
839    }    }
840    
841  /* Do paranoid checks, then fill in the required variables, and pass back the  /* Fill in the required variables, and pass back the pointer to the terminating
842  pointer to the terminating '}'. */  '}'. */
843    
844  if (min > 65535 || max > 65535)  *minp = min;
845    *errorcodeptr = ERR5;  *maxp = max;
846  else  return p;
847    }
848    
849    
850    
851    /*************************************************
852    *       Find forward referenced subpattern       *
853    *************************************************/
854    
855    /* This function scans along a pattern's text looking for capturing
856    subpatterns, and counting them. If it finds a named pattern that matches the
857    name it is given, it returns its number. Alternatively, if the name is NULL, it
858    returns when it reaches a given numbered subpattern. This is used for forward
859    references to subpatterns. We know that if (?P< is encountered, the name will
860    be terminated by '>' because that is checked in the first pass.
861    
862    Arguments:
863      ptr          current position in the pattern
864      count        current count of capturing parens so far encountered
865      name         name to seek, or NULL if seeking a numbered subpattern
866      lorn         name length, or subpattern number if name is NULL
867      xmode        TRUE if we are in /x mode
868    
869    Returns:       the number of the named subpattern, or -1 if not found
870    */
871    
872    static int
873    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
874      BOOL xmode)
875    {
876    const uschar *thisname;
877    
878    for (; *ptr != 0; ptr++)
879    {    {
880    *minp = min;    int term;
881    *maxp = max;  
882      /* Skip over backslashed characters and also entire \Q...\E */
883    
884      if (*ptr == '\\')
885        {
886        if (*(++ptr) == 0) return -1;
887        if (*ptr == 'Q') for (;;)
888          {
889          while (*(++ptr) != 0 && *ptr != '\\');
890          if (*ptr == 0) return -1;
891          if (*(++ptr) == 'E') break;
892          }
893        continue;
894        }
895    
896      /* Skip over character classes */
897    
898      if (*ptr == '[')
899        {
900        while (*(++ptr) != ']')
901          {
902          if (*ptr == '\\')
903            {
904            if (*(++ptr) == 0) return -1;
905            if (*ptr == 'Q') for (;;)
906              {
907              while (*(++ptr) != 0 && *ptr != '\\');
908              if (*ptr == 0) return -1;
909              if (*(++ptr) == 'E') break;
910              }
911            continue;
912            }
913          }
914        continue;
915        }
916    
917      /* Skip comments in /x mode */
918    
919      if (xmode && *ptr == '#')
920        {
921        while (*(++ptr) != 0 && *ptr != '\n');
922        if (*ptr == 0) return -1;
923        continue;
924        }
925    
926      /* An opening parens must now be a real metacharacter */
927    
928      if (*ptr != '(') continue;
929      if (ptr[1] != '?')
930        {
931        count++;
932        if (name == NULL && count == lorn) return count;
933        continue;
934        }
935    
936      ptr += 2;
937      if (*ptr == 'P') ptr++;                      /* Allow optional P */
938    
939      /* We have to disambiguate (?<! and (?<= from (?<name> */
940    
941      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
942           *ptr != '\'')
943        continue;
944    
945      count++;
946    
947      if (name == NULL && count == lorn) return count;
948      term = *ptr++;
949      if (term == '<') term = '>';
950      thisname = ptr;
951      while (*ptr != term) ptr++;
952      if (name != NULL && lorn == ptr - thisname &&
953          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
954        return count;
955    }    }
956  return p;  
957    return -1;
958  }  }
959    
960    
# Line 778  for (;;) Line 1008  for (;;)
1008    
1009      case OP_CALLOUT:      case OP_CALLOUT:
1010      case OP_CREF:      case OP_CREF:
1011      case OP_BRANUMBER:      case OP_RREF:
1012        case OP_DEF:
1013      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1014      break;      break;
1015    
# Line 823  for (;;) Line 1054  for (;;)
1054    {    {
1055    int d;    int d;
1056    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1057    
1058    switch (op)    switch (op)
1059      {      {
1060        case OP_CBRA:
1061      case OP_BRA:      case OP_BRA:
1062      case OP_ONCE:      case OP_ONCE:
1063      case OP_COND:      case OP_COND:
1064      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1065      if (d < 0) return d;      if (d < 0) return d;
1066      branchlength += d;      branchlength += d;
1067      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 865  for (;;) Line 1096  for (;;)
1096      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1097    
1098      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1099      case OP_CREF:      case OP_CREF:
1100        case OP_RREF:
1101        case OP_DEF:
1102      case OP_OPT:      case OP_OPT:
1103      case OP_CALLOUT:      case OP_CALLOUT:
1104      case OP_SOD:      case OP_SOD:
# Line 884  for (;;) Line 1116  for (;;)
1116    
1117      case OP_CHAR:      case OP_CHAR:
1118      case OP_CHARNC:      case OP_CHARNC:
1119        case OP_NOT:
1120      branchlength++;      branchlength++;
1121      cc += 2;      cc += 2;
1122  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 917  for (;;) Line 1150  for (;;)
1150    
1151      case OP_PROP:      case OP_PROP:
1152      case OP_NOTPROP:      case OP_NOTPROP:
1153      cc++;      cc += 2;
1154      /* Fall through */      /* Fall through */
1155    
1156      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 998  Returns:      pointer to the opcode for Line 1231  Returns:      pointer to the opcode for
1231  static const uschar *  static const uschar *
1232  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1233  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1234  for (;;)  for (;;)
1235    {    {
1236    register int c = *code;    register int c = *code;
1237    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1238    else if (c > OP_BRA)  
1239      /* XCLASS is used for classes that cannot be represented just by a bit
1240      map. This includes negated single high-valued characters. The length in
1241      the table is zero; the actual length is stored in the compiled code. */
1242    
1243      if (c == OP_XCLASS) code += GET(code, 1);
1244    
1245      /* Handle capturing bracket */
1246    
1247      else if (c == OP_CBRA)
1248      {      {
1249      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1250      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1251      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1252      }      }
1253    
1254      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1255      a multi-byte character. The length in the table is a minimum, so we have to
1256      arrange to skip the extra bytes. */
1257    
1258    else    else
1259      {      {
1260      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1261  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1262      if (utf8) switch(c)      if (utf8) switch(c)
1263        {        {
1264        case OP_CHAR:        case OP_CHAR:
# Line 1031  for (;;) Line 1266  for (;;)
1266        case OP_EXACT:        case OP_EXACT:
1267        case OP_UPTO:        case OP_UPTO:
1268        case OP_MINUPTO:        case OP_MINUPTO:
1269          case OP_POSUPTO:
1270        case OP_STAR:        case OP_STAR:
1271        case OP_MINSTAR:        case OP_MINSTAR:
1272          case OP_POSSTAR:
1273        case OP_PLUS:        case OP_PLUS:
1274        case OP_MINPLUS:        case OP_MINPLUS:
1275          case OP_POSPLUS:
1276        case OP_QUERY:        case OP_QUERY:
1277        case OP_MINQUERY:        case OP_MINQUERY:
1278        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1279        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1280        break;        break;
1281        }        }
1282  #endif  #endif
# Line 1072  Returns:      pointer to the opcode for Line 1303  Returns:      pointer to the opcode for
1303  static const uschar *  static const uschar *
1304  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1305  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1306  for (;;)  for (;;)
1307    {    {
1308    register int c = *code;    register int c = *code;
1309    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1310    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1311    else if (c > OP_BRA)  
1312      {    /* XCLASS is used for classes that cannot be represented just by a bit
1313      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1314      }    the table is zero; the actual length is stored in the compiled code. */
1315    
1316      if (c == OP_XCLASS) code += GET(code, 1);
1317    
1318      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1319      that are followed by a character may be followed by a multi-byte character.
1320      The length in the table is a minimum, so we have to arrange to skip the extra
1321      bytes. */
1322    
1323    else    else
1324      {      {
1325      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1326  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1327      if (utf8) switch(c)      if (utf8) switch(c)
1328        {        {
1329        case OP_CHAR:        case OP_CHAR:
# Line 1103  for (;;) Line 1331  for (;;)
1331        case OP_EXACT:        case OP_EXACT:
1332        case OP_UPTO:        case OP_UPTO:
1333        case OP_MINUPTO:        case OP_MINUPTO:
1334          case OP_POSUPTO:
1335        case OP_STAR:        case OP_STAR:
1336        case OP_MINSTAR:        case OP_MINSTAR:
1337          case OP_POSSTAR:
1338        case OP_PLUS:        case OP_PLUS:
1339        case OP_MINPLUS:        case OP_MINPLUS:
1340          case OP_POSPLUS:
1341        case OP_QUERY:        case OP_QUERY:
1342        case OP_MINQUERY:        case OP_MINQUERY:
1343        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1344        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1345        break;        break;
1346        }        }
1347  #endif  #endif
# Line 1132  for (;;) Line 1356  for (;;)
1356  *************************************************/  *************************************************/
1357    
1358  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1359  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1360  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1361  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1362  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1363    struck an inner bracket whose current branch will already have been scanned.
1364    
1365  Arguments:  Arguments:
1366    code        points to start of search    code        points to start of search
# Line 1149  static BOOL Line 1374  static BOOL
1374  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1375  {  {
1376  register int c;  register int c;
1377  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1378       code < endcode;       code < endcode;
1379       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1380    {    {
# Line 1157  for (code = first_significant_code(code Line 1382  for (code = first_significant_code(code
1382    
1383    c = *code;    c = *code;
1384    
1385    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1386    
1387      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388        {
1389        code += _pcre_OP_lengths[c];
1390        do code += GET(code, 1); while (*code == OP_ALT);
1391        c = *code;
1392        continue;
1393        }
1394    
1395      /* For other groups, scan the branches. */
1396    
1397      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1398      {      {
1399      BOOL empty_branch;      BOOL empty_branch;
1400      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1173  for (code = first_significant_code(code Line 1410  for (code = first_significant_code(code
1410        }        }
1411      while (*code == OP_ALT);      while (*code == OP_ALT);
1412      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1413      c = *code;      c = *code;
1414        continue;
1415      }      }
1416    
1417    else switch (c)    /* Handle the other opcodes */
1418    
1419      switch (c)
1420      {      {
1421      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1422    
# Line 1233  for (code = first_significant_code(code Line 1472  for (code = first_significant_code(code
1472      case OP_NOT:      case OP_NOT:
1473      case OP_PLUS:      case OP_PLUS:
1474      case OP_MINPLUS:      case OP_MINPLUS:
1475        case OP_POSPLUS:
1476      case OP_EXACT:      case OP_EXACT:
1477      case OP_NOTPLUS:      case OP_NOTPLUS:
1478      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1479        case OP_NOTPOSPLUS:
1480      case OP_NOTEXACT:      case OP_NOTEXACT:
1481      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1482      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1483        case OP_TYPEPOSPLUS:
1484      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1485      return FALSE;      return FALSE;
1486    
# Line 1250  for (code = first_significant_code(code Line 1492  for (code = first_significant_code(code
1492      case OP_ALT:      case OP_ALT:
1493      return TRUE;      return TRUE;
1494    
1495      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1496      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1497    
1498  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1499      case OP_STAR:      case OP_STAR:
1500      case OP_MINSTAR:      case OP_MINSTAR:
1501        case OP_POSSTAR:
1502      case OP_QUERY:      case OP_QUERY:
1503      case OP_MINQUERY:      case OP_MINQUERY:
1504        case OP_POSQUERY:
1505      case OP_UPTO:      case OP_UPTO:
1506      case OP_MINUPTO:      case OP_MINUPTO:
1507        case OP_POSUPTO:
1508      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1509      break;      break;
1510  #endif  #endif
# Line 1377  earlier groups that are outside the curr Line 1622  earlier groups that are outside the curr
1622  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1623  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1624  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1625  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1626  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1627    
1628    This function has been extended with the possibility of forward references for
1629    recursions and subroutine calls. It must also check the list of such references
1630    for the group we are dealing with. If it finds that one of the recursions in
1631    the current group is on this list, it adjusts the offset in the list, not the
1632    value in the reference (which is a group number).
1633    
1634  Arguments:  Arguments:
1635    group      points to the start of the group    group      points to the start of the group
1636    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1637    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1638    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1639      save_hwm   the hwm forward reference pointer at the start of the group
1640    
1641  Returns:     nothing  Returns:     nothing
1642  */  */
1643    
1644  static void  static void
1645  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1646      uschar *save_hwm)
1647  {  {
1648  uschar *ptr = group;  uschar *ptr = group;
1649  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1650    {    {
1651    int offset = GET(ptr, 1);    int offset;
1652    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1653    
1654      /* See if this recursion is on the forward reference list. If so, adjust the
1655      reference. */
1656    
1657      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1658        {
1659        offset = GET(hc, 0);
1660        if (cd->start_code + offset == ptr + 1)
1661          {
1662          PUT(hc, 0, offset + adjust);
1663          break;
1664          }
1665        }
1666    
1667      /* Otherwise, adjust the recursion offset if it's after the start of this
1668      group. */
1669    
1670      if (hc >= cd->hwm)
1671        {
1672        offset = GET(ptr, 1);
1673        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1674        }
1675    
1676    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1677    }    }
1678  }  }
# Line 1475  Yield:        TRUE when range returned; Line 1751  Yield:        TRUE when range returned;
1751  */  */
1752    
1753  static BOOL  static BOOL
1754  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1755      unsigned int *odptr)
1756  {  {
1757  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1758    
1759  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1760    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1761    
1762  if (c > d) return FALSE;  if (c > d) return FALSE;
1763    
# Line 1492  next = othercase + 1; Line 1766  next = othercase + 1;
1766    
1767  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1768    {    {
1769    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1770    next++;    next++;
1771    }    }
1772    
# Line 1506  return TRUE; Line 1778  return TRUE;
1778  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1779    
1780    
1781    
1782  /*************************************************  /*************************************************
1783  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1784  *************************************************/  *************************************************/
1785    
1786  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1787  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1788  bits.  sense to automatically possessify the repeated item.
1789    
1790  Arguments:  Arguments:
1791    optionsptr     pointer to the option bits    op_code       the repeated op code
1792    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1793    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1794    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1795    errorcodeptr   points to error code variable    ptr           next character in pattern
1796    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1797    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1798    
1799  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1800  */  */
1801    
1802  static BOOL  static BOOL
1803  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1804    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1805  {  {
1806  int repeat_type, op_type;  int next;
1807  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
1808  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
1809  int greedy_default, greedy_non_default;  
1810  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
1811  int zeroreqbyte, zerofirstbyte;    {
1812  int req_caseopt, reqvary, tempreqvary;    for (;;)
1813  int condcount = 0;      {
1814  int options = *optionsptr;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1815  int after_manual_callout = 0;      if (*ptr == '#')
1816  register int c;        {
1817  register uschar *code = *codeptr;        while (*(++ptr) != 0)
1818  uschar *tempcode;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1819  BOOL inescq = FALSE;        }
1820  BOOL groupsetfirstbyte = FALSE;      else break;
1821  const uschar *ptr = *ptrptr;      }
1822  const uschar *tempptr;    }
1823  uschar *previous = NULL;  
1824  uschar *previous_callout = NULL;  /* If the next item is one that we can handle, get its value. A non-negative
1825  uschar classbits[32];  value is a character, a negative value is an escape value. */
1826    
1827    if (*ptr == '\\')
1828      {
1829      int temperrorcode = 0;
1830      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1831      if (temperrorcode != 0) return FALSE;
1832      ptr++;    /* Point after the escape sequence */
1833      }
1834    
1835    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1836      {
1837  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1838  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1839  #endif  #endif
1840      next = *ptr++;
1841      }
1842    
1843  /* Set up the default and non-default settings for greediness */  else return FALSE;
1844    
1845  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
1846    
1847  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1848  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1849  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1850  find one.      {
1851        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1852        if (*ptr == '#')
1853          {
1854          while (*(++ptr) != 0)
1855            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1856          }
1857        else break;
1858        }
1859      }
1860    
1861  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1862    
1863  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1864      return FALSE;
1865    
1866  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
1867  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
1868  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
1869  case status of the value. This is used only for ASCII characters. */  utf8_char. */
1870    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
1871    
1872  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
1873    
1874  for (;; ptr++)  if (next >= 0) switch(op_code)
1875    {    {
1876    BOOL negate_class;    case OP_CHAR:
1877    BOOL possessive_quantifier;  #ifdef SUPPORT_UTF8
1878    BOOL is_quantifier;    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1879    int class_charcount;  #endif
1880    int class_lastchar;    return item != next;
1881    int newoptions;  
1882    int recno;    /* For CHARNC (caseless character) we must check the other case. If we have
1883      Unicode property support, we can use it to test the other case of
1884      high-valued characters. */
1885    
1886      case OP_CHARNC:
1887    #ifdef SUPPORT_UTF8
1888      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1889    #endif
1890      if (item == next) return FALSE;
1891    #ifdef SUPPORT_UTF8
1892      if (utf8)
1893        {
1894        unsigned int othercase;
1895        if (next < 128) othercase = cd->fcc[next]; else
1896    #ifdef SUPPORT_UCP
1897        othercase = _pcre_ucp_othercase((unsigned int)next);
1898    #else
1899        othercase = NOTACHAR;
1900    #endif
1901        return (unsigned int)item != othercase;
1902        }
1903      else
1904    #endif  /* SUPPORT_UTF8 */
1905      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1906    
1907      /* For OP_NOT, "item" must be a single-byte character. */
1908    
1909      case OP_NOT:
1910      if (next < 0) return FALSE;  /* Not a character */
1911      if (item == next) return TRUE;
1912      if ((options & PCRE_CASELESS) == 0) return FALSE;
1913    #ifdef SUPPORT_UTF8
1914      if (utf8)
1915        {
1916        unsigned int othercase;
1917        if (next < 128) othercase = cd->fcc[next]; else
1918    #ifdef SUPPORT_UCP
1919        othercase = _pcre_ucp_othercase(next);
1920    #else
1921        othercase = NOTACHAR;
1922    #endif
1923        return (unsigned int)item == othercase;
1924        }
1925      else
1926    #endif  /* SUPPORT_UTF8 */
1927      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1928    
1929      case OP_DIGIT:
1930      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1931    
1932      case OP_NOT_DIGIT:
1933      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1934    
1935      case OP_WHITESPACE:
1936      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1937    
1938      case OP_NOT_WHITESPACE:
1939      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1940    
1941      case OP_WORDCHAR:
1942      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1943    
1944      case OP_NOT_WORDCHAR:
1945      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1946    
1947      default:
1948      return FALSE;
1949      }
1950    
1951    
1952    /* Handle the case when the next item is \d, \s, etc. */
1953    
1954    switch(op_code)
1955      {
1956      case OP_CHAR:
1957      case OP_CHARNC:
1958    #ifdef SUPPORT_UTF8
1959      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1960    #endif
1961      switch(-next)
1962        {
1963        case ESC_d:
1964        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1965    
1966        case ESC_D:
1967        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1968    
1969        case ESC_s:
1970        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1971    
1972        case ESC_S:
1973        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1974    
1975        case ESC_w:
1976        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1977    
1978        case ESC_W:
1979        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1980    
1981        default:
1982        return FALSE;
1983        }
1984    
1985      case OP_DIGIT:
1986      return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1987    
1988      case OP_NOT_DIGIT:
1989      return next == -ESC_d;
1990    
1991      case OP_WHITESPACE:
1992      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1993    
1994      case OP_NOT_WHITESPACE:
1995      return next == -ESC_s;
1996    
1997      case OP_WORDCHAR:
1998      return next == -ESC_W || next == -ESC_s;
1999    
2000      case OP_NOT_WORDCHAR:
2001      return next == -ESC_w || next == -ESC_d;
2002    
2003      default:
2004      return FALSE;
2005      }
2006    
2007    /* Control does not reach here */
2008    }
2009    
2010    
2011    
2012    /*************************************************
2013    *           Compile one branch                   *
2014    *************************************************/
2015    
2016    /* Scan the pattern, compiling it into a vector. If the options are
2017    changed during the branch, the pointer is used to change the external options
2018    bits. This function is used during the pre-compile phase when we are trying
2019    to find out the amount of memory needed, as well as during the real compile
2020    phase. The value of lengthptr distinguishes the two phases.
2021    
2022    Arguments:
2023      optionsptr     pointer to the option bits
2024      codeptr        points to the pointer to the current code point
2025      ptrptr         points to the current pattern pointer
2026      errorcodeptr   points to error code variable
2027      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2028      reqbyteptr     set to the last literal character required, else < 0
2029      bcptr          points to current branch chain
2030      cd             contains pointers to tables etc.
2031      lengthptr      NULL during the real compile phase
2032                     points to length accumulator during pre-compile phase
2033    
2034    Returns:         TRUE on success
2035                     FALSE, with *errorcodeptr set non-zero on error
2036    */
2037    
2038    static BOOL
2039    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2040      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2041      compile_data *cd, int *lengthptr)
2042    {
2043    int repeat_type, op_type;
2044    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2045    int bravalue = 0;
2046    int greedy_default, greedy_non_default;
2047    int firstbyte, reqbyte;
2048    int zeroreqbyte, zerofirstbyte;
2049    int req_caseopt, reqvary, tempreqvary;
2050    int options = *optionsptr;
2051    int after_manual_callout = 0;
2052    int length_prevgroup = 0;
2053    register int c;
2054    register uschar *code = *codeptr;
2055    uschar *last_code = code;
2056    uschar *orig_code = code;
2057    uschar *tempcode;
2058    BOOL inescq = FALSE;
2059    BOOL groupsetfirstbyte = FALSE;
2060    const uschar *ptr = *ptrptr;
2061    const uschar *tempptr;
2062    uschar *previous = NULL;
2063    uschar *previous_callout = NULL;
2064    uschar *save_hwm = NULL;
2065    uschar classbits[32];
2066    
2067    #ifdef SUPPORT_UTF8
2068    BOOL class_utf8;
2069    BOOL utf8 = (options & PCRE_UTF8) != 0;
2070    uschar *class_utf8data;
2071    uschar utf8_char[6];
2072    #else
2073    BOOL utf8 = FALSE;
2074    uschar *utf8_char = NULL;
2075    #endif
2076    
2077    #ifdef DEBUG
2078    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2079    #endif
2080    
2081    /* Set up the default and non-default settings for greediness */
2082    
2083    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2084    greedy_non_default = greedy_default ^ 1;
2085    
2086    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2087    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2088    matches a non-fixed char first char; reqbyte just remains unset if we never
2089    find one.
2090    
2091    When we hit a repeat whose minimum is zero, we may have to adjust these values
2092    to take the zero repeat into account. This is implemented by setting them to
2093    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2094    item types that can be repeated set these backoff variables appropriately. */
2095    
2096    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2097    
2098    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2099    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2100    value > 255. It is added into the firstbyte or reqbyte variables to record the
2101    case status of the value. This is used only for ASCII characters. */
2102    
2103    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2104    
2105    /* Switch on next character until the end of the branch */
2106    
2107    for (;; ptr++)
2108      {
2109      BOOL negate_class;
2110      BOOL possessive_quantifier;
2111      BOOL is_quantifier;
2112      BOOL is_recurse;
2113      BOOL reset_bracount;
2114      int class_charcount;
2115      int class_lastchar;
2116      int newoptions;
2117      int recno;
2118      int refsign;
2119    int skipbytes;    int skipbytes;
2120    int subreqbyte;    int subreqbyte;
2121    int subfirstbyte;    int subfirstbyte;
2122      int terminator;
2123    int mclength;    int mclength;
2124    uschar mcbuffer[8];    uschar mcbuffer[8];
2125    
2126    /* Next byte in the pattern */    /* Get next byte in the pattern */
2127    
2128    c = *ptr;    c = *ptr;
2129    
2130      /* If we are in the pre-compile phase, accumulate the length used for the
2131      previous cycle of this loop. */
2132    
2133      if (lengthptr != NULL)
2134        {
2135    #ifdef DEBUG
2136        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2137    #endif
2138        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2139          {
2140          *errorcodeptr = ERR52;
2141          goto FAILED;
2142          }
2143    
2144        /* There is at least one situation where code goes backwards: this is the
2145        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2146        the class is simply eliminated. However, it is created first, so we have to
2147        allow memory for it. Therefore, don't ever reduce the length at this point.
2148        */
2149    
2150        if (code < last_code) code = last_code;
2151        *lengthptr += code - last_code;
2152        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2153    
2154        /* If "previous" is set and it is not at the start of the work space, move
2155        it back to there, in order to avoid filling up the work space. Otherwise,
2156        if "previous" is NULL, reset the current code pointer to the start. */
2157    
2158        if (previous != NULL)
2159          {
2160          if (previous > orig_code)
2161            {
2162            memmove(orig_code, previous, code - previous);
2163            code -= previous - orig_code;
2164            previous = orig_code;
2165            }
2166          }
2167        else code = orig_code;
2168    
2169        /* Remember where this code item starts so we can pick up the length
2170        next time round. */
2171    
2172        last_code = code;
2173        }
2174    
2175      /* In the real compile phase, just check the workspace used by the forward
2176      reference list. */
2177    
2178      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2179        {
2180        *errorcodeptr = ERR52;
2181        goto FAILED;
2182        }
2183    
2184    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2185    
2186    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1623  for (;; ptr++) Line 2195  for (;; ptr++)
2195        {        {
2196        if (previous_callout != NULL)        if (previous_callout != NULL)
2197          {          {
2198          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2199              complete_callout(previous_callout, ptr, cd);
2200          previous_callout = NULL;          previous_callout = NULL;
2201          }          }
2202        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1644  for (;; ptr++) Line 2217  for (;; ptr++)
2217    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2218         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2219      {      {
2220      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2221          complete_callout(previous_callout, ptr, cd);
2222      previous_callout = NULL;      previous_callout = NULL;
2223      }      }
2224    
# Line 1655  for (;; ptr++) Line 2229  for (;; ptr++)
2229      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2230      if (c == '#')      if (c == '#')
2231        {        {
2232        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2233        on the Macintosh. */          {
2234        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2235        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2236          if (*ptr != 0) continue;
2237    
2238          /* Else fall through to handle end of string */
2239          c = 0;
2240        }        }
2241      }      }
2242    
# Line 1672  for (;; ptr++) Line 2250  for (;; ptr++)
2250    
2251    switch(c)    switch(c)
2252      {      {
2253      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2254        case 0:                        /* The branch terminates at string end */
2255      case 0:      case '|':                      /* or | or ) */
     case '|':  
2256      case ')':      case ')':
2257      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2258      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2259      *codeptr = code;      *codeptr = code;
2260      *ptrptr = ptr;      *ptrptr = ptr;
2261        if (lengthptr != NULL)
2262          {
2263          *lengthptr += code - last_code;   /* To include callout length */
2264          DPRINTF((">> end branch\n"));
2265          }
2266      return TRUE;      return TRUE;
2267    
2268    
2269        /* ===================================================================*/
2270      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2271      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2272    
# Line 1711  for (;; ptr++) Line 2295  for (;; ptr++)
2295      *code++ = OP_ANY;      *code++ = OP_ANY;
2296      break;      break;
2297    
2298      /* Character classes. If the included characters are all < 255 in value, we  
2299      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2300      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2301      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2302      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2303        map as usual, then invert it at the end. However, we use a different opcode
2304        so that data characters > 255 can be handled correctly.
2305    
2306      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2307      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1749  for (;; ptr++) Line 2335  for (;; ptr++)
2335        }        }
2336    
2337      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2338      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2339      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2340    
2341      class_charcount = 0;      class_charcount = 0;
2342      class_lastchar = -1;      class_lastchar = -1;
2343    
2344        /* Initialize the 32-char bit map to all zeros. We build the map in a
2345        temporary bit of memory, in case the class contains only 1 character (less
2346        than 256), because in that case the compiled code doesn't use the bit map.
2347        */
2348    
2349        memset(classbits, 0, 32 * sizeof(uschar));
2350    
2351  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2352      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2353      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2354  #endif  #endif
2355    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2356      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2357      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2358      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2359    
2360      do      if (c != 0) do
2361        {        {
2362          const uschar *oldptr;
2363    
2364  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2365        if (utf8 && c > 127)        if (utf8 && c > 127)
2366          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1786  for (;; ptr++) Line 2372  for (;; ptr++)
2372    
2373        if (inescq)        if (inescq)
2374          {          {
2375          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2376            {            {
2377            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2378            ptr++;            ptr++;                            /* Skip the 'E' */
2379            continue;            continue;                         /* Carry on with next */
2380            }            }
2381          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2382          }          }
2383    
2384        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1806  for (;; ptr++) Line 2392  for (;; ptr++)
2392            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2393          {          {
2394          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2395          int posix_class, i;          int posix_class, taboffset, tabopt;
2396          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2397            uschar pbits[32];
2398    
2399          if (ptr[1] != ':')          if (ptr[1] != ':')
2400            {            {
# Line 1836  for (;; ptr++) Line 2423  for (;; ptr++)
2423          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2424            posix_class = 0;            posix_class = 0;
2425    
2426          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2427          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2428          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2429          white space chars afterwards. */          result into the bit map that is being built. */
2430    
2431          posix_class *= 3;          posix_class *= 3;
2432          for (i = 0; i < 3; i++)  
2433            /* Copy in the first table (always present) */
2434    
2435            memcpy(pbits, cbits + posix_class_maps[posix_class],
2436              32 * sizeof(uschar));
2437    
2438            /* If there is a second table, add or remove it as required. */
2439    
2440            taboffset = posix_class_maps[posix_class + 1];
2441            tabopt = posix_class_maps[posix_class + 2];
2442    
2443            if (taboffset >= 0)
2444            {            {
2445            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2446            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2447            else            else
2448              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2449            }            }
2450    
2451            /* Now see if we need to remove any special characters. An option
2452            value of 1 removes vertical space and 2 removes underscore. */
2453    
2454            if (tabopt < 0) tabopt = -tabopt;
2455            if (tabopt == 1) pbits[1] &= ~0x3c;
2456              else if (tabopt == 2) pbits[11] &= 0x7f;
2457    
2458            /* Add the POSIX table or its complement into the main table that is
2459            being built and we are done. */
2460    
2461            if (local_negate)
2462              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2463            else
2464              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2465    
2466          ptr = tempptr + 1;          ptr = tempptr + 1;
2467          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2468          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2469          }          }
2470    
2471        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2472        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2473        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2474        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2475        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2476        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2477    
2478        if (c == '\\')        if (c == '\\')
2479          {          {
2480          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2481            if (*errorcodeptr != 0) goto FAILED;
2482    
2483          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2484          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2485            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2486          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2487            {            {
2488            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1895  for (;; ptr++) Line 2497  for (;; ptr++)
2497            {            {
2498            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2499            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2500            switch (-c)  
2501              /* Save time by not doing this in the pre-compile phase. */
2502    
2503              if (lengthptr == NULL) switch (-c)
2504              {              {
2505              case ESC_d:              case ESC_d:
2506              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1923  for (;; ptr++) Line 2528  for (;; ptr++)
2528              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2529              continue;              continue;
2530    
2531                case ESC_E: /* Perl ignores an orphan \E */
2532                continue;
2533    
2534                default:    /* Not recognized; fall through */
2535                break;      /* Need "default" setting to stop compiler warning. */
2536                }
2537    
2538              /* In the pre-compile phase, just do the recognition. */
2539    
2540              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2541                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2542    
2543              /* We need to deal with \P and \p in both phases. */
2544    
2545  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2546              case ESC_p:            if (-c == ESC_p || -c == ESC_P)
2547              case ESC_P:              {
2548                {              BOOL negated;
2549                BOOL negated;              int pdata;
2550                int property = get_ucp(&ptr, &negated, errorcodeptr);              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2551                if (property < 0) goto FAILED;              if (ptype < 0) goto FAILED;
2552                class_utf8 = TRUE;              class_utf8 = TRUE;
2553                *class_utf8data++ = ((-c == ESC_p) != negated)?              *class_utf8data++ = ((-c == ESC_p) != negated)?
2554                  XCL_PROP : XCL_NOTPROP;                XCL_PROP : XCL_NOTPROP;
2555                *class_utf8data++ = property;              *class_utf8data++ = ptype;
2556                class_charcount -= 2;   /* Not a < 256 character */              *class_utf8data++ = pdata;
2557                }              class_charcount -= 2;   /* Not a < 256 character */
2558              continue;              continue;
2559                }
2560  #endif  #endif
2561              /* Unrecognized escapes are faulted if PCRE is running in its
2562              strict mode. By default, for compatibility with Perl, they are
2563              treated as literals. */
2564    
2565              /* Unrecognized escapes are faulted if PCRE is running in its            if ((options & PCRE_EXTRA) != 0)
2566              strict mode. By default, for compatibility with Perl, they are              {
2567              treated as literals. */              *errorcodeptr = ERR7;
2568                goto FAILED;
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2569              }              }
2570    
2571              class_charcount -= 2;  /* Undo the default count from above */
2572              c = *ptr;              /* Get the final character and fall through */
2573            }            }
2574    
2575          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
2576          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
2577    
2578          }   /* End of backslash handling */          }   /* End of backslash handling */
2579    
2580        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2581        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2582        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2583          entirely. The code for handling \Q and \E is messy. */
2584    
2585          CHECK_RANGE:
2586          while (ptr[1] == '\\' && ptr[2] == 'E')
2587            {
2588            inescq = FALSE;
2589            ptr += 2;
2590            }
2591    
2592          oldptr = ptr;
2593    
2594        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
2595          {          {
2596          int d;          int d;
2597          ptr += 2;          ptr += 2;
2598            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2599    
2600            /* If we hit \Q (not followed by \E) at this point, go into escaped
2601            mode. */
2602    
2603            while (*ptr == '\\' && ptr[1] == 'Q')
2604              {
2605              ptr += 2;
2606              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2607              inescq = TRUE;
2608              break;
2609              }
2610    
2611            if (*ptr == 0 || (!inescq && *ptr == ']'))
2612              {
2613              ptr = oldptr;
2614              goto LONE_SINGLE_CHARACTER;
2615              }
2616    
2617  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2618          if (utf8)          if (utf8)
# Line 1981  for (;; ptr++) Line 2627  for (;; ptr++)
2627          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2628          in such circumstances. */          in such circumstances. */
2629    
2630          if (d == '\\')          if (!inescq && d == '\\')
2631            {            {
2632            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2633            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2634    
2635            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backslash; \X is literal X; \R is literal R; any other
2636            was literal */            special means the '-' was literal */
2637    
2638            if (d < 0)            if (d < 0)
2639              {              {
2640              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2641              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2642                else if (d == -ESC_R) d = 'R'; else
2643                {                {
2644                ptr = oldptr - 2;                ptr = oldptr;
2645                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2646                }                }
2647              }              }
2648            }            }
2649    
2650          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2651          the pre-pass. Optimize one-character ranges */          one-character ranges */
2652    
2653            if (d < c)
2654              {
2655              *errorcodeptr = ERR8;
2656              goto FAILED;
2657              }
2658    
2659          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2660    
# Line 2022  for (;; ptr++) Line 2675  for (;; ptr++)
2675  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2676            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2677              {              {
2678              int occ, ocd;              unsigned int occ, ocd;
2679              int cc = c;              unsigned int cc = c;
2680              int origd = d;              unsigned int origd = d;
2681              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2682                {                {
2683                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
# Line 2082  for (;; ptr++) Line 2735  for (;; ptr++)
2735          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2736          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2737    
2738          for (; c <= d; c++)          class_charcount += d - c + 1;
2739            class_lastchar = d;
2740    
2741            /* We can save a bit of time by skipping this in the pre-compile. */
2742    
2743            if (lengthptr == NULL) for (; c <= d; c++)
2744            {            {
2745            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2746            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2090  for (;; ptr++) Line 2748  for (;; ptr++)
2748              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2749              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2750              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2751            }            }
2752    
2753          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2115  for (;; ptr++) Line 2771  for (;; ptr++)
2771  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2772          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2773            {            {
2774            int chartype;            unsigned int othercase;
2775            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
2776              {              {
2777              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
2778              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2143  for (;; ptr++) Line 2797  for (;; ptr++)
2797          }          }
2798        }        }
2799    
2800      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
2801      loop. This "while" is the end of the "do" above. */  
2802        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2803    
2804      while ((c = *(++ptr)) != ']' || inescq);      if (c == 0)                          /* Missing terminating ']' */
2805          {
2806          *errorcodeptr = ERR6;
2807          goto FAILED;
2808          }
2809    
2810      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
2811      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2210  for (;; ptr++) Line 2869  for (;; ptr++)
2869    
2870      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
2871      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
2872      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
2873    
2874  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2875      if (class_utf8)      if (class_utf8)
# Line 2220  for (;; ptr++) Line 2879  for (;; ptr++)
2879        code += LINK_SIZE;        code += LINK_SIZE;
2880        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
2881    
2882        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
2883        the extra data */        otherwise just move the code pointer to the end of the extra data. */
2884    
2885        if (class_charcount > 0)        if (class_charcount > 0)
2886          {          {
2887          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
2888            memmove(code + 32, code, class_utf8data - code);
2889          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
2890          code = class_utf8data;          code = class_utf8data + 32;
2891          }          }
2892          else code = class_utf8data;
2893    
2894        /* If the map is not required, slide down the extra data. */        /* Now fill in the complete length of the item */
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
         }  
   
       /* Now fill in the complete length of the item */  
2895    
2896        PUT(previous, 1, code - previous);        PUT(previous, 1, code - previous);
2897        break;   /* End of class handling */        break;   /* End of class handling */
# Line 2254  for (;; ptr++) Line 2906  for (;; ptr++)
2906      if (negate_class)      if (negate_class)
2907        {        {
2908        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
2909        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
2910            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2911        }        }
2912      else      else
2913        {        {
# Line 2264  for (;; ptr++) Line 2917  for (;; ptr++)
2917      code += 32;      code += 32;
2918      break;      break;
2919    
2920    
2921        /* ===================================================================*/
2922      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2923      has been tested above. */      has been tested above. */
2924    
# Line 2331  for (;; ptr++) Line 2986  for (;; ptr++)
2986        }        }
2987      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2988    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
2989      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
2990      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
2991      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2378  for (;; ptr++) Line 3019  for (;; ptr++)
3019          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3020          }          }
3021    
3022          /* If the repetition is unlimited, it pays to see if the next thing on
3023          the line is something that cannot possibly match this character. If so,
3024          automatically possessifying this item gains some performance in the case
3025          where the match fails. */
3026    
3027          if (!possessive_quantifier &&
3028              repeat_max < 0 &&
3029              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3030                options, cd))
3031            {
3032            repeat_type = 0;    /* Force greedy */
3033            possessive_quantifier = TRUE;
3034            }
3035    
3036        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3037        }        }
3038    
3039      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3040      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3041      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3042      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3043        currently used only for single-byte chars. */
3044    
3045      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3046        {        {
3047        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3048        c = previous[1];        c = previous[1];
3049          if (!possessive_quantifier &&
3050              repeat_max < 0 &&
3051              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3052            {
3053            repeat_type = 0;    /* Force greedy */
3054            possessive_quantifier = TRUE;
3055            }
3056        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3057        }        }
3058    
# Line 2403  for (;; ptr++) Line 3066  for (;; ptr++)
3066      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3067        {        {
3068        uschar *oldcode;        uschar *oldcode;
3069        int prop_type;        int prop_type, prop_value;
3070        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3071        c = *previous;        c = *previous;
3072    
3073          if (!possessive_quantifier &&
3074              repeat_max < 0 &&
3075              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3076            {
3077            repeat_type = 0;    /* Force greedy */
3078            possessive_quantifier = TRUE;
3079            }
3080    
3081        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3082        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3083          previous[1] : -1;          {
3084            prop_type = previous[1];
3085            prop_value = previous[2];
3086            }
3087          else prop_type = prop_value = -1;
3088    
3089        oldcode = code;        oldcode = code;
3090        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2443  for (;; ptr++) Line 3118  for (;; ptr++)
3118          }          }
3119    
3120        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3121        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3122        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3123        one less than the maximum. */        one less than the maximum. */
3124    
# Line 2470  for (;; ptr++) Line 3145  for (;; ptr++)
3145    
3146          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3147          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3148          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3149          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3150          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3151    
# Line 2486  for (;; ptr++) Line 3161  for (;; ptr++)
3161  #endif  #endif
3162              {              {
3163              *code++ = c;              *code++ = c;
3164              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3165                  {
3166                  *code++ = prop_type;
3167                  *code++ = prop_value;
3168                  }
3169              }              }
3170            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3171            }            }
3172    
3173          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3174          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3175            UPTO is just for 1 instance, we can use QUERY instead. */
3176    
3177          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3178            {            {
# Line 2505  for (;; ptr++) Line 3185  for (;; ptr++)
3185            else            else
3186  #endif  #endif
3187            *code++ = c;            *code++ = c;
3188            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3189                {
3190                *code++ = prop_type;
3191                *code++ = prop_value;
3192                }
3193            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3194            *code++ = OP_UPTO + repeat_type;  
3195            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3196                {
3197                *code++ = OP_QUERY + repeat_type;
3198                }
3199              else
3200                {
3201                *code++ = OP_UPTO + repeat_type;
3202                PUT2INC(code, 0, repeat_max);
3203                }
3204            }            }
3205          }          }
3206    
# Line 2524  for (;; ptr++) Line 3216  for (;; ptr++)
3216  #endif  #endif
3217        *code++ = c;        *code++ = c;
3218    
3219        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3220        defines the required property. */        define the required property. */
3221    
3222  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3223        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3224            {
3225            *code++ = prop_type;
3226            *code++ = prop_value;
3227            }
3228  #endif  #endif
3229        }        }
3230    
# Line 2571  for (;; ptr++) Line 3267  for (;; ptr++)
3267      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3268      cases. */      cases. */
3269    
3270      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3271               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3272        {        {
3273        register int i;        register int i;
3274        int ketoffset = 0;        int ketoffset = 0;
3275        int len = code - previous;        int len = code - previous;
3276        uschar *bralink = NULL;        uschar *bralink = NULL;
3277    
3278          /* Repeating a DEFINE group is pointless */
3279    
3280          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3281            {
3282            *errorcodeptr = ERR55;
3283            goto FAILED;
3284            }
3285    
3286          /* This is a paranoid check to stop integer overflow later on */
3287    
3288          if (len > MAX_DUPLENGTH)
3289            {
3290            *errorcodeptr = ERR50;
3291            goto FAILED;
3292            }
3293    
3294        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3295        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3296        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2613  for (;; ptr++) Line 3325  for (;; ptr++)
3325          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3326          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3327          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3328          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3329          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3330            doing this. */
3331    
3332          if (repeat_max <= 1)          if (repeat_max <= 1)
3333            {            {
3334            *code = OP_END;            *code = OP_END;
3335            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3336            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3337            code++;            code++;
3338            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2637  for (;; ptr++) Line 3350  for (;; ptr++)
3350            {            {
3351            int offset;            int offset;
3352            *code = OP_END;            *code = OP_END;
3353            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3354            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3355            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3356            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3370  for (;; ptr++)
3370        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3371        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3372        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3373        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3374          forward reference subroutine calls in the group, there will be entries on
3375          the workspace list; replicate these with an appropriate increment. */
3376    
3377        else        else
3378          {          {
3379          if (repeat_min > 1)          if (repeat_min > 1)
3380            {            {
3381            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3382            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3383    
3384              if (lengthptr != NULL)
3385                *lengthptr += (repeat_min - 1)*length_prevgroup;
3386    
3387              /* This is compiling for real */
3388    
3389              else
3390              {              {
3391              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3392              code += len;              for (i = 1; i < repeat_min; i++)
3393                  {
3394                  uschar *hc;
3395                  uschar *this_hwm = cd->hwm;
3396                  memcpy(code, previous, len);
3397                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3398                    {
3399                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3400                    cd->hwm += LINK_SIZE;
3401                    }
3402                  save_hwm = this_hwm;
3403                  code += len;
3404                  }
3405              }              }
3406            }            }
3407    
3408          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3409          }          }
3410    
# Line 2677  for (;; ptr++) Line 3412  for (;; ptr++)
3412        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3413        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3414        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3415        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3416          replicate entries on the forward reference list. */
3417    
3418        if (repeat_max >= 0)        if (repeat_max >= 0)
3419          {          {
3420          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3421            just adjust the length as if we had. For each repetition we must add 1
3422            to the length for BRAZERO and for all but the last repetition we must
3423            add 2 + 2*LINKSIZE to allow for the nesting that occurs. */
3424    
3425            if (lengthptr != NULL && repeat_max > 0)
3426              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3427                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3428    
3429            /* This is compiling for real */
3430    
3431            else for (i = repeat_max - 1; i >= 0; i--)
3432            {            {
3433              uschar *hc;
3434              uschar *this_hwm = cd->hwm;
3435    
3436            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3437    
3438            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2698  for (;; ptr++) Line 3448  for (;; ptr++)
3448              }              }
3449    
3450            memcpy(code, previous, len);            memcpy(code, previous, len);
3451              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3452                {
3453                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3454                cd->hwm += LINK_SIZE;
3455                }
3456              save_hwm = this_hwm;
3457            code += len;            code += len;
3458            }            }
3459    
# Line 2720  for (;; ptr++) Line 3476  for (;; ptr++)
3476        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3477        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3478        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3479        correct offset was computed above. */        correct offset was computed above.
3480    
3481          Then, when we are doing the actual compile phase, check to see whether
3482          this group is a non-atomic one that could match an empty string. If so,
3483          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3484          that runtime checking can be done. [This check is also applied to
3485          atomic groups at runtime, but in a different way.] */
3486    
3487        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3488            {
3489            uschar *ketcode = code - ketoffset;
3490            uschar *bracode = ketcode - GET(ketcode, 1);
3491            *ketcode = OP_KETRMAX + repeat_type;
3492            if (lengthptr == NULL && *bracode != OP_ONCE)
3493              {
3494              uschar *scode = bracode;
3495              do
3496                {
3497                if (could_be_empty_branch(scode, ketcode, utf8))
3498                  {
3499                  *bracode += OP_SBRA - OP_BRA;
3500                  break;
3501                  }
3502                scode += GET(scode, 1);
3503                }
3504              while (*scode == OP_ALT);
3505              }
3506            }
3507        }        }
3508    
3509      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2733  for (;; ptr++) Line 3514  for (;; ptr++)
3514        goto FAILED;        goto FAILED;
3515        }        }
3516    
3517      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3518      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3519      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is an special alternative opcode for this. For
3520      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3521      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3522        but the special opcodes can optimize it a bit. The repeated item starts at
3523        tempcode, not at previous, which might be the first part of a string whose
3524        (former) last char we repeated.
3525    
3526        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3527        an 'upto' may follow. We skip over an 'exact' item, and then test the
3528        length of what remains before proceeding. */
3529    
3530      if (possessive_quantifier)      if (possessive_quantifier)
3531        {        {
3532        int len = code - tempcode;        int len;
3533        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3534        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3535        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3536        tempcode[0] = OP_ONCE;        len = code - tempcode;
3537        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3538        PUTINC(code, 0, len);          {
3539        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3540            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3541            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3542            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3543    
3544            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3545            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3546            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3547            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3548    
3549            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3550            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3551            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3552            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3553    
3554            default:
3555            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3556            code += 1 + LINK_SIZE;
3557            len += 1 + LINK_SIZE;
3558            tempcode[0] = OP_ONCE;
3559            *code++ = OP_KET;
3560            PUTINC(code, 0, len);
3561            PUT(tempcode, 1, len);
3562            break;
3563            }
3564        }        }
3565    
3566      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2761  for (;; ptr++) Line 3573  for (;; ptr++)
3573      break;      break;
3574    
3575    
3576      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3577      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3578      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3579      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3580      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3581      check for syntax errors here.  */      group. */
3582    
3583      case '(':      case '(':
3584      newoptions = options;      newoptions = options;
3585      skipbytes = 0;      skipbytes = 0;
3586        bravalue = OP_CBRA;
3587        save_hwm = cd->hwm;
3588        reset_bracount = FALSE;
3589    
3590      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3591        {        {
3592        int set, unset;        int i, set, unset, namelen;
3593        int *optset;        int *optset;
3594          const uschar *name;
3595          uschar *slot;
3596    
3597        switch (*(++ptr))        switch (*(++ptr))
3598          {          {
3599          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3600          ptr++;          ptr++;
3601          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3602            if (*ptr == 0)
3603              {
3604              *errorcodeptr = ERR18;
3605              goto FAILED;
3606              }
3607          continue;          continue;
3608    
3609          case ':':                 /* Non-extracting bracket */  
3610            /* ------------------------------------------------------------ */
3611            case '|':                 /* Reset capture count for each branch */
3612            reset_bracount = TRUE;
3613            /* Fall through */
3614    
3615            /* ------------------------------------------------------------ */
3616            case ':':                 /* Non-capturing bracket */
3617          bravalue = OP_BRA;          bravalue = OP_BRA;
3618          ptr++;          ptr++;
3619          break;          break;
3620    
3621    
3622            /* ------------------------------------------------------------ */
3623          case '(':          case '(':
3624          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3625    
3626          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3627            group), a name (referring to a named group), or 'R', referring to
3628            recursion. R<digits> and R&name are also permitted for recursion tests.
3629    
3630            There are several syntaxes for testing a named group: (?(name)) is used
3631            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3632    
3633            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3634            be the recursive thing or the name 'R' (and similarly for 'R' followed
3635            by digits), and (b) a number could be a name that consists of digits.
3636            In both cases, we look for a name first; if not found, we try the other
3637            cases. */
3638    
3639            /* For conditions that are assertions, check the syntax, and then exit
3640            the switch. This will take control down to where bracketed groups,
3641            including assertions, are processed. */
3642    
3643            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3644              break;
3645    
3646            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3647            below), and all need to skip 3 bytes at the start of the group. */
3648    
3649            code[1+LINK_SIZE] = OP_CREF;
3650            skipbytes = 3;
3651            refsign = -1;
3652    
3653            /* Check for a test for recursion in a named group. */
3654    
3655            if (ptr[1] == 'R' && ptr[2] == '&')
3656              {
3657              terminator = -1;
3658              ptr += 2;
3659              code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3660              }
3661    
3662            /* Check for a test for a named group's having been set, using the Perl
3663            syntax (?(<name>) or (?('name') */
3664    
3665            else if (ptr[1] == '<')
3666              {
3667              terminator = '>';
3668              ptr++;
3669              }
3670            else if (ptr[1] == '\'')
3671              {
3672              terminator = '\'';
3673              ptr++;
3674              }
3675            else
3676              {
3677              terminator = 0;
3678              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3679              }
3680    
3681            /* We now expect to read a name; anything else is an error */
3682    
3683            if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3684              {
3685              ptr += 1;  /* To get the right offset */
3686              *errorcodeptr = ERR28;
3687              goto FAILED;
3688              }
3689    
3690            /* Read the name, but also get it as a number if it's all digits */
3691    
3692            recno = 0;
3693            name = ++ptr;
3694            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3695              {
3696              if (recno >= 0)
3697                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3698                  recno * 10 + *ptr - '0' : -1;
3699              ptr++;
3700              }
3701            namelen = ptr - name;
3702    
3703          if (ptr[1] == 'R')          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3704            {            {
3705            code[1+LINK_SIZE] = OP_CREF;            ptr--;      /* Error offset */
3706            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            *errorcodeptr = ERR26;
3707            skipbytes = 3;            goto FAILED;
           ptr += 3;  
3708            }            }
3709    
3710          /* Condition to test for a numbered subpattern match. We know that          /* Do no further checking in the pre-compile phase. */
3711          if a digit follows ( then there will just be digits until ) because  
3712          the syntax was checked in the first pass. */          if (lengthptr != NULL) break;
3713    
3714          else if ((digitab[ptr[1]] && ctype_digit) != 0)          /* In the real compile we do the work of looking for the actual
3715            reference. If the string started with "+" or "-" we require the rest to
3716            be digits, in which case recno will be set. */
3717    
3718            if (refsign > 0)
3719            {            {
3720            int condref;                 /* Don't amalgamate; some compilers */            if (recno <= 0)
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
3721              {              {
3722              *errorcodeptr = ERR35;              *errorcodeptr = ERR58;
3723              goto FAILED;              goto FAILED;
3724              }              }
3725            ptr++;            if (refsign == '-')
3726            code[1+LINK_SIZE] = OP_CREF;              {
3727            PUT2(code, 2+LINK_SIZE, condref);              recno = cd->bracount - recno + 1;
3728            skipbytes = 3;              if (recno <= 0)
3729                  {
3730                  *errorcodeptr = ERR15;
3731                  goto FAILED;
3732                  }
3733                }
3734              else recno += cd->bracount;
3735              PUT2(code, 2+LINK_SIZE, recno);
3736              break;
3737              }
3738    
3739            /* Otherwise (did not start with "+" or "-"), start by looking for the
3740            name. */
3741    
3742            slot = cd->name_table;
3743            for (i = 0; i < cd->names_found; i++)
3744              {
3745              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3746              slot += cd->name_entry_size;
3747              }
3748    
3749            /* Found a previous named subpattern */
3750    
3751            if (i < cd->names_found)
3752              {
3753              recno = GET2(slot, 0);
3754              PUT2(code, 2+LINK_SIZE, recno);
3755              }
3756    
3757            /* Search the pattern for a forward reference */
3758    
3759            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3760                            (options & PCRE_EXTENDED) != 0)) > 0)
3761              {
3762              PUT2(code, 2+LINK_SIZE, i);
3763              }
3764    
3765            /* If terminator == 0 it means that the name followed directly after
3766            the opening parenthesis [e.g. (?(abc)...] and in this case there are
3767            some further alternatives to try. For the cases where terminator != 0
3768            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3769            now checked all the possibilities, so give an error. */
3770    
3771            else if (terminator != 0)
3772              {
3773              *errorcodeptr = ERR15;
3774              goto FAILED;
3775              }
3776    
3777            /* Check for (?(R) for recursion. Allow digits after R to specify a
3778            specific group number. */
3779    
3780            else if (*name == 'R')
3781              {
3782              recno = 0;
3783              for (i = 1; i < namelen; i++)
3784                {
3785                if ((digitab[name[i]] & ctype_digit) == 0)
3786                  {
3787                  *errorcodeptr = ERR15;
3788                  goto FAILED;
3789                  }
3790                recno = recno * 10 + name[i] - '0';
3791                }
3792              if (recno == 0) recno = RREF_ANY;
3793              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3794              PUT2(code, 2+LINK_SIZE, recno);
3795              }
3796    
3797            /* Similarly, check for the (?(DEFINE) "condition", which is always
3798            false. */
3799    
3800            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3801              {
3802              code[1+LINK_SIZE] = OP_DEF;
3803              skipbytes = 1;
3804              }
3805    
3806            /* Check for the "name" actually being a subpattern number. */
3807    
3808            else if (recno > 0)
3809              {
3810              PUT2(code, 2+LINK_SIZE, recno);
3811              }
3812    
3813            /* Either an unidentified subpattern, or a reference to (?(0) */
3814    
3815            else
3816              {
3817              *errorcodeptr = (recno == 0)? ERR35: ERR15;
3818              goto FAILED;
3819            }            }
         /* For conditions that are assertions, we just fall through, having  
         set bravalue above. */  
3820          break;          break;
3821    
3822    
3823            /* ------------------------------------------------------------ */
3824          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
3825          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
3826          ptr++;          ptr++;
3827          break;          break;
3828    
3829    
3830            /* ------------------------------------------------------------ */
3831          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
3832          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
3833          ptr++;          ptr++;
3834          break;          break;
3835    
3836          case '<':                 /* Lookbehinds */  
3837          switch (*(++ptr))          /* ------------------------------------------------------------ */
3838            case '<':                 /* Lookbehind or named define */
3839            switch (ptr[1])
3840            {            {
3841            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
3842            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
3843            ptr++;            ptr += 2;
3844            break;            break;
3845    
3846            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
3847            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
3848            ptr++;            ptr += 2;
3849            break;            break;
3850    
3851              default:                /* Could be name define, else bad */
3852              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3853              ptr++;                  /* Correct offset for error */
3854              *errorcodeptr = ERR24;
3855              goto FAILED;
3856            }            }
3857          break;          break;
3858    
3859    
3860            /* ------------------------------------------------------------ */
3861          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
3862          bravalue = OP_ONCE;          bravalue = OP_ONCE;
3863          ptr++;          ptr++;
3864          break;          break;
3865    
3866    
3867            /* ------------------------------------------------------------ */
3868          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
3869          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
3870          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
3871          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
3872            {                       /* closing parenthesis is present. */            {
3873            int n = 0;            int n = 0;
3874            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
3875              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
3876              if (*ptr != ')')
3877                {
3878                *errorcodeptr = ERR39;
3879                goto FAILED;
3880                }
3881            if (n > 255)            if (n > 255)
3882              {              {
3883              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 2876  for (;; ptr++) Line 3891  for (;; ptr++)
3891          previous = NULL;          previous = NULL;
3892          continue;          continue;
3893    
3894          case 'P':                 /* Named subpattern handling */  
3895          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
3896            case 'P':                 /* Python-style named subpattern handling */
3897            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
3898              {
3899              is_recurse = *ptr == '>';
3900              terminator = ')';
3901              goto NAMED_REF_OR_RECURSE;
3902              }
3903            else if (*ptr != '<')    /* Test for Python-style definition */
3904              {
3905              *errorcodeptr = ERR41;
3906              goto FAILED;
3907              }
3908            /* Fall through to handle (?P< as (?< is handled */
3909    
3910    
3911            /* ------------------------------------------------------------ */
3912            DEFINE_NAME:    /* Come here from (?< handling */
3913            case '\'':
3914            {            {
3915            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
3916            uschar *slot = cd->name_table;            name = ++ptr;
           const uschar *name;     /* Don't amalgamate; some compilers */  
           name = ++ptr;           /* grumble at autoincrement in declaration */  
3917    
3918            while (*ptr++ != '>');            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3919            namelen = ptr - name - 1;            namelen = ptr - name;
3920    
3921            for (i = 0; i < cd->names_found; i++)            /* In the pre-compile phase, just do a syntax check. */
3922    
3923              if (lengthptr != NULL)
3924              {              {
3925              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
3926              if (crc == 0)                {
3927                  *errorcodeptr = ERR42;
3928                  goto FAILED;
3929                  }
3930                if (cd->names_found >= MAX_NAME_COUNT)
3931                  {
3932                  *errorcodeptr = ERR49;
3933                  goto FAILED;
3934                  }
3935                if (namelen + 3 > cd->name_entry_size)
3936                {                {
3937                if (slot[2+namelen] == 0)                cd->name_entry_size = namelen + 3;
3938                  if (namelen > MAX_NAME_SIZE)
3939                  {                  {
3940                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
3941                  goto FAILED;                  goto FAILED;
3942                  }                  }
               crc = -1;             /* Current name is substring */  
3943                }                }
3944              if (crc < 0)              }
3945    
3946              /* In the real compile, create the entry in the table */
3947    
3948              else
3949                {
3950                slot = cd->name_table;
3951                for (i = 0; i < cd->names_found; i++)
3952                {                {
3953                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
3954                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
3955                break;                  {
3956                    if (slot[2+namelen] == 0)
3957                      {
3958                      if ((options & PCRE_DUPNAMES) == 0)
3959                        {
3960                        *errorcodeptr = ERR43;
3961                        goto FAILED;
3962                        }
3963                      }
3964                    else crc = -1;      /* Current name is substring */
3965                    }
3966                  if (crc < 0)
3967                    {
3968                    memmove(slot + cd->name_entry_size, slot,
3969                      (cd->names_found - i) * cd->name_entry_size);
3970                    break;
3971                    }
3972                  slot += cd->name_entry_size;
3973                }                }
             slot += cd->name_entry_size;  
             }  
3974    
3975            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
3976            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
3977            slot[2+namelen] = 0;              slot[2+namelen] = 0;
3978            cd->names_found++;              }
           goto NUMBERED_GROUP;  
3979            }            }
3980    
3981          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
3982    
3983            ptr++;                    /* Move past > or ' */
3984            cd->names_found++;
3985            goto NUMBERED_GROUP;
3986    
3987    
3988            /* ------------------------------------------------------------ */
3989            case '&':                 /* Perl recursion/subroutine syntax */
3990            terminator = ')';
3991            is_recurse = TRUE;
3992            /* Fall through */
3993    
3994            /* We come here from the Python syntax above that handles both
3995            references (?P=name) and recursion (?P>name), as well as falling
3996            through from the Perl recursion syntax (?&name). */
3997    
3998            NAMED_REF_OR_RECURSE:
3999            name = ++ptr;
4000            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4001            namelen = ptr - name;
4002    
4003            /* In the pre-compile phase, do a syntax check and set a dummy
4004            reference number. */
4005    
4006            if (lengthptr != NULL)
4007            {            {
4008            int i, namelen;            if (*ptr != terminator)
4009            int type = *ptr++;              {
4010            const uschar *name = ptr;              *errorcodeptr = ERR42;
4011            uschar *slot = cd->name_table;              goto FAILED;
4012                }
4013              if (namelen > MAX_NAME_SIZE)
4014                {
4015                *errorcodeptr = ERR48;
4016                goto FAILED;
4017                }
4018              recno = 0;
4019              }
4020    
4021            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4022    
4023            else
4024              {
4025              slot = cd->name_table;
4026            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4027              {              {
4028              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4029              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4030              }              }
4031            if (i >= cd->names_found)  
4032              if (i < cd->names_found)         /* Back reference */
4033                {
4034                recno = GET2(slot, 0);
4035                }
4036              else if ((recno =                /* Forward back reference */
4037                        find_parens(ptr, cd->bracount, name, namelen,
4038                          (options & PCRE_EXTENDED) != 0)) <= 0)
4039              {              {
4040              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4041              goto FAILED;              goto FAILED;
4042              }              }
4043              }
4044    
4045            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4046            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4047    
4048            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4049            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4050    
         /* Should never happen */  
         break;  
4051    
4052          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4053            case 'R':                 /* Recursion */
4054          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4055          /* Fall through */          /* Fall through */
4056    
         /* Recursion or "subroutine" call */  
4057    
4058          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4059          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4060            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4061            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4062            {            {
4063            const uschar *called;            const uschar *called;
4064    
4065              if ((refsign = *ptr) == '+') ptr++;
4066              else if (refsign == '-')
4067                {
4068                if ((digitab[ptr[1]] & ctype_digit) == 0)
4069                  goto OTHER_CHAR_AFTER_QUERY;
4070                ptr++;
4071                }
4072    
4073            recno = 0;            recno = 0;
4074            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4075              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4076    
4077            /* Come here from code above that handles a named recursion */            if (*ptr != ')')
   
           HANDLE_RECURSION:  
   
           previous = code;  
   
           /* Find the bracket that is being referenced. Temporarily end the  
           regex in case it doesn't exist. */  
   
           *code = OP_END;  
           called = (recno == 0)?  
             cd->start_code : find_bracket(cd->start_code, utf8, recno);  
   
           if (called == NULL)  
4078              {              {
4079              *errorcodeptr = ERR15;              *errorcodeptr = ERR29;
4080              goto FAILED;              goto FAILED;
4081              }              }
4082    
4083            /* If the subpattern is still open, this is a recursive call. We            if (refsign == '-')
4084            check to see if this is a left recursion that could loop for ever,              {
4085            and diagnose that case. */              if (recno == 0)
4086                  {
4087                  *errorcodeptr = ERR58;
4088                  goto FAILED;
4089                  }
4090                recno = cd->bracount - recno + 1;
4091                if (recno <= 0)
4092                  {
4093                  *errorcodeptr = ERR15;
4094                  goto FAILED;
4095                  }
4096                }
4097              else if (refsign == '+')
4098                {
4099                if (recno == 0)
4100                  {
4101                  *errorcodeptr = ERR58;
4102                  goto FAILED;
4103                  }
4104                recno += cd->bracount;
4105                }
4106    
4107              /* Come here from code above that handles a named recursion */
4108    
4109              HANDLE_RECURSION:
4110    
4111              previous = code;
4112              called = cd->start_code;
4113    
4114              /* When we are actually compiling, find the bracket that is being
4115              referenced. Temporarily end the regex in case it doesn't exist before
4116              this point. If we end up with a forward reference, first check that
4117              the bracket does occur later so we can give the error (and position)
4118              now. Then remember this forward reference in the workspace so it can
4119              be filled in at the end. */
4120    
4121            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))            if (lengthptr == NULL)
4122              {              {
4123              *errorcodeptr = ERR40;              *code = OP_END;
4124              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4125    
4126                /* Forward reference */
4127    
4128                if (called == NULL)
4129                  {
4130                  if (find_parens(ptr, cd->bracount, NULL, recno,
4131                       (options & PCRE_EXTENDED) != 0) < 0)
4132                    {
4133                    *errorcodeptr = ERR15;
4134                    goto FAILED;
4135                    }
4136                  called = cd->start_code + recno;
4137                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4138                  }
4139    
4140                /* If not a forward reference, and the subpattern is still open,
4141                this is a recursive call. We check to see if this is a left
4142                recursion that could loop for ever, and diagnose that case. */
4143    
4144                else if (GET(called, 1) == 0 &&
4145                         could_be_empty(called, code, bcptr, utf8))
4146                  {
4147                  *errorcodeptr = ERR40;
4148                  goto FAILED;
4149                  }
4150              }              }
4151    
4152            /* Insert the recursion/subroutine item */            /* Insert the recursion/subroutine item, automatically wrapped inside
4153              "once" brackets. Set up a "previous group" length so that a
4154              subsequent quantifier will work. */
4155    
4156              *code = OP_ONCE;
4157              PUT(code, 1, 2 + 2*LINK_SIZE);
4158              code += 1 + LINK_SIZE;
4159    
4160            *code = OP_RECURSE;            *code = OP_RECURSE;
4161            PUT(code, 1, called - cd->start_code);            PUT(code, 1, called - cd->start_code);
4162            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4163    
4164              *code = OP_KET;
4165              PUT(code, 1, 2 + 2*LINK_SIZE);
4166              code += 1 + LINK_SIZE;
4167    
4168              length_prevgroup = 3 + 3*LINK_SIZE;
4169            }            }
4170    
4171            /* Can't determine a first byte now */
4172    
4173            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4174          continue;          continue;
4175    
         /* Character after (? not specially recognized */  
4176    
4177          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4178            default:              /* Other characters: check option setting */
4179            OTHER_CHAR_AFTER_QUERY:
4180          set = unset = 0;          set = unset = 0;
4181          optset = &set;          optset = &set;
4182    
# Line 3016  for (;; ptr++) Line 4186  for (;; ptr++)
4186              {              {
4187              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4188    
4189                case 'J':    /* Record that it changed in the external options */
4190                *optset |= PCRE_DUPNAMES;
4191                cd->external_options |= PCRE_JCHANGED;
4192                break;
4193    
4194              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4195              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4196              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4197              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4198              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4199              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4200    
4201                default:  *errorcodeptr = ERR12;
4202                          ptr--;    /* Correct the offset */
4203                          goto FAILED;
4204              }              }
4205            }            }
4206    
# Line 3030  for (;; ptr++) Line 4209  for (;; ptr++)
4209          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4210    
4211          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4212          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4213          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4214          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4215          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4216          a group), a resetting item can be compiled.          caseless checking of required bytes.
4217    
4218          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4219          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4220          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4221            that value after the start, because it gets reset as code is discarded
4222            during the pre-compile. However, this can happen only at top level - if
4223            we are within parentheses, the starting BRA will still be present. At
4224            any parenthesis level, the length value can be used to test if anything
4225            has been compiled at that level. Thus, a test for both these conditions
4226            is necessary to ensure we correctly detect the start of the pattern in
4227            both phases.
4228    
4229            If we are not at the pattern start, compile code to change the ims
4230            options if this setting actually changes any of them. We also pass the
4231            new setting back so that it can be put at the start of any following
4232            branches, and when this group ends (if we are in a group), a resetting
4233            item can be compiled. */
4234    
4235          if (*ptr == ')')          if (*ptr == ')')
4236            {            {
4237            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4238                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4239              {              {
4240              *code++ = OP_OPT;              cd->external_options = newoptions;
4241              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4242              }              }
4243             else
4244                {
4245                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4246                  {
4247                  *code++ = OP_OPT;
4248                  *code++ = newoptions & PCRE_IMS;
4249                  }
4250    
4251            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4252            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4253            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4254    
4255            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4256            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4257            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4258            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4259                }
4260    
4261            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4262            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3068  for (;; ptr++) Line 4269  for (;; ptr++)
4269    
4270          bravalue = OP_BRA;          bravalue = OP_BRA;
4271          ptr++;          ptr++;
4272          }          }     /* End of switch for character following (? */
4273        }        }       /* End of (? handling */
4274    
4275      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4276      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4277        brackets. */
4278    
4279      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4280        {        {
4281        bravalue = OP_BRA;        bravalue = OP_BRA;
4282        }        }
4283    
4284      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4285    
4286      else      else
4287        {        {
4288        NUMBERED_GROUP:        NUMBERED_GROUP:
4289        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4290          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4291          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4292        }        }
4293    
4294      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4295      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4296      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4297      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4298        they have changed. */
4299    
4300      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4301      *code = bravalue;      *code = bravalue;
4302      tempcode = code;      tempcode = code;
4303      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4304        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4305    
4306      if (!compile_regex(      if (!compile_regex(
4307           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4308           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4309           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4310           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4311           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4312           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4313            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4314           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           reset_bracount,               /* True if (?| group */
4315             skipbytes,                    /* Skip over bracket number */
4316           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4317           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4318           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4319           cd))                          /* Tables block */           cd,                           /* Tables block */
4320             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4321               &length_prevgroup           /* Pre-compile phase */
4322             ))
4323        goto FAILED;        goto FAILED;
4324    
4325      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3128  for (;; ptr++) Line 4328  for (;; ptr++)
4328      is on the bracket. */      is on the bracket. */
4329    
4330      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4331      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4332        in the real compile phase, not in the pre-pass, where the whole group may
4333        not be available. */
4334    
4335      else if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4336        {        {
4337        uschar *tc = code;        uschar *tc = code;
4338        condcount = 0;        int condcount = 0;
4339    
4340        do {        do {
4341           condcount++;           condcount++;
# Line 3141  for (;; ptr++) Line 4343  for (;; ptr++)
4343           }           }
4344        while (*tc != OP_KET);        while (*tc != OP_KET);
4345    
4346        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4347          false). It must have only one branch. */
4348    
4349          if (code[LINK_SIZE+1] == OP_DEF)
4350          {          {
4351          *errorcodeptr = ERR27;          if (condcount > 1)
4352          goto FAILED;            {
4353              *errorcodeptr = ERR54;
4354              goto FAILED;
4355              }
4356            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4357            }
4358    
4359          /* A "normal" conditional group. If there is just one branch, we must not
4360          make use of its firstbyte or reqbyte, because this is equivalent to an
4361          empty second branch. */
4362    
4363          else
4364            {
4365            if (condcount > 2)
4366              {
4367              *errorcodeptr = ERR27;
4368              goto FAILED;
4369              }
4370            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4371          }          }
4372          }
4373    
4374        /* Error if hit end of pattern */
4375    
4376        /* If there is just one branch, we must not make use of its firstbyte or      if (*ptr != ')')
4377        reqbyte, because this is equivalent to an empty second branch. */        {
4378          *errorcodeptr = ERR14;
4379          goto FAILED;
4380          }
4381    
4382        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4383        group, less the brackets at either end. Then reduce the compiled code to
4384        just the brackets so that it doesn't use much memory if it is duplicated by
4385        a quantifier. */
4386    
4387        if (lengthptr != NULL)
4388          {
4389          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4390          code++;
4391          PUTINC(code, 0, 1 + LINK_SIZE);
4392          *code++ = OP_KET;
4393          PUTINC(code, 0, 1 + LINK_SIZE);
4394        }        }
4395    
4396      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4397      brackets of all kinds, and conditions with two branches (see code above).  
4398      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4399      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4400      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4401        relevant. */
4402    
4403        if (bravalue == OP_DEF) break;
4404    
4405        /* Handle updating of the required and first characters for other types of
4406        group. Update for normal brackets of all kinds, and conditions with two
4407        branches (see code above). If the bracket is followed by a quantifier with
4408        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4409        zerofirstbyte outside the main loop so that they can be accessed for the
4410        back off. */
4411    
4412      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4413      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4414      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4415    
4416      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4417        {        {
4418        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4419        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3204  for (;; ptr++) Line 4454  for (;; ptr++)
4454      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4455    
4456      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4457        break;     /* End of processing '(' */
4458    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
4459    
4460      /* Error if hit end of pattern */      /* ===================================================================*/
4461        /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
   
     case '\\':  
     tempptr = ptr;  
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
   
     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values  
4462      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4463      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4464      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4465      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4466      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4467    
4468        case '\\':
4469        tempptr = ptr;
4470        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4471        if (*errorcodeptr != 0) goto FAILED;
4472    
4473      if (c < 0)      if (c < 0)
4474        {        {
4475        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3242  for (;; ptr++) Line 4479  for (;; ptr++)
4479          continue;          continue;
4480          }          }
4481    
4482          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4483    
4484        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4485        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4486    
# Line 3253  for (;; ptr++) Line 4492  for (;; ptr++)
4492        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4493        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4494    
4495        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4496          We also support \k{name} (.NET syntax) */
4497    
4498          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4499            {
4500            is_recurse = FALSE;
4501            terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4502            goto NAMED_REF_OR_RECURSE;
4503            }
4504    
4505          /* Back references are handled specially; must disable firstbyte if
4506          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4507          ':' later. */
4508    
4509        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4510          {          {
4511          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4512    
4513            HANDLE_REFERENCE:    /* Come here from named backref handling */
4514            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4515          previous = code;          previous = code;
4516          *code++ = OP_REF;          *code++ = OP_REF;
4517          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4518            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4519            if (recno > cd->top_backref) cd->top_backref = recno;
4520          }          }
4521    
4522        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4523    
4524  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4525        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
4526          {          {
4527          BOOL negated;          BOOL negated;
4528          int value = get_ucp(&ptr, &negated, errorcodeptr);          int pdata;
4529            int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4530            if (ptype < 0) goto FAILED;
4531          previous = code;          previous = code;