/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC | revision 223 by ph10, Mon Aug 20 11:07:53 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 53  used by pcretest. DEBUG is not defined w Line 61  used by pcretest. DEBUG is not defined w
61  #endif  #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 96  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 115  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143    /* Table of special "verbs" like (*PRUNE) */
144    
145    typedef struct verbitem {
146      const char *name;
147      int   len;
148      int   op;
149    } verbitem;
150    
151    static verbitem verbs[] = {
152      { "ACCEPT", 6, OP_ACCEPT },
153      { "COMMIT", 6, OP_COMMIT },
154      { "F",      1, OP_FAIL },
155      { "FAIL",   4, OP_FAIL },
156      { "PRUNE",  5, OP_PRUNE },
157      { "SKIP",   4, OP_SKIP  },
158      { "THEN",   4, OP_THEN  }
159    };
160    
161    static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
165  terminated by a zero length entry. The first three must be alpha, lower, upper,  terminated by a zero length entry. The first three must be alpha, lower, upper,
166  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
# Line 155  static const int posix_class_maps[] = { Line 201  static const int posix_class_maps[] = {
201  };  };
202    
203    
204    #define STRING(a)  # a
205    #define XSTRING(s) STRING(s)
206    
207  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
208  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
209    they are documented. Always add a new error instead. Messages marked DEAD below
210    are no longer used. */
211    
212  static const char *error_texts[] = {  static const char *error_texts[] = {
213    "no error",    "no error",
# Line 171  static const char *error_texts[] = { Line 222  static const char *error_texts[] = {
222    "range out of order in character class",    "range out of order in character class",
223    "nothing to repeat",    "nothing to repeat",
224    /* 10 */    /* 10 */
225    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
226    "internal error: unexpected repeat",    "internal error: unexpected repeat",
227    "unrecognized character after (?",    "unrecognized character after (?",
228    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 181  static const char *error_texts[] = { Line 232  static const char *error_texts[] = {
232    "erroffset passed as NULL",    "erroffset passed as NULL",
233    "unknown option bit(s) set",    "unknown option bit(s) set",
234    "missing ) after comment",    "missing ) after comment",
235    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
236    /* 20 */    /* 20 */
237    "regular expression too large",    "regular expression is too large",
238    "failed to get memory",    "failed to get memory",
239    "unmatched parentheses",    "unmatched parentheses",
240    "internal error: code overflow",    "internal error: code overflow",
241    "unrecognized character after (?<",    "unrecognized character after (?<",
242    /* 25 */    /* 25 */
243    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
244    "malformed number after (?(",    "malformed number or name after (?(",
245    "conditional group contains more than two branches",    "conditional group contains more than two branches",
246    "assertion expected after (?(",    "assertion expected after (?(",
247    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
248    /* 30 */    /* 30 */
249    "unknown POSIX class name",    "unknown POSIX class name",
250    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
251    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
252    "spare error",    "spare error",  /** DEAD **/
253    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
254    /* 35 */    /* 35 */
255    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 209  static const char *error_texts[] = { Line 260  static const char *error_texts[] = {
260    /* 40 */    /* 40 */
261    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
262    "unrecognized character after (?P",    "unrecognized character after (?P",
263    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
264    "two named groups have the same name",    "two named subpatterns have the same name",
265    "invalid UTF-8 string",    "invalid UTF-8 string",
266    /* 45 */    /* 45 */
267    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
268    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
269    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
270      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
271      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
272      /* 50 */
273      "repeated subpattern is too long",    /** DEAD **/
274      "octal value is greater than \\377 (not in UTF-8 mode)",
275      "internal error: overran compiling workspace",
276      "internal error: previously-checked referenced subpattern not found",
277      "DEFINE group contains more than one branch",
278      /* 55 */
279      "repeating a DEFINE group is not allowed",
280      "inconsistent NEWLINE options",
281      "\\g is not followed by a braced name or an optionally braced non-zero number",
282      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
283      "(*VERB) with an argument is not supported",
284      /* 60 */
285      "(*VERB) not recognized",
286      "number is too big"
287  };  };
288    
289    
# Line 235  For convenience, we use the same bit def Line 303  For convenience, we use the same bit def
303    
304  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
305    
306  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
307  static const unsigned char digitab[] =  static const unsigned char digitab[] =
308    {    {
309    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 271  static const unsigned char digitab[] = Line 339  static const unsigned char digitab[] =
339    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
340    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
341    
342  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
343  static const unsigned char digitab[] =  static const unsigned char digitab[] =
344    {    {
345    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 285  static const unsigned char digitab[] = Line 353  static const unsigned char digitab[] =
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
354    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
355    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
356    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
357    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
359    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 319  static const unsigned char ebcdic_charta Line 387  static const unsigned char ebcdic_charta
387    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
388    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
389    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
390    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
391    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
392    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
393    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 346  static const unsigned char ebcdic_charta Line 414  static const unsigned char ebcdic_charta
414  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
415    
416  static BOOL  static BOOL
417    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
419    
420    
421    
# Line 357  static BOOL Line 425  static BOOL
425    
426  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
427  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
428  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
429  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431    ptr is pointing at the \. On exit, it is on the final character of the escape
432    sequence.
433    
434  Arguments:  Arguments:
435    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 370  Arguments: Line 440  Arguments:
440    
441  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
442                   negative => a special escape sequence                   negative => a special escape sequence
443                   on error, errorptr is set                   on error, errorcodeptr is set
444  */  */
445    
446  static int  static int
# Line 392  if (c == 0) *errorcodeptr = ERR1; Line 462  if (c == 0) *errorcodeptr = ERR1;
462  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
463  Otherwise further processing may be required. */  Otherwise further processing may be required. */
464    
465  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
466  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
467  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
468    
469  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
470  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
471  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
472  #endif  #endif
# Line 406  else if ((i = escapes[c - 0x48]) != 0) Line 476  else if ((i = escapes[c - 0x48]) != 0)
476  else  else
477    {    {
478    const uschar *oldptr;    const uschar *oldptr;
479      BOOL braced, negated;
480    
481    switch (c)    switch (c)
482      {      {
483      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 419  else Line 491  else
491      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
492      break;      break;
493    
494        /* \g must be followed by a number, either plain or braced. If positive, it
495        is an absolute backreference. If negative, it is a relative backreference.
496        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497        reference to a named group. This is part of Perl's movement towards a
498        unified syntax for back references. As this is synonymous with \k{name}, we
499        fudge it up by pretending it really was \k. */
500    
501        case 'g':
502        if (ptr[1] == '{')
503          {
504          const uschar *p;
505          for (p = ptr+2; *p != 0 && *p != '}'; p++)
506            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507          if (*p != 0 && *p != '}')
508            {
509            c = -ESC_k;
510            break;
511            }
512          braced = TRUE;
513          ptr++;
514          }
515        else braced = FALSE;
516    
517        if (ptr[1] == '-')
518          {
519          negated = TRUE;
520          ptr++;
521          }
522        else negated = FALSE;
523    
524        c = 0;
525        while ((digitab[ptr[1]] & ctype_digit) != 0)
526          c = c * 10 + *(++ptr) - '0';
527    
528        if (c < 0)
529          {
530          *errorcodeptr = ERR61;
531          break;
532          }
533    
534        if (c == 0 || (braced && *(++ptr) != '}'))
535          {
536          *errorcodeptr = ERR57;
537          break;
538          }
539    
540        if (negated)
541          {
542          if (c > bracount)
543            {
544            *errorcodeptr = ERR15;
545            break;
546            }
547          c = bracount - (c - 1);
548          }
549    
550        c = -(ESC_REF + c);
551        break;
552    
553      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
554      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
555      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 440  else Line 571  else
571        c -= '0';        c -= '0';
572        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
573          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
574          if (c < 0)
575            {
576            *errorcodeptr = ERR61;
577            break;
578            }
579        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
580          {          {
581          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 460  else Line 596  else
596        }        }
597    
598      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
599      larger first octal digit. */      larger first octal digit. The original code used just to take the least
600        significant 8 bits of octal numbers (I think this is what early Perls used
601        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602        than 3 octal digits. */
603    
604      case '0':      case '0':
605      c -= '0';      c -= '0';
606      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
608      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
609      break;      break;
610    
611      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 486  else Line 625  else
625          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
626          count++;          count++;
627    
628  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
629          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
630          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
632          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
633          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634  #endif  #endif
# Line 513  else Line 652  else
652        {        {
653        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
654        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
655  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
656        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
657        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
659        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
660        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661  #endif  #endif
662        }        }
663      break;      break;
664    
665      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666        This coding is ASCII-specific, but then the whole concept of \cx is
667        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668    
669      case 'c':      case 'c':
670      c = *(++ptr);      c = *(++ptr);
671      if (c == 0)      if (c == 0)
672        {        {
673        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
674        return 0;        break;
675        }        }
676    
677      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
678      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
679      c ^= 0x40;      c ^= 0x40;
680  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
681      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
682      c ^= 0xC0;      c ^= 0xC0;
683  #endif  #endif
# Line 610  if (c == '{') Line 747  if (c == '{')
747      *negptr = TRUE;      *negptr = TRUE;
748      ptr++;      ptr++;
749      }      }
750    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
751      {      {
752      c = *(++ptr);      c = *(++ptr);
753      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 763  return p; Line 900  return p;
900    
901    
902  /*************************************************  /*************************************************
903    *       Find forward referenced subpattern       *
904    *************************************************/
905    
906    /* This function scans along a pattern's text looking for capturing
907    subpatterns, and counting them. If it finds a named pattern that matches the
908    name it is given, it returns its number. Alternatively, if the name is NULL, it
909    returns when it reaches a given numbered subpattern. This is used for forward
910    references to subpatterns. We know that if (?P< is encountered, the name will
911    be terminated by '>' because that is checked in the first pass.
912    
913    Arguments:
914      ptr          current position in the pattern
915      count        current count of capturing parens so far encountered
916      name         name to seek, or NULL if seeking a numbered subpattern
917      lorn         name length, or subpattern number if name is NULL
918      xmode        TRUE if we are in /x mode
919    
920    Returns:       the number of the named subpattern, or -1 if not found
921    */
922    
923    static int
924    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925      BOOL xmode)
926    {
927    const uschar *thisname;
928    
929    for (; *ptr != 0; ptr++)
930      {
931      int term;
932    
933      /* Skip over backslashed characters and also entire \Q...\E */
934    
935      if (*ptr == '\\')
936        {
937        if (*(++ptr) == 0) return -1;
938        if (*ptr == 'Q') for (;;)
939          {
940          while (*(++ptr) != 0 && *ptr != '\\');
941          if (*ptr == 0) return -1;
942          if (*(++ptr) == 'E') break;
943          }
944        continue;
945        }
946    
947      /* Skip over character classes */
948    
949      if (*ptr == '[')
950        {
951        while (*(++ptr) != ']')
952          {
953          if (*ptr == 0) return -1;
954          if (*ptr == '\\')
955            {
956            if (*(++ptr) == 0) return -1;
957            if (*ptr == 'Q') for (;;)
958              {
959              while (*(++ptr) != 0 && *ptr != '\\');
960              if (*ptr == 0) return -1;
961              if (*(++ptr) == 'E') break;
962              }
963            continue;
964            }
965          }
966        continue;
967        }
968    
969      /* Skip comments in /x mode */
970    
971      if (xmode && *ptr == '#')
972        {
973        while (*(++ptr) != 0 && *ptr != '\n');
974        if (*ptr == 0) return -1;
975        continue;
976        }
977    
978      /* An opening parens must now be a real metacharacter */
979    
980      if (*ptr != '(') continue;
981      if (ptr[1] != '?' && ptr[1] != '*')
982        {
983        count++;
984        if (name == NULL && count == lorn) return count;
985        continue;
986        }
987    
988      ptr += 2;
989      if (*ptr == 'P') ptr++;                      /* Allow optional P */
990    
991      /* We have to disambiguate (?<! and (?<= from (?<name> */
992    
993      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994           *ptr != '\'')
995        continue;
996    
997      count++;
998    
999      if (name == NULL && count == lorn) return count;
1000      term = *ptr++;
1001      if (term == '<') term = '>';
1002      thisname = ptr;
1003      while (*ptr != term) ptr++;
1004      if (name != NULL && lorn == ptr - thisname &&
1005          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006        return count;
1007      }
1008    
1009    return -1;
1010    }
1011    
1012    
1013    
1014    /*************************************************
1015  *      Find first significant op code            *  *      Find first significant op code            *
1016  *************************************************/  *************************************************/
1017    
# Line 811  for (;;) Line 1060  for (;;)
1060    
1061      case OP_CALLOUT:      case OP_CALLOUT:
1062      case OP_CREF:      case OP_CREF:
1063      case OP_BRANUMBER:      case OP_RREF:
1064        case OP_DEF:
1065      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1066      break;      break;
1067    
# Line 856  for (;;) Line 1106  for (;;)
1106    {    {
1107    int d;    int d;
1108    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
   
1109    switch (op)    switch (op)
1110      {      {
1111        case OP_CBRA:
1112      case OP_BRA:      case OP_BRA:
1113      case OP_ONCE:      case OP_ONCE:
1114      case OP_COND:      case OP_COND:
1115      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116      if (d < 0) return d;      if (d < 0) return d;
1117      branchlength += d;      branchlength += d;
1118      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 898  for (;;) Line 1147  for (;;)
1147      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1148    
1149      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1150      case OP_CREF:      case OP_CREF:
1151        case OP_RREF:
1152        case OP_DEF:
1153      case OP_OPT:      case OP_OPT:
1154      case OP_CALLOUT:      case OP_CALLOUT:
1155      case OP_SOD:      case OP_SOD:
# Line 917  for (;;) Line 1167  for (;;)
1167    
1168      case OP_CHAR:      case OP_CHAR:
1169      case OP_CHARNC:      case OP_CHARNC:
1170        case OP_NOT:
1171      branchlength++;      branchlength++;
1172      cc += 2;      cc += 2;
1173  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 943  for (;;) Line 1194  for (;;)
1194    
1195      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1196      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1197        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1198      cc += 4;      cc += 4;
1199      break;      break;
1200    
# Line 1031  Returns:      pointer to the opcode for Line 1283  Returns:      pointer to the opcode for
1283  static const uschar *  static const uschar *
1284  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1285  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1286  for (;;)  for (;;)
1287    {    {
1288    register int c = *code;    register int c = *code;
1289    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1290    else if (c > OP_BRA)  
1291      /* XCLASS is used for classes that cannot be represented just by a bit
1292      map. This includes negated single high-valued characters. The length in
1293      the table is zero; the actual length is stored in the compiled code. */
1294    
1295      if (c == OP_XCLASS) code += GET(code, 1);
1296    
1297      /* Handle capturing bracket */
1298    
1299      else if (c == OP_CBRA)
1300      {      {
1301      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1302      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1303      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1304      }      }
1305    
1306      /* Otherwise, we can get the item's length from the table, except that for
1307      repeated character types, we have to test for \p and \P, which have an extra
1308      two bytes of parameters. */
1309    
1310    else    else
1311      {      {
1312      code += _pcre_OP_lengths[c];      switch(c)
1313          {
1314          case OP_TYPESTAR:
1315          case OP_TYPEMINSTAR:
1316          case OP_TYPEPLUS:
1317          case OP_TYPEMINPLUS:
1318          case OP_TYPEQUERY:
1319          case OP_TYPEMINQUERY:
1320          case OP_TYPEPOSSTAR:
1321          case OP_TYPEPOSPLUS:
1322          case OP_TYPEPOSQUERY:
1323          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1324          break;
1325    
1326  #ifdef SUPPORT_UTF8        case OP_TYPEUPTO:
1327          case OP_TYPEMINUPTO:
1328          case OP_TYPEEXACT:
1329          case OP_TYPEPOSUPTO:
1330          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1331          break;
1332          }
1333    
1334      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* Add in the fixed length from the table */
1335      by a multi-byte character. The length in the table is a minimum, so we have  
1336      to scan along to skip the extra bytes. All opcodes are less than 128, so we      code += _pcre_OP_lengths[c];
1337      can use relatively efficient code. */  
1338      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1339      a multi-byte character. The length in the table is a minimum, so we have to
1340      arrange to skip the extra bytes. */
1341    
1342    #ifdef SUPPORT_UTF8
1343      if (utf8) switch(c)      if (utf8) switch(c)
1344        {        {
1345        case OP_CHAR:        case OP_CHAR:
# Line 1064  for (;;) Line 1347  for (;;)
1347        case OP_EXACT:        case OP_EXACT:
1348        case OP_UPTO:        case OP_UPTO:
1349        case OP_MINUPTO:        case OP_MINUPTO:
1350          case OP_POSUPTO:
1351        case OP_STAR:        case OP_STAR:
1352        case OP_MINSTAR:        case OP_MINSTAR:
1353          case OP_POSSTAR:
1354        case OP_PLUS:        case OP_PLUS:
1355        case OP_MINPLUS:        case OP_MINPLUS:
1356          case OP_POSPLUS:
1357        case OP_QUERY:        case OP_QUERY:
1358        case OP_MINQUERY:        case OP_MINQUERY:
1359        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1360        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1361        break;        break;
1362        }        }
1363  #endif  #endif
# Line 1105  Returns:      pointer to the opcode for Line 1384  Returns:      pointer to the opcode for
1384  static const uschar *  static const uschar *
1385  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1386  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1387  for (;;)  for (;;)
1388    {    {
1389    register int c = *code;    register int c = *code;
1390    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1391    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1392    else if (c > OP_BRA)  
1393      {    /* XCLASS is used for classes that cannot be represented just by a bit
1394      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1395      }    the table is zero; the actual length is stored in the compiled code. */
1396    
1397      if (c == OP_XCLASS) code += GET(code, 1);
1398    
1399      /* Otherwise, we can get the item's length from the table, except that for
1400      repeated character types, we have to test for \p and \P, which have an extra
1401      two bytes of parameters. */
1402    
1403    else    else
1404      {      {
1405      code += _pcre_OP_lengths[c];      switch(c)
1406          {
1407          case OP_TYPESTAR:
1408          case OP_TYPEMINSTAR:
1409          case OP_TYPEPLUS:
1410          case OP_TYPEMINPLUS:
1411          case OP_TYPEQUERY:
1412          case OP_TYPEMINQUERY:
1413          case OP_TYPEPOSSTAR:
1414          case OP_TYPEPOSPLUS:
1415          case OP_TYPEPOSQUERY:
1416          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1417          break;
1418    
1419  #ifdef SUPPORT_UTF8        case OP_TYPEPOSUPTO:
1420          case OP_TYPEUPTO:
1421          case OP_TYPEMINUPTO:
1422          case OP_TYPEEXACT:
1423          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1424          break;
1425          }
1426    
1427        /* Add in the fixed length from the table */
1428    
1429        code += _pcre_OP_lengths[c];
1430    
1431      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1432      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
1433      to scan along to skip the extra bytes. All opcodes are less than 128, so we      to arrange to skip the extra bytes. */
     can use relatively efficient code. */  
1434    
1435    #ifdef SUPPORT_UTF8
1436      if (utf8) switch(c)      if (utf8) switch(c)
1437        {        {
1438        case OP_CHAR:        case OP_CHAR:
# Line 1136  for (;;) Line 1440  for (;;)
1440        case OP_EXACT:        case OP_EXACT:
1441        case OP_UPTO:        case OP_UPTO:
1442        case OP_MINUPTO:        case OP_MINUPTO:
1443          case OP_POSUPTO:
1444        case OP_STAR:        case OP_STAR:
1445        case OP_MINSTAR:        case OP_MINSTAR:
1446          case OP_POSSTAR:
1447        case OP_PLUS:        case OP_PLUS:
1448        case OP_MINPLUS:        case OP_MINPLUS:
1449          case OP_POSPLUS:
1450        case OP_QUERY:        case OP_QUERY:
1451        case OP_MINQUERY:        case OP_MINQUERY:
1452        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1453        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1454        break;        break;
1455        }        }
1456  #endif  #endif
# Line 1165  for (;;) Line 1465  for (;;)
1465  *************************************************/  *************************************************/
1466    
1467  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1468  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1469  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1470  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1471  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1472    struck an inner bracket whose current branch will already have been scanned.
1473    
1474  Arguments:  Arguments:
1475    code        points to start of search    code        points to start of search
# Line 1182  static BOOL Line 1483  static BOOL
1483  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1484  {  {
1485  register int c;  register int c;
1486  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1487       code < endcode;       code < endcode;
1488       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1489    {    {
# Line 1190  for (code = first_significant_code(code Line 1491  for (code = first_significant_code(code
1491    
1492    c = *code;    c = *code;
1493    
1494    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1495    
1496      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1497        {
1498        code += _pcre_OP_lengths[c];
1499        do code += GET(code, 1); while (*code == OP_ALT);
1500        c = *code;
1501        continue;
1502        }
1503    
1504      /* For other groups, scan the branches. */
1505    
1506      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1507      {      {
1508      BOOL empty_branch;      BOOL empty_branch;
1509      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1206  for (code = first_significant_code(code Line 1519  for (code = first_significant_code(code
1519        }        }
1520      while (*code == OP_ALT);      while (*code == OP_ALT);
1521      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1522      c = *code;      c = *code;
1523        continue;
1524      }      }
1525    
1526    else switch (c)    /* Handle the other opcodes */
1527    
1528      switch (c)
1529      {      {
1530      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1531        cannot be represented just by a bit map. This includes negated single
1532        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1533        actual length is stored in the compiled code, so we must update "code"
1534        here. */
1535    
1536  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1537      case OP_XCLASS:      case OP_XCLASS:
1538      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1539      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1540  #endif  #endif
1541    
# Line 1266  for (code = first_significant_code(code Line 1585  for (code = first_significant_code(code
1585      case OP_NOT:      case OP_NOT:
1586      case OP_PLUS:      case OP_PLUS:
1587      case OP_MINPLUS:      case OP_MINPLUS:
1588        case OP_POSPLUS:
1589      case OP_EXACT:      case OP_EXACT:
1590      case OP_NOTPLUS:      case OP_NOTPLUS:
1591      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1592        case OP_NOTPOSPLUS:
1593      case OP_NOTEXACT:      case OP_NOTEXACT:
1594      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1595      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1596        case OP_TYPEPOSPLUS:
1597      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1598      return FALSE;      return FALSE;
1599    
# Line 1283  for (code = first_significant_code(code Line 1605  for (code = first_significant_code(code
1605      case OP_ALT:      case OP_ALT:
1606      return TRUE;      return TRUE;
1607    
1608      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1609      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1610    
1611  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1612      case OP_STAR:      case OP_STAR:
1613      case OP_MINSTAR:      case OP_MINSTAR:
1614        case OP_POSSTAR:
1615      case OP_QUERY:      case OP_QUERY:
1616      case OP_MINQUERY:      case OP_MINQUERY:
1617        case OP_POSQUERY:
1618      case OP_UPTO:      case OP_UPTO:
1619      case OP_MINUPTO:      case OP_MINUPTO:
1620        case OP_POSUPTO:
1621      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1622      break;      break;
1623  #endif  #endif
# Line 1410  earlier groups that are outside the curr Line 1735  earlier groups that are outside the curr
1735  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1736  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1737  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1738  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1739  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1740    
1741    This function has been extended with the possibility of forward references for
1742    recursions and subroutine calls. It must also check the list of such references
1743    for the group we are dealing with. If it finds that one of the recursions in
1744    the current group is on this list, it adjusts the offset in the list, not the
1745    value in the reference (which is a group number).
1746    
1747  Arguments:  Arguments:
1748    group      points to the start of the group    group      points to the start of the group
1749    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1750    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1751    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1752      save_hwm   the hwm forward reference pointer at the start of the group
1753    
1754  Returns:     nothing  Returns:     nothing
1755  */  */
1756    
1757  static void  static void
1758  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1759      uschar *save_hwm)
1760  {  {
1761  uschar *ptr = group;  uschar *ptr = group;
1762  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1763    {    {
1764    int offset = GET(ptr, 1);    int offset;
1765    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1766    
1767      /* See if this recursion is on the forward reference list. If so, adjust the
1768      reference. */
1769    
1770      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1771        {
1772        offset = GET(hc, 0);
1773        if (cd->start_code + offset == ptr + 1)
1774          {
1775          PUT(hc, 0, offset + adjust);
1776          break;
1777          }
1778        }
1779    
1780      /* Otherwise, adjust the recursion offset if it's after the start of this
1781      group. */
1782    
1783      if (hc >= cd->hwm)
1784        {
1785        offset = GET(ptr, 1);
1786        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1787        }
1788    
1789    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1790    }    }
1791  }  }
# Line 1508  Yield:        TRUE when range returned; Line 1864  Yield:        TRUE when range returned;
1864  */  */
1865    
1866  static BOOL  static BOOL
1867  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1868      unsigned int *odptr)
1869  {  {
1870  int c, othercase, next;  unsigned int c, othercase, next;
1871    
1872  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1873    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1874    
1875  if (c > d) return FALSE;  if (c > d) return FALSE;
1876    
# Line 1534  return TRUE; Line 1891  return TRUE;
1891  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1892    
1893    
1894    
1895  /*************************************************  /*************************************************
1896  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1897  *************************************************/  *************************************************/
1898    
1899  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1900  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1901  bits.  sense to automatically possessify the repeated item.
1902    
1903  Arguments:  Arguments:
1904    optionsptr     pointer to the option bits    op_code       the repeated op code
1905    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1906    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1907    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1908    errorcodeptr   points to error code variable    ptr           next character in pattern
1909    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1910    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1911    
1912  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1913  */  */
1914    
1915  static BOOL  static BOOL
1916  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1917    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1918  {  {
1919  int repeat_type, op_type;  int next;
1920  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
1921  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
1922  int greedy_default, greedy_non_default;  
1923  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
1924  int zeroreqbyte, zerofirstbyte;    {
1925  int req_caseopt, reqvary, tempreqvary;    for (;;)
1926  int condcount = 0;      {
1927  int options = *optionsptr;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1928  int after_manual_callout = 0;      if (*ptr == '#')
1929  register int c;        {
1930  register uschar *code = *codeptr;        while (*(++ptr) != 0)
1931  uschar *tempcode;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1932  BOOL inescq = FALSE;        }
1933  BOOL groupsetfirstbyte = FALSE;      else break;
1934  const uschar *ptr = *ptrptr;      }
1935  const uschar *tempptr;    }
1936  uschar *previous = NULL;  
1937  uschar *previous_callout = NULL;  /* If the next item is one that we can handle, get its value. A non-negative
1938  uschar classbits[32];  value is a character, a negative value is an escape value. */
1939    
1940    if (*ptr == '\\')
1941      {
1942      int temperrorcode = 0;
1943      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1944      if (temperrorcode != 0) return FALSE;
1945      ptr++;    /* Point after the escape sequence */
1946      }
1947    
1948    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1949      {
1950  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1951  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1952  #endif  #endif
1953      next = *ptr++;
1954      }
1955    
1956  /* Set up the default and non-default settings for greediness */  else return FALSE;
1957    
1958  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
1959    
1960  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1961  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1962  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1963  find one.      {
1964        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1965        if (*ptr == '#')
1966          {
1967          while (*(++ptr) != 0)
1968            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1969          }
1970        else break;
1971        }
1972      }
1973    
1974  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1975    
1976  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1977      return FALSE;
1978    
1979  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
1980  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
1981  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
1982  case status of the value. This is used only for ASCII characters. */  utf8_char. */
1983    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
1984    
1985  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
1986    
1987  for (;; ptr++)  if (next >= 0) switch(op_code)
1988      {
1989      case OP_CHAR:
1990    #ifdef SUPPORT_UTF8
1991      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1992    #endif
1993      return item != next;
1994    
1995      /* For CHARNC (caseless character) we must check the other case. If we have
1996      Unicode property support, we can use it to test the other case of
1997      high-valued characters. */
1998    
1999      case OP_CHARNC:
2000    #ifdef SUPPORT_UTF8
2001      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2002    #endif
2003      if (item == next) return FALSE;
2004    #ifdef SUPPORT_UTF8
2005      if (utf8)
2006        {
2007        unsigned int othercase;
2008        if (next < 128) othercase = cd->fcc[next]; else
2009    #ifdef SUPPORT_UCP
2010        othercase = _pcre_ucp_othercase((unsigned int)next);
2011    #else
2012        othercase = NOTACHAR;
2013    #endif
2014        return (unsigned int)item != othercase;
2015        }
2016      else
2017    #endif  /* SUPPORT_UTF8 */
2018      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2019    
2020      /* For OP_NOT, "item" must be a single-byte character. */
2021    
2022      case OP_NOT:
2023      if (next < 0) return FALSE;  /* Not a character */
2024      if (item == next) return TRUE;
2025      if ((options & PCRE_CASELESS) == 0) return FALSE;
2026    #ifdef SUPPORT_UTF8
2027      if (utf8)
2028        {
2029        unsigned int othercase;
2030        if (next < 128) othercase = cd->fcc[next]; else
2031    #ifdef SUPPORT_UCP
2032        othercase = _pcre_ucp_othercase(next);
2033    #else
2034        othercase = NOTACHAR;
2035    #endif
2036        return (unsigned int)item == othercase;
2037        }
2038      else
2039    #endif  /* SUPPORT_UTF8 */
2040      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2041    
2042      case OP_DIGIT:
2043      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2044    
2045      case OP_NOT_DIGIT:
2046      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2047    
2048      case OP_WHITESPACE:
2049      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2050    
2051      case OP_NOT_WHITESPACE:
2052      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2053    
2054      case OP_WORDCHAR:
2055      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2056    
2057      case OP_NOT_WORDCHAR:
2058      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2059    
2060      case OP_HSPACE:
2061      case OP_NOT_HSPACE:
2062      switch(next)
2063        {
2064        case 0x09:
2065        case 0x20:
2066        case 0xa0:
2067        case 0x1680:
2068        case 0x180e:
2069        case 0x2000:
2070        case 0x2001:
2071        case 0x2002:
2072        case 0x2003:
2073        case 0x2004:
2074        case 0x2005:
2075        case 0x2006:
2076        case 0x2007:
2077        case 0x2008:
2078        case 0x2009:
2079        case 0x200A:
2080        case 0x202f:
2081        case 0x205f:
2082        case 0x3000:
2083        return op_code != OP_HSPACE;
2084        default:
2085        return op_code == OP_HSPACE;
2086        }
2087    
2088      case OP_VSPACE:
2089      case OP_NOT_VSPACE:
2090      switch(next)
2091        {
2092        case 0x0a:
2093        case 0x0b:
2094        case 0x0c:
2095        case 0x0d:
2096        case 0x85:
2097        case 0x2028:
2098        case 0x2029:
2099        return op_code != OP_VSPACE;
2100        default:
2101        return op_code == OP_VSPACE;
2102        }
2103    
2104      default:
2105      return FALSE;
2106      }
2107    
2108    
2109    /* Handle the case when the next item is \d, \s, etc. */
2110    
2111    switch(op_code)
2112      {
2113      case OP_CHAR:
2114      case OP_CHARNC:
2115    #ifdef SUPPORT_UTF8
2116      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2117    #endif
2118      switch(-next)
2119        {
2120        case ESC_d:
2121        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2122    
2123        case ESC_D:
2124        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2125    
2126        case ESC_s:
2127        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2128    
2129        case ESC_S:
2130        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2131    
2132        case ESC_w:
2133        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2134    
2135        case ESC_W:
2136        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2137    
2138        case ESC_h:
2139        case ESC_H:
2140        switch(item)
2141          {
2142          case 0x09:
2143          case 0x20:
2144          case 0xa0:
2145          case 0x1680:
2146          case 0x180e:
2147          case 0x2000:
2148          case 0x2001:
2149          case 0x2002:
2150          case 0x2003:
2151          case 0x2004:
2152          case 0x2005:
2153          case 0x2006:
2154          case 0x2007:
2155          case 0x2008:
2156          case 0x2009:
2157          case 0x200A:
2158          case 0x202f:
2159          case 0x205f:
2160          case 0x3000:
2161          return -next != ESC_h;
2162          default:
2163          return -next == ESC_h;
2164          }
2165    
2166        case ESC_v:
2167        case ESC_V:
2168        switch(item)
2169          {
2170          case 0x0a:
2171          case 0x0b:
2172          case 0x0c:
2173          case 0x0d:
2174          case 0x85:
2175          case 0x2028:
2176          case 0x2029:
2177          return -next != ESC_v;
2178          default:
2179          return -next == ESC_v;
2180          }
2181    
2182        default:
2183        return FALSE;
2184        }
2185    
2186      case OP_DIGIT:
2187      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2188             next == -ESC_h || next == -ESC_v;
2189    
2190      case OP_NOT_DIGIT:
2191      return next == -ESC_d;
2192    
2193      case OP_WHITESPACE:
2194      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2195    
2196      case OP_NOT_WHITESPACE:
2197      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2198    
2199      case OP_HSPACE:
2200      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2201    
2202      case OP_NOT_HSPACE:
2203      return next == -ESC_h;
2204    
2205      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2206      case OP_VSPACE:
2207      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2208    
2209      case OP_NOT_VSPACE:
2210      return next == -ESC_v;
2211    
2212      case OP_WORDCHAR:
2213      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2214    
2215      case OP_NOT_WORDCHAR:
2216      return next == -ESC_w || next == -ESC_d;
2217    
2218      default:
2219      return FALSE;
2220      }
2221    
2222    /* Control does not reach here */
2223    }
2224    
2225    
2226    
2227    /*************************************************
2228    *           Compile one branch                   *
2229    *************************************************/
2230    
2231    /* Scan the pattern, compiling it into a vector. If the options are
2232    changed during the branch, the pointer is used to change the external options
2233    bits. This function is used during the pre-compile phase when we are trying
2234    to find out the amount of memory needed, as well as during the real compile
2235    phase. The value of lengthptr distinguishes the two phases.
2236    
2237    Arguments:
2238      optionsptr     pointer to the option bits
2239      codeptr        points to the pointer to the current code point
2240      ptrptr         points to the current pattern pointer
2241      errorcodeptr   points to error code variable
2242      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2243      reqbyteptr     set to the last literal character required, else < 0
2244      bcptr          points to current branch chain
2245      cd             contains pointers to tables etc.
2246      lengthptr      NULL during the real compile phase
2247                     points to length accumulator during pre-compile phase
2248    
2249    Returns:         TRUE on success
2250                     FALSE, with *errorcodeptr set non-zero on error
2251    */
2252    
2253    static BOOL
2254    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2255      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2256      compile_data *cd, int *lengthptr)
2257    {
2258    int repeat_type, op_type;
2259    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2260    int bravalue = 0;
2261    int greedy_default, greedy_non_default;
2262    int firstbyte, reqbyte;
2263    int zeroreqbyte, zerofirstbyte;
2264    int req_caseopt, reqvary, tempreqvary;
2265    int options = *optionsptr;
2266    int after_manual_callout = 0;
2267    int length_prevgroup = 0;
2268    register int c;
2269    register uschar *code = *codeptr;
2270    uschar *last_code = code;
2271    uschar *orig_code = code;
2272    uschar *tempcode;
2273    BOOL inescq = FALSE;
2274    BOOL groupsetfirstbyte = FALSE;
2275    const uschar *ptr = *ptrptr;
2276    const uschar *tempptr;
2277    uschar *previous = NULL;
2278    uschar *previous_callout = NULL;
2279    uschar *save_hwm = NULL;
2280    uschar classbits[32];
2281    
2282    #ifdef SUPPORT_UTF8
2283    BOOL class_utf8;
2284    BOOL utf8 = (options & PCRE_UTF8) != 0;
2285    uschar *class_utf8data;
2286    uschar utf8_char[6];
2287    #else
2288    BOOL utf8 = FALSE;
2289    uschar *utf8_char = NULL;
2290    #endif
2291    
2292    #ifdef DEBUG
2293    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2294    #endif
2295    
2296    /* Set up the default and non-default settings for greediness */
2297    
2298    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2299    greedy_non_default = greedy_default ^ 1;
2300    
2301    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2302    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2303    matches a non-fixed char first char; reqbyte just remains unset if we never
2304    find one.
2305    
2306    When we hit a repeat whose minimum is zero, we may have to adjust these values
2307    to take the zero repeat into account. This is implemented by setting them to
2308    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2309    item types that can be repeated set these backoff variables appropriately. */
2310    
2311    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2312    
2313    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2314    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2315    value > 255. It is added into the firstbyte or reqbyte variables to record the
2316    case status of the value. This is used only for ASCII characters. */
2317    
2318    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2319    
2320    /* Switch on next character until the end of the branch */
2321    
2322    for (;; ptr++)
2323    {    {
2324    BOOL negate_class;    BOOL negate_class;
2325    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2326    BOOL is_quantifier;    BOOL is_quantifier;
2327      BOOL is_recurse;
2328      BOOL reset_bracount;
2329    int class_charcount;    int class_charcount;
2330    int class_lastchar;    int class_lastchar;
2331    int newoptions;    int newoptions;
2332    int recno;    int recno;
2333      int refsign;
2334    int skipbytes;    int skipbytes;
2335    int subreqbyte;    int subreqbyte;
2336    int subfirstbyte;    int subfirstbyte;
2337      int terminator;
2338    int mclength;    int mclength;
2339    uschar mcbuffer[8];    uschar mcbuffer[8];
2340    
2341    /* Next byte in the pattern */    /* Get next byte in the pattern */
2342    
2343    c = *ptr;    c = *ptr;
2344    
2345      /* If we are in the pre-compile phase, accumulate the length used for the
2346      previous cycle of this loop. */
2347    
2348      if (lengthptr != NULL)
2349        {
2350    #ifdef DEBUG
2351        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2352    #endif
2353        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2354          {
2355          *errorcodeptr = ERR52;
2356          goto FAILED;
2357          }
2358    
2359        /* There is at least one situation where code goes backwards: this is the
2360        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2361        the class is simply eliminated. However, it is created first, so we have to
2362        allow memory for it. Therefore, don't ever reduce the length at this point.
2363        */
2364    
2365        if (code < last_code) code = last_code;
2366    
2367        /* Paranoid check for integer overflow */
2368    
2369        if (OFLOW_MAX - *lengthptr < code - last_code)
2370          {
2371          *errorcodeptr = ERR20;
2372          goto FAILED;
2373          }
2374    
2375        *lengthptr += code - last_code;
2376        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2377    
2378        /* If "previous" is set and it is not at the start of the work space, move
2379        it back to there, in order to avoid filling up the work space. Otherwise,
2380        if "previous" is NULL, reset the current code pointer to the start. */
2381    
2382        if (previous != NULL)
2383          {
2384          if (previous > orig_code)
2385            {
2386            memmove(orig_code, previous, code - previous);
2387            code -= previous - orig_code;
2388            previous = orig_code;
2389            }
2390          }
2391        else code = orig_code;
2392    
2393        /* Remember where this code item starts so we can pick up the length
2394        next time round. */
2395    
2396        last_code = code;
2397        }
2398    
2399      /* In the real compile phase, just check the workspace used by the forward
2400      reference list. */
2401    
2402      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2403        {
2404        *errorcodeptr = ERR52;
2405        goto FAILED;
2406        }
2407    
2408    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2409    
2410    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1651  for (;; ptr++) Line 2419  for (;; ptr++)
2419        {        {
2420        if (previous_callout != NULL)        if (previous_callout != NULL)
2421          {          {
2422          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2423              complete_callout(previous_callout, ptr, cd);
2424          previous_callout = NULL;          previous_callout = NULL;
2425          }          }
2426        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1672  for (;; ptr++) Line 2441  for (;; ptr++)
2441    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2442         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2443      {      {
2444      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2445          complete_callout(previous_callout, ptr, cd);
2446      previous_callout = NULL;      previous_callout = NULL;
2447      }      }
2448    
# Line 1683  for (;; ptr++) Line 2453  for (;; ptr++)
2453      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2454      if (c == '#')      if (c == '#')
2455        {        {
2456        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2457        on the Macintosh. */          {
2458        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2459        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2460          if (*ptr != 0) continue;
2461    
2462          /* Else fall through to handle end of string */
2463          c = 0;
2464        }        }
2465      }      }
2466    
# Line 1700  for (;; ptr++) Line 2474  for (;; ptr++)
2474    
2475    switch(c)    switch(c)
2476      {      {
2477      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2478        case 0:                        /* The branch terminates at string end */
2479      case 0:      case '|':                      /* or | or ) */
     case '|':  
2480      case ')':      case ')':
2481      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2482      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2483      *codeptr = code;      *codeptr = code;
2484      *ptrptr = ptr;      *ptrptr = ptr;
2485        if (lengthptr != NULL)
2486          {
2487          if (OFLOW_MAX - *lengthptr < code - last_code)
2488            {
2489            *errorcodeptr = ERR20;
2490            goto FAILED;
2491            }
2492          *lengthptr += code - last_code;   /* To include callout length */
2493          DPRINTF((">> end branch\n"));
2494          }
2495      return TRUE;      return TRUE;
2496    
2497    
2498        /* ===================================================================*/
2499      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2500      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2501    
# Line 1739  for (;; ptr++) Line 2524  for (;; ptr++)
2524      *code++ = OP_ANY;      *code++ = OP_ANY;
2525      break;      break;
2526    
2527    
2528        /* ===================================================================*/
2529      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2530      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2531      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1764  for (;; ptr++) Line 2551  for (;; ptr++)
2551        goto FAILED;        goto FAILED;
2552        }        }
2553    
2554      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2555        if the first few characters (either before or after ^) are \Q\E or \E we
2556        skip them too. This makes for compatibility with Perl. */
2557    
2558      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2559        for (;;)
2560        {        {
       negate_class = TRUE;  
2561        c = *(++ptr);        c = *(++ptr);
2562        }        if (c == '\\')
2563      else          {
2564        {          if (ptr[1] == 'E') ptr++;
2565        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2566                else break;
2567            }
2568          else if (!negate_class && c == '^')
2569            negate_class = TRUE;
2570          else break;
2571        }        }
2572    
2573      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2574      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2575      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2576    
2577      class_charcount = 0;      class_charcount = 0;
2578      class_lastchar = -1;      class_lastchar = -1;
2579    
2580        /* Initialize the 32-char bit map to all zeros. We build the map in a
2581        temporary bit of memory, in case the class contains only 1 character (less
2582        than 256), because in that case the compiled code doesn't use the bit map.
2583        */
2584    
2585        memset(classbits, 0, 32 * sizeof(uschar));
2586    
2587  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2588      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2589      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2590  #endif  #endif
2591    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2592      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2593      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2594      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2595    
2596      do      if (c != 0) do
2597        {        {
2598          const uschar *oldptr;
2599    
2600  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2601        if (utf8 && c > 127)        if (utf8 && c > 127)
2602          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1814  for (;; ptr++) Line 2608  for (;; ptr++)
2608    
2609        if (inescq)        if (inescq)
2610          {          {
2611          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2612            {            {
2613            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2614            ptr++;            ptr++;                            /* Skip the 'E' */
2615            continue;            continue;                         /* Carry on with next */
2616            }            }
2617          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2618          }          }
2619    
2620        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1911  for (;; ptr++) Line 2705  for (;; ptr++)
2705          }          }
2706    
2707        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2708        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2709        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2710        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2711        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2712        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2713    
2714        if (c == '\\')        if (c == '\\')
2715          {          {
2716          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2717            if (*errorcodeptr != 0) goto FAILED;
2718    
2719          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2720          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2721            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2722          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2723            {            {
2724            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1933  for (;; ptr++) Line 2728  for (;; ptr++)
2728            else inescq = TRUE;            else inescq = TRUE;
2729            continue;            continue;
2730            }            }
2731            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2732    
2733          if (c < 0)          if (c < 0)
2734            {            {
2735            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2736            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2737            switch (-c)  
2738              /* Save time by not doing this in the pre-compile phase. */
2739    
2740              if (lengthptr == NULL) switch (-c)
2741              {              {
2742              case ESC_d:              case ESC_d:
2743              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1966  for (;; ptr++) Line 2765  for (;; ptr++)
2765              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2766              continue;              continue;
2767    
2768  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
2769              case ESC_p:              continue;
2770              case ESC_P:  
2771                default:    /* Not recognized; fall through */
2772                break;      /* Need "default" setting to stop compiler warning. */
2773                }
2774    
2775              /* In the pre-compile phase, just do the recognition. */
2776    
2777              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2778                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2779    
2780              /* We need to deal with \H, \h, \V, and \v in both phases because
2781              they use extra memory. */
2782    
2783              if (-c == ESC_h)
2784                {
2785                SETBIT(classbits, 0x09); /* HT */
2786                SETBIT(classbits, 0x20); /* SPACE */
2787                SETBIT(classbits, 0xa0); /* NBSP */
2788    #ifdef SUPPORT_UTF8
2789                if (utf8)
2790                {                {
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
2791                class_utf8 = TRUE;                class_utf8 = TRUE;
2792                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2793                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2794                *class_utf8data++ = ptype;                *class_utf8data++ = XCL_SINGLE;
2795                *class_utf8data++ = pdata;                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2796                class_charcount -= 2;   /* Not a < 256 character */                *class_utf8data++ = XCL_RANGE;
2797                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2798                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2799                  *class_utf8data++ = XCL_SINGLE;
2800                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2801                  *class_utf8data++ = XCL_SINGLE;
2802                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2803                  *class_utf8data++ = XCL_SINGLE;
2804                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2805                }                }
             continue;  
2806  #endif  #endif
2807                continue;
2808                }
2809    
2810              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
2811              strict mode. By default, for compatibility with Perl, they are              {
2812              treated as literals. */              for (c = 0; c < 32; c++)
2813                  {
2814                  int x = 0xff;
2815                  switch (c)
2816                    {
2817                    case 0x09/8: x ^= 1 << (0x09%8); break;
2818                    case 0x20/8: x ^= 1 << (0x20%8); break;
2819                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2820                    default: break;
2821                    }
2822                  classbits[c] |= x;
2823                  }
2824    
2825              default:  #ifdef SUPPORT_UTF8
2826              if ((options & PCRE_EXTRA) != 0)              if (utf8)
2827                {                {
2828                *errorcodeptr = ERR7;                class_utf8 = TRUE;
2829                goto FAILED;                *class_utf8data++ = XCL_RANGE;
2830                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2831                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2832                  *class_utf8data++ = XCL_RANGE;
2833                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2834                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2835                  *class_utf8data++ = XCL_RANGE;
2836                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2837                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2838                  *class_utf8data++ = XCL_RANGE;
2839                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2840                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2841                  *class_utf8data++ = XCL_RANGE;
2842                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2843                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2844                  *class_utf8data++ = XCL_RANGE;
2845                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2846                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2847                  *class_utf8data++ = XCL_RANGE;
2848                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2849                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2850                }                }
2851              c = *ptr;              /* The final character */  #endif
2852              class_charcount -= 2;  /* Undo the default count from above */              continue;
2853              }              }
2854            }  
2855              if (-c == ESC_v)
2856          /* Fall through if we have a single character (c >= 0). This may be              {
2857          > 256 in UTF-8 mode. */              SETBIT(classbits, 0x0a); /* LF */
2858                SETBIT(classbits, 0x0b); /* VT */
2859          }   /* End of backslash handling */              SETBIT(classbits, 0x0c); /* FF */
2860                SETBIT(classbits, 0x0d); /* CR */
2861        /* A single character may be followed by '-' to form a range. However,              SETBIT(classbits, 0x85); /* NEL */
2862        Perl does not permit ']' to be the end of the range. A '-' character  #ifdef SUPPORT_UTF8
2863        here is treated as a literal. */              if (utf8)
2864                  {
2865        if (ptr[1] == '-' && ptr[2] != ']')                class_utf8 = TRUE;
2866          {                *class_utf8data++ = XCL_RANGE;
2867          int d;                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2868          ptr += 2;                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2869                  }
2870    #endif
2871                continue;
2872                }
2873    
2874              if (-c == ESC_V)
2875                {
2876                for (c = 0; c < 32; c++)
2877                  {
2878                  int x = 0xff;
2879                  switch (c)
2880                    {
2881                    case 0x0a/8: x ^= 1 << (0x0a%8);
2882                                 x ^= 1 << (0x0b%8);
2883                                 x ^= 1 << (0x0c%8);
2884                                 x ^= 1 << (0x0d%8);
2885                                 break;
2886                    case 0x85/8: x ^= 1 << (0x85%8); break;
2887                    default: break;
2888                    }
2889                  classbits[c] |= x;
2890                  }
2891    
2892    #ifdef SUPPORT_UTF8
2893                if (utf8)
2894                  {
2895                  class_utf8 = TRUE;
2896                  *class_utf8data++ = XCL_RANGE;
2897                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2898                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2899                  *class_utf8data++ = XCL_RANGE;
2900                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2901                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2902                  }
2903    #endif
2904                continue;
2905                }
2906    
2907              /* We need to deal with \P and \p in both phases. */
2908    
2909    #ifdef SUPPORT_UCP
2910              if (-c == ESC_p || -c == ESC_P)
2911                {
2912                BOOL negated;
2913                int pdata;
2914                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2915                if (ptype < 0) goto FAILED;
2916                class_utf8 = TRUE;
2917                *class_utf8data++ = ((-c == ESC_p) != negated)?
2918                  XCL_PROP : XCL_NOTPROP;
2919                *class_utf8data++ = ptype;
2920                *class_utf8data++ = pdata;
2921                class_charcount -= 2;   /* Not a < 256 character */
2922                continue;
2923                }
2924    #endif
2925              /* Unrecognized escapes are faulted if PCRE is running in its
2926              strict mode. By default, for compatibility with Perl, they are
2927              treated as literals. */
2928    
2929              if ((options & PCRE_EXTRA) != 0)
2930                {
2931                *errorcodeptr = ERR7;
2932                goto FAILED;
2933                }
2934    
2935              class_charcount -= 2;  /* Undo the default count from above */
2936              c = *ptr;              /* Get the final character and fall through */
2937              }
2938    
2939            /* Fall through if we have a single character (c >= 0). This may be
2940            greater than 256 in UTF-8 mode. */
2941    
2942            }   /* End of backslash handling */
2943    
2944          /* A single character may be followed by '-' to form a range. However,
2945          Perl does not permit ']' to be the end of the range. A '-' character
2946          at the end is treated as a literal. Perl ignores orphaned \E sequences
2947          entirely. The code for handling \Q and \E is messy. */
2948    
2949          CHECK_RANGE:
2950          while (ptr[1] == '\\' && ptr[2] == 'E')
2951            {
2952            inescq = FALSE;
2953            ptr += 2;
2954            }
2955    
2956          oldptr = ptr;
2957    
2958          if (!inescq && ptr[1] == '-')
2959            {
2960            int d;
2961            ptr += 2;
2962            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2963    
2964            /* If we hit \Q (not followed by \E) at this point, go into escaped
2965            mode. */
2966    
2967            while (*ptr == '\\' && ptr[1] == 'Q')
2968              {
2969              ptr += 2;
2970              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2971              inescq = TRUE;
2972              break;
2973              }
2974    
2975            if (*ptr == 0 || (!inescq && *ptr == ']'))
2976              {
2977              ptr = oldptr;
2978              goto LONE_SINGLE_CHARACTER;
2979              }
2980    
2981  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2982          if (utf8)          if (utf8)
# Line 2026  for (;; ptr++) Line 2991  for (;; ptr++)
2991          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2992          in such circumstances. */          in such circumstances. */
2993    
2994          if (d == '\\')          if (!inescq && d == '\\')
2995            {            {
2996            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2997            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2998    
2999            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backslash; \X is literal X; \R is literal R; any other
3000            was literal */            special means the '-' was literal */
3001    
3002            if (d < 0)            if (d < 0)
3003              {              {
3004              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
3005              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
3006                else if (d == -ESC_R) d = 'R'; else
3007                {                {
3008                ptr = oldptr - 2;                ptr = oldptr;
3009                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3010                }                }
3011              }              }
3012            }            }
3013    
3014          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
3015          the pre-pass. Optimize one-character ranges */          one-character ranges */
3016    
3017            if (d < c)
3018              {
3019              *errorcodeptr = ERR8;
3020              goto FAILED;
3021              }
3022    
3023          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3024    
# Line 2067  for (;; ptr++) Line 3039  for (;; ptr++)
3039  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3040            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
3041              {              {
3042              int occ, ocd;              unsigned int occ, ocd;
3043              int cc = c;              unsigned int cc = c;
3044              int origd = d;              unsigned int origd = d;
3045              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
3046                {                {
3047                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
3048                      ocd <= (unsigned int)d)
3049                    continue;                          /* Skip embedded ranges */
3050    
3051                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
3052                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3053                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
3054                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
3055                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
3056                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
3057                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3058                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3059                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3060                  d = ocd;                  d = ocd;
3061                  continue;                  continue;
# Line 2127  for (;; ptr++) Line 3103  for (;; ptr++)
3103          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3104          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3105    
3106          for (; c <= d; c++)          class_charcount += d - c + 1;
3107            class_lastchar = d;
3108    
3109            /* We can save a bit of time by skipping this in the pre-compile. */
3110    
3111            if (lengthptr == NULL) for (; c <= d; c++)
3112            {            {
3113            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3114            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2135  for (;; ptr++) Line 3116  for (;; ptr++)
3116              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3117              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3118              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3119            }            }
3120    
3121          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2160  for (;; ptr++) Line 3139  for (;; ptr++)
3139  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3140          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3141            {            {
3142            int othercase;            unsigned int othercase;
3143            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3144              {              {
3145              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3146              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2186  for (;; ptr++) Line 3165  for (;; ptr++)
3165          }          }
3166        }        }
3167    
3168      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3169      loop. This "while" is the end of the "do" above. */  
3170        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3171    
3172      while ((c = *(++ptr)) != ']' || inescq);      if (c == 0)                          /* Missing terminating ']' */
3173          {
3174          *errorcodeptr = ERR6;
3175          goto FAILED;
3176          }
3177    
3178      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3179      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
3180      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
3181      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
3182      single-bytes only. This is an historical hangover. Maybe one day we can  
3183      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
3184        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3185        operate on single-bytes only. This is an historical hangover. Maybe one day
3186        we can tidy these opcodes to handle multi-byte characters.
3187    
3188      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
3189      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
# Line 2206  for (;; ptr++) Line 3193  for (;; ptr++)
3193      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3194    
3195  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3196      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
3197            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
3198  #else  #else
3199      if (class_charcount == 1)      if (class_charcount == 1)
3200  #endif  #endif
# Line 2253  for (;; ptr++) Line 3238  for (;; ptr++)
3238    
3239      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3240      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3241      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3242    
3243  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3244      if (class_utf8)      if (class_utf8)
# Line 2263  for (;; ptr++) Line 3248  for (;; ptr++)
3248        code += LINK_SIZE;        code += LINK_SIZE;
3249        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3250    
3251        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3252        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3253    
3254        if (class_charcount > 0)        if (class_charcount > 0)
3255          {          {
3256          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3257            memmove(code + 32, code, class_utf8data - code);
3258          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3259          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3260          }          }
3261          else code = class_utf8data;
3262    
3263        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3264    
# Line 2297  for (;; ptr++) Line 3275  for (;; ptr++)
3275      if (negate_class)      if (negate_class)
3276        {        {
3277        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3278        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3279            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3280        }        }
3281      else      else
3282        {        {
# Line 2307  for (;; ptr++) Line 3286  for (;; ptr++)
3286      code += 32;      code += 32;
3287      break;      break;
3288    
3289    
3290        /* ===================================================================*/
3291      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3292      has been tested above. */      has been tested above. */
3293    
# Line 2374  for (;; ptr++) Line 3355  for (;; ptr++)
3355        }        }
3356      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3357    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3358      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3359      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3360      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2421  for (;; ptr++) Line 3388  for (;; ptr++)
3388          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3389          }          }
3390    
3391          /* If the repetition is unlimited, it pays to see if the next thing on
3392          the line is something that cannot possibly match this character. If so,
3393          automatically possessifying this item gains some performance in the case
3394          where the match fails. */
3395    
3396          if (!possessive_quantifier &&
3397              repeat_max < 0 &&
3398              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3399                options, cd))
3400            {
3401            repeat_type = 0;    /* Force greedy */
3402            possessive_quantifier = TRUE;
3403            }
3404    
3405        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3406        }        }
3407    
3408      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3409      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3410      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3411      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3412        currently used only for single-byte chars. */
3413    
3414      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3415        {        {
3416        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3417        c = previous[1];        c = previous[1];
3418          if (!possessive_quantifier &&
3419              repeat_max < 0 &&
3420              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3421            {
3422            repeat_type = 0;    /* Force greedy */
3423            possessive_quantifier = TRUE;
3424            }
3425        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3426        }        }
3427    
# Line 2450  for (;; ptr++) Line 3439  for (;; ptr++)
3439        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3440        c = *previous;        c = *previous;
3441    
3442          if (!possessive_quantifier &&
3443              repeat_max < 0 &&
3444              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3445            {
3446            repeat_type = 0;    /* Force greedy */
3447            possessive_quantifier = TRUE;
3448            }
3449    
3450        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3451        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3452          {          {
# Line 2490  for (;; ptr++) Line 3487  for (;; ptr++)
3487          }          }
3488    
3489        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3490        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3491        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3492        one less than the maximum. */        one less than the maximum. */
3493    
# Line 2543  for (;; ptr++) Line 3540  for (;; ptr++)
3540            }            }
3541    
3542          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3543          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3544            UPTO is just for 1 instance, we can use QUERY instead. */
3545    
3546          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3547            {            {
# Line 2562  for (;; ptr++) Line 3560  for (;; ptr++)
3560              *code++ = prop_value;              *code++ = prop_value;
3561              }              }
3562            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3563            *code++ = OP_UPTO + repeat_type;  
3564            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3565                {
3566                *code++ = OP_QUERY + repeat_type;
3567                }
3568              else
3569                {
3570                *code++ = OP_UPTO + repeat_type;
3571                PUT2INC(code, 0, repeat_max);
3572                }
3573            }            }
3574          }          }
3575    
# Line 2630  for (;; ptr++) Line 3636  for (;; ptr++)
3636      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3637      cases. */      cases. */
3638    
3639      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3640               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3641        {        {
3642        register int i;        register int i;
3643        int ketoffset = 0;        int ketoffset = 0;
3644        int len = code - previous;        int len = code - previous;
3645        uschar *bralink = NULL;        uschar *bralink = NULL;
3646    
3647          /* Repeating a DEFINE group is pointless */
3648    
3649          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3650            {
3651            *errorcodeptr = ERR55;
3652            goto FAILED;
3653            }
3654    
3655        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3656        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3657        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2672  for (;; ptr++) Line 3686  for (;; ptr++)
3686          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3687          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3688          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3689          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3690          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3691            doing this. */
3692    
3693          if (repeat_max <= 1)          if (repeat_max <= 1)
3694            {            {
3695            *code = OP_END;            *code = OP_END;
3696            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3697            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3698            code++;            code++;
3699            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2696  for (;; ptr++) Line 3711  for (;; ptr++)
3711            {            {
3712            int offset;            int offset;
3713            *code = OP_END;            *code = OP_END;
3714            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3715            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3716            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3717            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2716  for (;; ptr++) Line 3731  for (;; ptr++)
3731        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3732        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3733        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3734        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3735          forward reference subroutine calls in the group, there will be entries on
3736          the workspace list; replicate these with an appropriate increment. */
3737    
3738        else        else
3739          {          {
3740          if (repeat_min > 1)          if (repeat_min > 1)
3741            {            {
3742            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3743            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3744              potential integer overflow. */
3745    
3746              if (lengthptr != NULL)
3747                {
3748                int delta = (repeat_min - 1)*length_prevgroup;
3749                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3750                                                                (double)INT_MAX ||
3751                    OFLOW_MAX - *lengthptr < delta)
3752                  {
3753                  *errorcodeptr = ERR20;
3754                  goto FAILED;
3755                  }
3756                *lengthptr += delta;
3757                }
3758    
3759              /* This is compiling for real */
3760    
3761              else
3762              {              {
3763              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3764              code += len;              for (i = 1; i < repeat_min; i++)
3765                  {
3766                  uschar *hc;
3767                  uschar *this_hwm = cd->hwm;
3768                  memcpy(code, previous, len);
3769                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3770                    {
3771                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3772                    cd->hwm += LINK_SIZE;
3773                    }
3774                  save_hwm = this_hwm;
3775                  code += len;
3776                  }
3777              }              }
3778            }            }
3779    
3780          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3781          }          }
3782    
# Line 2736  for (;; ptr++) Line 3784  for (;; ptr++)
3784        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3785        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3786        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3787        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3788          replicate entries on the forward reference list. */
3789    
3790        if (repeat_max >= 0)        if (repeat_max >= 0)
3791          {          {
3792          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3793            just adjust the length as if we had. For each repetition we must add 1
3794            to the length for BRAZERO and for all but the last repetition we must
3795            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3796            paranoid checks to avoid integer overflow. */
3797    
3798            if (lengthptr != NULL && repeat_max > 0)
3799              {
3800              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3801                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3802              if ((double)repeat_max *
3803                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3804                      > (double)INT_MAX ||
3805                  OFLOW_MAX - *lengthptr < delta)
3806                {
3807                *errorcodeptr = ERR20;
3808                goto FAILED;
3809                }
3810              *lengthptr += delta;
3811              }
3812    
3813            /* This is compiling for real */
3814    
3815            else for (i = repeat_max - 1; i >= 0; i--)
3816            {            {
3817              uschar *hc;
3818              uschar *this_hwm = cd->hwm;
3819    
3820            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3821    
3822            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2757  for (;; ptr++) Line 3832  for (;; ptr++)
3832              }              }
3833    
3834            memcpy(code, previous, len);            memcpy(code, previous, len);
3835              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3836                {
3837                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3838                cd->hwm += LINK_SIZE;
3839                }
3840              save_hwm = this_hwm;
3841            code += len;            code += len;
3842            }            }
3843    
# Line 2779  for (;; ptr++) Line 3860  for (;; ptr++)
3860        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3861        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3862        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3863        correct offset was computed above. */        correct offset was computed above.
3864    
3865          Then, when we are doing the actual compile phase, check to see whether
3866          this group is a non-atomic one that could match an empty string. If so,
3867          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3868          that runtime checking can be done. [This check is also applied to
3869          atomic groups at runtime, but in a different way.] */
3870    
3871        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3872            {
3873            uschar *ketcode = code - ketoffset;
3874            uschar *bracode = ketcode - GET(ketcode, 1);
3875            *ketcode = OP_KETRMAX + repeat_type;
3876            if (lengthptr == NULL && *bracode != OP_ONCE)
3877              {
3878              uschar *scode = bracode;
3879              do
3880                {
3881                if (could_be_empty_branch(scode, ketcode, utf8))
3882                  {
3883                  *bracode += OP_SBRA - OP_BRA;
3884                  break;
3885                  }
3886                scode += GET(scode, 1);
3887                }
3888              while (*scode == OP_ALT);
3889              }
3890            }
3891        }        }
3892    
3893      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2792  for (;; ptr++) Line 3898  for (;; ptr++)
3898        goto FAILED;        goto FAILED;
3899        }        }
3900    
3901      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3902      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3903      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3904      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3905      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3906        but the special opcodes can optimize it a bit. The repeated item starts at
3907        tempcode, not at previous, which might be the first part of a string whose
3908        (former) last char we repeated.
3909    
3910        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3911        an 'upto' may follow. We skip over an 'exact' item, and then test the
3912        length of what remains before proceeding. */
3913    
3914      if (possessive_quantifier)      if (possessive_quantifier)
3915        {        {
3916        int len = code - tempcode;        int len;
3917        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3918        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3919        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3920        tempcode[0] = OP_ONCE;        len = code - tempcode;
3921        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3922        PUTINC(code, 0, len);          {
3923        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3924            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3925            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3926            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3927    
3928            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3929            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3930            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3931            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3932    
3933            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3934            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3935            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3936            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3937    
3938            default:
3939            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3940            code += 1 + LINK_SIZE;
3941            len += 1 + LINK_SIZE;
3942            tempcode[0] = OP_ONCE;
3943            *code++ = OP_KET;
3944            PUTINC(code, 0, len);
3945            PUT(tempcode, 1, len);
3946            break;
3947            }
3948        }        }
3949    
3950      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2820  for (;; ptr++) Line 3957  for (;; ptr++)
3957      break;      break;
3958    
3959    
3960      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3961      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3962      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3963      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
3964    
3965      case '(':      case '(':
3966      newoptions = options;      newoptions = options;
3967      skipbytes = 0;      skipbytes = 0;
3968        bravalue = OP_CBRA;
3969        save_hwm = cd->hwm;
3970        reset_bracount = FALSE;
3971    
3972        /* First deal with various "verbs" that can be introduced by '*'. */
3973    
3974        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3975          {
3976          int i, namelen;
3977          const uschar *name = ++ptr;
3978          previous = NULL;
3979          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3980          if (*ptr == ':')
3981            {
3982            *errorcodeptr = ERR59;   /* Not supported */
3983            goto FAILED;
3984            }
3985          if (*ptr != ')')
3986            {
3987            *errorcodeptr = ERR60;
3988            goto FAILED;
3989            }
3990          namelen = ptr - name;
3991          for (i = 0; i < verbcount; i++)
3992            {
3993            if (namelen == verbs[i].len &&
3994                strncmp((char *)name, verbs[i].name, namelen) == 0)
3995              {
3996              *code = verbs[i].op;
3997              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3998              break;
3999              }
4000            }
4001          if (i < verbcount) continue;
4002          *errorcodeptr = ERR60;
4003          goto FAILED;
4004          }
4005    
4006        /* Deal with the extended parentheses; all are introduced by '?', and the
4007        appearance of any of them means that this is not a capturing group. */
4008    
4009      if (*(++ptr) == '?')      else if (*ptr == '?')
4010        {        {
4011        int set, unset;        int i, set, unset, namelen;
4012        int *optset;        int *optset;
4013          const uschar *name;
4014          uschar *slot;
4015    
4016        switch (*(++ptr))        switch (*(++ptr))
4017          {          {
4018          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
4019          ptr++;          ptr++;
4020          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
4021            if (*ptr == 0)
4022              {
4023              *errorcodeptr = ERR18;
4024              goto FAILED;
4025              }
4026          continue;          continue;
4027    
4028          case ':':                 /* Non-extracting bracket */  
4029            /* ------------------------------------------------------------ */
4030            case '|':                 /* Reset capture count for each branch */
4031            reset_bracount = TRUE;
4032            /* Fall through */
4033    
4034            /* ------------------------------------------------------------ */
4035            case ':':                 /* Non-capturing bracket */
4036          bravalue = OP_BRA;          bravalue = OP_BRA;
4037          ptr++;          ptr++;
4038          break;          break;
4039    
4040    
4041            /* ------------------------------------------------------------ */
4042          case '(':          case '(':
4043          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4044    
4045          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
4046            group), a name (referring to a named group), or 'R', referring to
4047            recursion. R<digits> and R&name are also permitted for recursion tests.
4048    
4049            There are several syntaxes for testing a named group: (?(name)) is used
4050            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4051    
4052            There are two unfortunate ambiguities, caused by history. (a) 'R' can
4053            be the recursive thing or the name 'R' (and similarly for 'R' followed
4054            by digits), and (b) a number could be a name that consists of digits.
4055            In both cases, we look for a name first; if not found, we try the other
4056            cases. */
4057    
4058            /* For conditions that are assertions, check the syntax, and then exit
4059            the switch. This will take control down to where bracketed groups,
4060            including assertions, are processed. */
4061    
4062            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4063              break;
4064    
4065            /* Most other conditions use OP_CREF (a couple change to OP_RREF
4066            below), and all need to skip 3 bytes at the start of the group. */
4067    
4068          if (ptr[1] == 'R')          code[1+LINK_SIZE] = OP_CREF;
4069            skipbytes = 3;
4070            refsign = -1;
4071    
4072            /* Check for a test for recursion in a named group. */
4073    
4074            if (ptr[1] == 'R' && ptr[2] == '&')
4075            {            {
4076            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4077            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4078            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4079            }            }
4080    
4081          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4082          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
4083    
4084          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
4085            {            {
4086            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4087            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4088            }            }
4089          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
         set bravalue above. */  
         break;  
   
         case '=':                 /* Positive lookahead */  
         bravalue = OP_ASSERT;  
         ptr++;  
         break;  
   
         case '!':                 /* Negative lookahead */  
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
   
         case '<':                 /* Lookbehinds */  
         switch (*(++ptr))  
4090            {            {
4091            case '=':               /* Positive lookbehind */            terminator = '\'';
           bravalue = OP_ASSERTBACK;  
4092            ptr++;            ptr++;
4093            break;            }
4094            else
4095              {
4096              terminator = 0;
4097              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4098              }
4099    
4100            case '!':               /* Negative lookbehind */          /* We now expect to read a name; anything else is an error */
4101            bravalue = OP_ASSERTBACK_NOT;  
4102            ptr++;          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4103            break;            {
4104              ptr += 1;  /* To get the right offset */
4105              *errorcodeptr = ERR28;
4106              goto FAILED;
4107            }            }
         break;  
4108    
4109          case '>':                 /* One-time brackets */          /* Read the name, but also get it as a number if it's all digits */
         bravalue = OP_ONCE;  
         ptr++;  
         break;  
4110    
4111          case 'C':                 /* Callout - may be followed by digits; */          recno = 0;
4112          previous_callout = code;  /* Save for later completion */          name = ++ptr;
4113          after_manual_callout = 1; /* Skip one item before completing */          while ((cd->ctypes[*ptr] & ctype_word) != 0)
4114          *code++ = OP_CALLOUT;     /* Already checked that the terminating */            {
4115            {                       /* closing parenthesis is present. */            if (recno >= 0)
4116            int n = 0;              recno = ((digitab[*ptr] & ctype_digit) != 0)?
4117            while ((digitab[*(++ptr)] & ctype_digit) != 0)                recno * 10 + *ptr - '0' : -1;
4118              n = n * 10 + *ptr - '0';            ptr++;
           if (n > 255)  
             {  
             *errorcodeptr = ERR38;  
             goto FAILED;  
             }  
           *code++ = n;  
           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */  
           PUT(code, LINK_SIZE, 0);                    /* Default length */  
           code += 2 * LINK_SIZE;  
4119            }            }
4120          previous = NULL;          namelen = ptr - name;
         continue;  
4121    
4122          case 'P':                 /* Named subpattern handling */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
         if (*(++ptr) == '<')      /* Definition */  
4123            {            {
4124            int i, namelen;            ptr--;      /* Error offset */
4125            uschar *slot = cd->name_table;            *errorcodeptr = ERR26;
4126            const uschar *name;     /* Don't amalgamate; some compilers */            goto FAILED;
4127            name = ++ptr;           /* grumble at autoincrement in declaration */            }
4128    
4129            while (*ptr++ != '>');          /* Do no further checking in the pre-compile phase. */
           namelen = ptr - name - 1;  
4130    
4131            for (i = 0; i < cd->names_found; i++)          if (lengthptr != NULL) break;
4132    
4133            /* In the real compile we do the work of looking for the actual
4134            reference. If the string started with "+" or "-" we require the rest to
4135            be digits, in which case recno will be set. */
4136    
4137            if (refsign > 0)
4138              {
4139              if (recno <= 0)
4140              {              {
4141              int crc = memcmp(name, slot+2, namelen);              *errorcodeptr = ERR58;
4142              if (crc == 0)              goto FAILED;
4143                {              }
4144                if (slot[2+namelen] == 0)            if (refsign == '-')
4145                  {              {
4146                  *errorcodeptr = ERR43;              recno = cd->bracount - recno + 1;
4147                  goto FAILED;              if (recno <= 0)
                 }  
               crc = -1;             /* Current name is substring */  
               }  
             if (crc < 0)  
4148                {                {
4149                memmove(slot + cd->name_entry_size, slot,                *errorcodeptr = ERR15;
4150                  (cd->names_found - i) * cd->name_entry_size);                goto FAILED;
               break;  
4151                }                }
             slot += cd->name_entry_size;  
4152              }              }
4153              else recno += cd->bracount;
4154              PUT2(code, 2+LINK_SIZE, recno);
4155              break;
4156              }
4157    
4158            PUT2(slot, 0, *brackets + 1);          /* Otherwise (did not start with "+" or "-"), start by looking for the
4159            memcpy(slot + 2, name, namelen);          name. */
4160            slot[2+namelen] = 0;  
4161            cd->names_found++;          slot = cd->name_table;
4162            goto NUMBERED_GROUP;          for (i = 0; i < cd->names_found; i++)
4163              {
4164              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4165              slot += cd->name_entry_size;
4166            }            }
4167    
4168          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* Found a previous named subpattern */
4169    
4170            if (i < cd->names_found)
4171            {            {
4172            int i, namelen;            recno = GET2(slot, 0);
4173            int type = *ptr++;            PUT2(code, 2+LINK_SIZE, recno);
4174            const uschar *name = ptr;            }
           uschar *slot = cd->name_table;  
4175    
4176            while (*ptr != ')') ptr++;          /* Search the pattern for a forward reference */
           namelen = ptr - name;  
4177    
4178            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4179                            (options & PCRE_EXTENDED) != 0)) > 0)
4180              {
4181              PUT2(code, 2+LINK_SIZE, i);
4182              }
4183    
4184            /* If terminator == 0 it means that the name followed directly after
4185            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4186            some further alternatives to try. For the cases where terminator != 0
4187            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4188            now checked all the possibilities, so give an error. */
4189    
4190            else if (terminator != 0)
4191              {
4192              *errorcodeptr = ERR15;
4193              goto FAILED;
4194              }
4195    
4196            /* Check for (?(R) for recursion. Allow digits after R to specify a
4197            specific group number. */
4198    
4199            else if (*name == 'R')
4200              {
4201              recno = 0;
4202              for (i = 1; i < namelen; i++)
4203                {
4204                if ((digitab[name[i]] & ctype_digit) == 0)
4205                  {
4206                  *errorcodeptr = ERR15;
4207                  goto FAILED;
4208                  }
4209                recno = recno * 10 + name[i] - '0';
4210                }
4211              if (recno == 0) recno = RREF_ANY;
4212              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4213              PUT2(code, 2+LINK_SIZE, recno);
4214              }
4215    
4216            /* Similarly, check for the (?(DEFINE) "condition", which is always
4217            false. */
4218    
4219            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4220              {
4221              code[1+LINK_SIZE] = OP_DEF;
4222              skipbytes = 1;
4223              }
4224    
4225            /* Check for the "name" actually being a subpattern number. */
4226    
4227            else if (recno > 0)
4228              {
4229              PUT2(code, 2+LINK_SIZE, recno);
4230              }
4231    
4232            /* Either an unidentified subpattern, or a reference to (?(0) */
4233    
4234            else
4235              {
4236              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4237              goto FAILED;
4238              }
4239            break;
4240    
4241    
4242            /* ------------------------------------------------------------ */
4243            case '=':                 /* Positive lookahead */
4244            bravalue = OP_ASSERT;
4245            ptr++;
4246            break;
4247    
4248    
4249            /* ------------------------------------------------------------ */
4250            case '!':                 /* Negative lookahead */
4251            ptr++;
4252            if (*ptr == ')')          /* Optimize (?!) */
4253              {
4254              *code++ = OP_FAIL;
4255              previous = NULL;
4256              continue;
4257              }
4258            bravalue = OP_ASSERT_NOT;
4259            break;
4260    
4261    
4262            /* ------------------------------------------------------------ */
4263            case '<':                 /* Lookbehind or named define */
4264            switch (ptr[1])
4265              {
4266              case '=':               /* Positive lookbehind */
4267              bravalue = OP_ASSERTBACK;
4268              ptr += 2;
4269              break;
4270    
4271              case '!':               /* Negative lookbehind */
4272              bravalue = OP_ASSERTBACK_NOT;
4273              ptr += 2;
4274              break;
4275    
4276              default:                /* Could be name define, else bad */
4277              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4278              ptr++;                  /* Correct offset for error */
4279              *errorcodeptr = ERR24;
4280              goto FAILED;
4281              }
4282            break;
4283    
4284    
4285            /* ------------------------------------------------------------ */
4286            case '>':                 /* One-time brackets */
4287            bravalue = OP_ONCE;
4288            ptr++;
4289            break;
4290    
4291    
4292            /* ------------------------------------------------------------ */
4293            case 'C':                 /* Callout - may be followed by digits; */
4294            previous_callout = code;  /* Save for later completion */
4295            after_manual_callout = 1; /* Skip one item before completing */
4296            *code++ = OP_CALLOUT;
4297              {
4298              int n = 0;
4299              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4300                n = n * 10 + *ptr - '0';
4301              if (*ptr != ')')
4302                {
4303                *errorcodeptr = ERR39;
4304                goto FAILED;
4305                }
4306              if (n > 255)
4307                {
4308                *errorcodeptr = ERR38;
4309                goto FAILED;
4310                }
4311              *code++ = n;
4312              PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4313              PUT(code, LINK_SIZE, 0);                    /* Default length */
4314              code += 2 * LINK_SIZE;
4315              }
4316            previous = NULL;
4317            continue;
4318    
4319    
4320            /* ------------------------------------------------------------ */
4321            case 'P':                 /* Python-style named subpattern handling */
4322            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4323              {
4324              is_recurse = *ptr == '>';
4325              terminator = ')';
4326              goto NAMED_REF_OR_RECURSE;
4327              }
4328            else if (*ptr != '<')    /* Test for Python-style definition */
4329              {
4330              *errorcodeptr = ERR41;
4331              goto FAILED;
4332              }
4333            /* Fall through to handle (?P< as (?< is handled */
4334    
4335    
4336            /* ------------------------------------------------------------ */
4337            DEFINE_NAME:    /* Come here from (?< handling */
4338            case '\'':
4339              {
4340              terminator = (*ptr == '<')? '>' : '\'';
4341              name = ++ptr;
4342    
4343              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4344              namelen = ptr - name;
4345    
4346              /* In the pre-compile phase, just do a syntax check. */
4347    
4348              if (lengthptr != NULL)
4349                {
4350                if (*ptr != terminator)
4351                  {
4352                  *errorcodeptr = ERR42;
4353                  goto FAILED;
4354                  }
4355                if (cd->names_found >= MAX_NAME_COUNT)
4356                  {
4357                  *errorcodeptr = ERR49;
4358                  goto FAILED;
4359                  }
4360                if (namelen + 3 > cd->name_entry_size)
4361                  {
4362                  cd->name_entry_size = namelen + 3;
4363                  if (namelen > MAX_NAME_SIZE)
4364                    {
4365                    *errorcodeptr = ERR48;
4366                    goto FAILED;
4367                    }
4368                  }
4369                }
4370    
4371              /* In the real compile, create the entry in the table */
4372    
4373              else
4374                {
4375                slot = cd->name_table;
4376                for (i = 0; i < cd->names_found; i++)
4377                  {
4378                  int crc = memcmp(name, slot+2, namelen);
4379                  if (crc == 0)
4380                    {
4381                    if (slot[2+namelen] == 0)
4382                      {
4383                      if ((options & PCRE_DUPNAMES) == 0)
4384                        {
4385                        *errorcodeptr = ERR43;
4386                        goto FAILED;
4387                        }
4388                      }
4389                    else crc = -1;      /* Current name is substring */
4390                    }
4391                  if (crc < 0)
4392                    {
4393                    memmove(slot + cd->name_entry_size, slot,
4394                      (cd->names_found - i) * cd->name_entry_size);
4395                    break;
4396                    }
4397                  slot += cd->name_entry_size;
4398                  }
4399    
4400                PUT2(slot, 0, cd->bracount + 1);
4401                memcpy(slot + 2, name, namelen);
4402                slot[2+namelen] = 0;
4403                }
4404              }
4405    
4406            /* In both cases, count the number of names we've encountered. */
4407    
4408            ptr++;                    /* Move past > or ' */
4409            cd->names_found++;
4410            goto NUMBERED_GROUP;
4411    
4412    
4413            /* ------------------------------------------------------------ */
4414            case '&':                 /* Perl recursion/subroutine syntax */
4415            terminator = ')';
4416            is_recurse = TRUE;
4417            /* Fall through */
4418    
4419            /* We come here from the Python syntax above that handles both
4420            references (?P=name) and recursion (?P>name), as well as falling
4421            through from the Perl recursion syntax (?&name). */
4422    
4423            NAMED_REF_OR_RECURSE:
4424            name = ++ptr;
4425            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4426            namelen = ptr - name;
4427    
4428            /* In the pre-compile phase, do a syntax check and set a dummy
4429            reference number. */
4430    
4431            if (lengthptr != NULL)
4432              {
4433              if (*ptr != terminator)
4434                {
4435                *errorcodeptr = ERR42;
4436                goto FAILED;
4437                }
4438              if (namelen > MAX_NAME_SIZE)
4439                {
4440                *errorcodeptr = ERR48;
4441                goto FAILED;
4442                }
4443              recno = 0;
4444              }
4445    
4446            /* In the real compile, seek the name in the table */
4447    
4448            else
4449              {
4450              slot = cd->name_table;
4451            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4452              {              {
4453              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4454              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4455              }              }
4456            if (i >= cd->names_found)  
4457              if (i < cd->names_found)         /* Back reference */
4458                {
4459                recno = GET2(slot, 0);
4460                }
4461              else if ((recno =                /* Forward back reference */
4462                        find_parens(ptr, cd->bracount, name, namelen,
4463                          (options & PCRE_EXTENDED) != 0)) <= 0)
4464              {              {
4465              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4466              goto FAILED;              goto FAILED;
4467              }              }
4468              }
4469    
4470            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4471            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4472    
4473            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4474            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4475    
         /* Should never happen */  
         break;  
4476    
4477          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4478            case 'R':                 /* Recursion */
4479          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4480          /* Fall through */          /* Fall through */
4481    
         /* Recursion or "subroutine" call */  
4482    
4483          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4484          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4485            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4486            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4487            {            {
4488            const uschar *called;            const uschar *called;
4489    
4490              if ((refsign = *ptr) == '+') ptr++;
4491              else if (refsign == '-')
4492                {
4493                if ((digitab[ptr[1]] & ctype_digit) == 0)
4494                  goto OTHER_CHAR_AFTER_QUERY;
4495                ptr++;
4496                }
4497    
4498            recno = 0;            recno = 0;
4499            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4500              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4501    
4502              if (*ptr != ')')
4503                {
4504                *errorcodeptr = ERR29;
4505                goto FAILED;
4506                }
4507    
4508              if (refsign == '-')
4509                {
4510                if (recno == 0)
4511                  {
4512                  *errorcodeptr = ERR58;
4513                  goto FAILED;
4514                  }
4515                recno = cd->bracount - recno + 1;
4516                if (recno <= 0)
4517                  {
4518                  *errorcodeptr = ERR15;
4519                  goto FAILED;
4520                  }
4521                }
4522              else if (refsign == '+')
4523                {
4524                if (recno == 0)
4525                  {
4526                  *errorcodeptr = ERR58;
4527                  goto FAILED;
4528                  }
4529                recno += cd->bracount;
4530                }
4531    
4532            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4533    
4534            HANDLE_RECURSION:            HANDLE_RECURSION:
4535    
4536            previous = code;            previous = code;
4537              called = cd->start_code;
4538    
4539            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4540            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4541              this point. If we end up with a forward reference, first check that
4542            *code = OP_END;            the bracket does occur later so we can give the error (and position)
4543            called = (recno == 0)?            now. Then remember this forward reference in the workspace so it can
4544              cd->start_code : find_bracket(cd->start_code, utf8, recno);            be filled in at the end. */
4545    
4546            if (called == NULL)            if (lengthptr == NULL)
4547              {              {
4548              *errorcodeptr = ERR15;              *code = OP_END;
4549              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4550    
4551            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4552    
4553            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4554              {                {
4555              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4556              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4557                    {
4558                    *errorcodeptr = ERR15;
4559                    goto FAILED;
4560                    }
4561                  called = cd->start_code + recno;
4562                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4563                  }
4564    
4565                /* If not a forward reference, and the subpattern is still open,
4566                this is a recursive call. We check to see if this is a left
4567                recursion that could loop for ever, and diagnose that case. */
4568    
4569                else if (GET(called, 1) == 0 &&
4570                         could_be_empty(called, code, bcptr, utf8))
4571                  {
4572                  *errorcodeptr = ERR40;
4573                  goto FAILED;
4574                  }
4575              }              }
4576    
4577            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item, automatically wrapped inside
4578            "once" brackets. */            "once" brackets. Set up a "previous group" length so that a
4579              subsequent quantifier will work. */
4580    
4581            *code = OP_ONCE;            *code = OP_ONCE;
4582            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
# Line 3069  for (;; ptr++) Line 4589  for (;; ptr++)
4589            *code = OP_KET;            *code = OP_KET;
4590            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
4591            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4592    
4593              length_prevgroup = 3 + 3*LINK_SIZE;
4594            }            }
4595    
4596            /* Can't determine a first byte now */
4597    
4598            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4599          continue;          continue;
4600    
         /* Character after (? not specially recognized */  
4601    
4602          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4603            default:              /* Other characters: check option setting */
4604            OTHER_CHAR_AFTER_QUERY:
4605          set = unset = 0;          set = unset = 0;
4606          optset = &set;          optset = &set;
4607    
# Line 3084  for (;; ptr++) Line 4611  for (;; ptr++)
4611              {              {
4612              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4613    
4614                case 'J':    /* Record that it changed in the external options */
4615                *optset |= PCRE_DUPNAMES;
4616                cd->external_options |= PCRE_JCHANGED;
4617                break;
4618    
4619              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4620              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4621              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4622              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4623              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4624              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4625    
4626                default:  *errorcodeptr = ERR12;
4627                          ptr--;    /* Correct the offset */
4628                          goto FAILED;
4629              }              }
4630            }            }
4631    
# Line 3098  for (;; ptr++) Line 4634  for (;; ptr++)
4634          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4635    
4636          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4637          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4638          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4639          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4640          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4641          a group), a resetting item can be compiled.          caseless checking of required bytes.
4642    
4643          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4644          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4645          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4646            that value after the start, because it gets reset as code is discarded
4647            during the pre-compile. However, this can happen only at top level - if
4648            we are within parentheses, the starting BRA will still be present. At
4649            any parenthesis level, the length value can be used to test if anything
4650            has been compiled at that level. Thus, a test for both these conditions
4651            is necessary to ensure we correctly detect the start of the pattern in
4652            both phases.
4653    
4654            If we are not at the pattern start, compile code to change the ims
4655            options if this setting actually changes any of them. We also pass the
4656            new setting back so that it can be put at the start of any following
4657            branches, and when this group ends (if we are in a group), a resetting
4658            item can be compiled. */
4659    
4660          if (*ptr == ')')          if (*ptr == ')')
4661            {            {
4662            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4663                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4664              {              {
4665              *code++ = OP_OPT;              cd->external_options = newoptions;
4666              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4667              }              }
4668             else
4669                {
4670                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4671                  {
4672                  *code++ = OP_OPT;
4673                  *code++ = newoptions & PCRE_IMS;
4674                  }
4675    
4676            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4677            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4678            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4679    
4680            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4681            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4682            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4683            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4684                }
4685    
4686            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4687            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3136  for (;; ptr++) Line 4694  for (;; ptr++)
4694    
4695          bravalue = OP_BRA;          bravalue = OP_BRA;
4696          ptr++;          ptr++;
4697          }          }     /* End of switch for character following (? */
4698        }        }       /* End of (? handling */
4699    
4700      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4701      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4702        brackets. */
4703    
4704      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4705        {        {
4706        bravalue = OP_BRA;        bravalue = OP_BRA;
4707        }        }
4708    
4709      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4710    
4711      else      else
4712        {        {
4713        NUMBERED_GROUP:        NUMBERED_GROUP:
4714        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4715          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4716          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4717        }        }
4718    
4719      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4720      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4721      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4722      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4723        they have changed. */
4724    
4725      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4726      *code = bravalue;      *code = bravalue;
4727      tempcode = code;      tempcode = code;
4728      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4729        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4730    
4731      if (!compile_regex(      if (!compile_regex(
4732           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4733           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4734           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4735           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4736           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4737           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4738            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4739           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           reset_bracount,               /* True if (?| group */
4740             skipbytes,                    /* Skip over bracket number */
4741           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4742           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4743           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4744           cd))                          /* Tables block */           cd,                           /* Tables block */
4745             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4746               &length_prevgroup           /* Pre-compile phase */
4747             ))
4748        goto FAILED;        goto FAILED;
4749    
4750      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3196  for (;; ptr++) Line 4753  for (;; ptr++)
4753      is on the bracket. */      is on the bracket. */
4754    
4755      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4756      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4757        in the real compile phase, not in the pre-pass, where the whole group may
4758        not be available. */
4759    
4760      else if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4761        {        {
4762        uschar *tc = code;        uschar *tc = code;
4763        condcount = 0;        int condcount = 0;
4764    
4765        do {        do {
4766           condcount++;           condcount++;
# Line 3209  for (;; ptr++) Line 4768  for (;; ptr++)
4768           }           }
4769        while (*tc != OP_KET);        while (*tc != OP_KET);
4770    
4771        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4772          false). It must have only one branch. */
4773    
4774          if (code[LINK_SIZE+1] == OP_DEF)
4775          {          {
4776          *errorcodeptr = ERR27;          if (condcount > 1)
4777          goto FAILED;            {
4778              *errorcodeptr = ERR54;
4779              goto FAILED;
4780              }
4781            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4782            }
4783    
4784          /* A "normal" conditional group. If there is just one branch, we must not
4785          make use of its firstbyte or reqbyte, because this is equivalent to an
4786          empty second branch. */
4787    
4788          else
4789            {
4790            if (condcount > 2)
4791              {
4792              *errorcodeptr = ERR27;
4793              goto FAILED;
4794              }
4795            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4796          }          }
4797          }
4798    
4799        /* Error if hit end of pattern */
4800    
4801        if (*ptr != ')')
4802          {
4803          *errorcodeptr = ERR14;
4804          goto FAILED;
4805          }
4806    
4807        /* If there is just one branch, we must not make use of its firstbyte or      /* In the pre-compile phase, update the length by the length of the group,
4808        reqbyte, because this is equivalent to an empty second branch. */      less the brackets at either end. Then reduce the compiled code to just a
4809        set of non-capturing brackets so that it doesn't use much memory if it is
4810        duplicated by a quantifier.*/
4811    
4812        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      if (lengthptr != NULL)
4813          {
4814          if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4815            {
4816            *errorcodeptr = ERR20;
4817            goto FAILED;
4818            }
4819          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4820          *code++ = OP_BRA;
4821          PUTINC(code, 0, 1 + LINK_SIZE);
4822          *code++ = OP_KET;
4823          PUTINC(code, 0, 1 + LINK_SIZE);
4824          break;    /* No need to waste time with special character handling */
4825        }        }
4826    
4827      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4828      brackets of all kinds, and conditions with two branches (see code above).  
4829      If the bracket is followed by a quantifier with zero repeat, we have to      code = tempcode;
4830      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4831      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4832        relevant. */
4833    
4834        if (bravalue == OP_DEF) break;
4835    
4836        /* Handle updating of the required and first characters for other types of
4837        group. Update for normal brackets of all kinds, and conditions with two
4838        branches (see code above). If the bracket is followed by a quantifier with
4839        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4840        zerofirstbyte outside the main loop so that they can be accessed for the
4841        back off. */
4842    
4843      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4844      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4845      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4846    
4847      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4848        {        {
4849        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4850        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3272  for (;; ptr++) Line 4885  for (;; ptr++)
4885      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4886    
4887      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4888        break;     /* End of processing '(' */
4889    
     /* Now update the main code pointer to the end of the group. */  
4890    
4891      code = tempcode;      /* ===================================================================*/
4892        /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
     /* Error if hit end of pattern */  
   
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
   
     case '\\':  
     tempptr = ptr;  
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
   
     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values  
4893      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4894      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4895      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4896      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4897      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4898    
4899        case '\\':
4900        tempptr = ptr;
4901        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4902        if (*errorcodeptr != 0) goto FAILED;
4903    
4904      if (c < 0)      if (c < 0)
4905        {        {
4906        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3310  for (;; ptr++) Line 4910  for (;; ptr++)
4910          continue;          continue;
4911          }          }
4912    
4913          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4914    
4915        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4916        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4917    
# Line 3321  for (;; ptr++) Line 4923  for (;; ptr++)
4923        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4924        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4925    
4926        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4927          We also support \k{name} (.NET syntax) */
4928    
4929          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4930            {
4931            is_recurse = FALSE;
4932            terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4933            goto NAMED_REF_OR_RECURSE;
4934            }
4935    
4936          /* Back references are handled specially; must disable firstbyte if
4937          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4938          ':' later. */
4939    
4940        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4941          {          {
4942          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4943    
4944            HANDLE_REFERENCE:    /* Come here from named backref handling */
4945            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4946          previous = code;          previous = code;
4947          *code++ = OP_REF;          *code++ = OP_REF;
4948          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4949            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4950            if (recno > cd->top_backref) cd->top_backref = recno;
4951          }          }
4952    
4953        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4954    
4955  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4956        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
# Line 3340  for (;; ptr++) Line 4958  for (;; ptr++)
4958          BOOL negated;          BOOL negated;
4959          int pdata;          int pdata;
4960          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4961            if (ptype < 0) goto FAILED;
4962          previous = code;          previous = code;
4963          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4964          *code++ = ptype;          *code++ = ptype;
4965          *code++ = pdata;          *code++ = pdata;
4