/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 79 by nigel, Sat Feb 24 21:40:52 2007 UTC | revision 208 by ph10, Mon Aug 6 15:23:29 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56    /* When DEBUG is defined, we need the pcre_printint() function, which is also
57    used by pcretest. DEBUG is not defined when building a production library. */
58    
59    #ifdef DEBUG
60    #include "pcre_printint.src"
61    #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 87  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 107  static const short int escapes[] = { Line 141  static const short int escapes[] = {
141    
142    
143  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
144  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
145  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
146    
147  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 118  static const char *const posix_names[] = Line 152  static const char *const posix_names[] =
152  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
153    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
154    
155  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
156  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
157  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
158    characters are removed, and for [:alpha:] and [:alnum:] the underscore
159    character is removed. The triples in the table consist of the base map offset,
160    second map offset or -1 if no second map, and a non-negative value for map
161    addition or a negative value for map subtraction (if there are two maps). The
162    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
163    remove vertical space characters, 2 => remove underscore. */
164    
165  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
166    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
167    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
168    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
169    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
170    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
171    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
172    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
173    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
174    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
175    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
176    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
177    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
178    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
179    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
180  };  };
181    
182    
183    #define STRING(a)  # a
184    #define XSTRING(s) STRING(s)
185    
186  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
187  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
188    they are documented. Always add a new error instead. Messages marked DEAD below
189    are no longer used. */
190    
191  static const char *error_texts[] = {  static const char *error_texts[] = {
192    "no error",    "no error",
# Line 156  static const char *error_texts[] = { Line 201  static const char *error_texts[] = {
201    "range out of order in character class",    "range out of order in character class",
202    "nothing to repeat",    "nothing to repeat",
203    /* 10 */    /* 10 */
204    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
205    "internal error: unexpected repeat",    "internal error: unexpected repeat",
206    "unrecognized character after (?",    "unrecognized character after (?",
207    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 166  static const char *error_texts[] = { Line 211  static const char *error_texts[] = {
211    "erroffset passed as NULL",    "erroffset passed as NULL",
212    "unknown option bit(s) set",    "unknown option bit(s) set",
213    "missing ) after comment",    "missing ) after comment",
214    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
215    /* 20 */    /* 20 */
216    "regular expression too large",    "regular expression is too large",
217    "failed to get memory",    "failed to get memory",
218    "unmatched parentheses",    "unmatched parentheses",
219    "internal error: code overflow",    "internal error: code overflow",
220    "unrecognized character after (?<",    "unrecognized character after (?<",
221    /* 25 */    /* 25 */
222    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
223    "malformed number after (?(",    "malformed number or name after (?(",
224    "conditional group contains more than two branches",    "conditional group contains more than two branches",
225    "assertion expected after (?(",    "assertion expected after (?(",
226    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
227    /* 30 */    /* 30 */
228    "unknown POSIX class name",    "unknown POSIX class name",
229    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
230    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
231    "spare error",    "spare error",  /** DEAD **/
232    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
233    /* 35 */    /* 35 */
234    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 194  static const char *error_texts[] = { Line 239  static const char *error_texts[] = {
239    /* 40 */    /* 40 */
240    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
241    "unrecognized character after (?P",    "unrecognized character after (?P",
242    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
243    "two named groups have the same name",    "two named subpatterns have the same name",
244    "invalid UTF-8 string",    "invalid UTF-8 string",
245    /* 45 */    /* 45 */
246    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
247    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
248    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
249      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
250      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
251      /* 50 */
252      "repeated subpattern is too long",    /** DEAD **/
253      "octal value is greater than \\377 (not in UTF-8 mode)",
254      "internal error: overran compiling workspace",
255      "internal error: previously-checked referenced subpattern not found",
256      "DEFINE group contains more than one branch",
257      /* 55 */
258      "repeating a DEFINE group is not allowed",
259      "inconsistent NEWLINE options",
260      "\\g is not followed by a braced name or an optionally braced non-zero number",
261      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
262  };  };
263    
264    
# Line 220  For convenience, we use the same bit def Line 278  For convenience, we use the same bit def
278    
279  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
280    
281  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
282  static const unsigned char digitab[] =  static const unsigned char digitab[] =
283    {    {
284    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 314  static const unsigned char digitab[] =
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
316    
317  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
318  static const unsigned char digitab[] =  static const unsigned char digitab[] =
319    {    {
320    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 328  static const unsigned char digitab[] =
328    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
329    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
330    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
331    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
332    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
333    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
334    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 362  static const unsigned char ebcdic_charta
362    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
363    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
364    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
365    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
366    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
367    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
368    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 389  static const unsigned char ebcdic_charta
389  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
390    
391  static BOOL  static BOOL
392    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
393      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
394    
395    
396    
# Line 342  static BOOL Line 400  static BOOL
400    
401  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
402  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
403  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
404  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
405  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
406    ptr is pointing at the \. On exit, it is on the final character of the escape
407    sequence.
408    
409  Arguments:  Arguments:
410    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 362  static int Line 422  static int
422  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
423    int options, BOOL isclass)    int options, BOOL isclass)
424  {  {
425  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
426    const uschar *ptr = *ptrptr + 1;
427  int c, i;  int c, i;
428    
429    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
430    ptr--;                            /* Set pointer back to the last byte */
431    
432  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
433    
 c = *(++ptr);  
434  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
435    
436  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
437  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
438  Otherwise further processing may be required. */  Otherwise further processing may be required. */
439    
440  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
441  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
442  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
443    
444  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
445  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
446  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
447  #endif  #endif
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 451  else if ((i = escapes[c - 0x48]) != 0)
451  else  else
452    {    {
453    const uschar *oldptr;    const uschar *oldptr;
454      BOOL braced, negated;
455    
456    switch (c)    switch (c)
457      {      {
458      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 466  else
466      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
467      break;      break;
468    
469        /* \g must be followed by a number, either plain or braced. If positive, it
470        is an absolute backreference. If negative, it is a relative backreference.
471        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
472        reference to a named group. This is part of Perl's movement towards a
473        unified syntax for back references. As this is synonymous with \k{name}, we
474        fudge it up by pretending it really was \k. */
475    
476        case 'g':
477        if (ptr[1] == '{')
478          {
479          const uschar *p;
480          for (p = ptr+2; *p != 0 && *p != '}'; p++)
481            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
482          if (*p != 0 && *p != '}')
483            {
484            c = -ESC_k;
485            break;
486            }
487          braced = TRUE;
488          ptr++;
489          }
490        else braced = FALSE;
491    
492        if (ptr[1] == '-')
493          {
494          negated = TRUE;
495          ptr++;
496          }
497        else negated = FALSE;
498    
499        c = 0;
500        while ((digitab[ptr[1]] & ctype_digit) != 0)
501          c = c * 10 + *(++ptr) - '0';
502    
503        if (c == 0 || (braced && *(++ptr) != '}'))
504          {
505          *errorcodeptr = ERR57;
506          return 0;
507          }
508    
509        if (negated)
510          {
511          if (c > bracount)
512            {
513            *errorcodeptr = ERR15;
514            return 0;
515            }
516          c = bracount - (c - 1);
517          }
518    
519        c = -(ESC_REF + c);
520        break;
521    
522      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
523      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
524      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 442  else Line 560  else
560        }        }
561    
562      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
563      larger first octal digit. */      larger first octal digit. The original code used just to take the least
564        significant 8 bits of octal numbers (I think this is what early Perls used
565        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
566        than 3 octal digits. */
567    
568      case '0':      case '0':
569      c -= '0';      c -= '0';
570      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
571          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
572      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
573      break;      break;
574    
575      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
576      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
577        treated as a data character. */
578    
579      case 'x':      case 'x':
580  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
581        {        {
582        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
583        register int count = 0;        int count = 0;
584    
585        c = 0;        c = 0;
586        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
587          {          {
588          int cc = *pt++;          register int cc = *pt++;
589            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
590          count++;          count++;
591  #if !EBCDIC    /* ASCII coding */  
592    #ifndef EBCDIC  /* ASCII coding */
593          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
594          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
595  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
596          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
597          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
598  #endif  #endif
599          }          }
600    
601        if (*pt == '}')        if (*pt == '}')
602          {          {
603          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
604          ptr = pt;          ptr = pt;
605          break;          break;
606          }          }
607    
608        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
609        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
610        }        }
 #endif  
611    
612      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
613    
614      c = 0;      c = 0;
615      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
616        {        {
617        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
618        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
619  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
620        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
621        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
622  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
623        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
624        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
625  #endif  #endif
626        }        }
627      break;      break;
628    
629      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
630        This coding is ASCII-specific, but then the whole concept of \cx is
631        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
632    
633      case 'c':      case 'c':
634      c = *(++ptr);      c = *(++ptr);
# Line 511  else Line 638  else
638        return 0;        return 0;
639        }        }
640    
641      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
642      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
643      c ^= 0x40;      c ^= 0x40;
644  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
645      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
646      c ^= 0xC0;      c ^= 0xC0;
647  #endif  #endif
# Line 560  escape sequence. Line 683  escape sequence.
683  Argument:  Argument:
684    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
685    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
686      dptr           points to an int that is set to the detailed property value
687    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
688    
689  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
690  */  */
691    
692  static int  static int
693  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
694  {  {
695  int c, i, bot, top;  int c, i, bot, top;
696  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
697  char name[4];  char name[32];
698    
699  c = *(++ptr);  c = *(++ptr);
700  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
701    
702  *negptr = FALSE;  *negptr = FALSE;
703    
704  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
705  preceded by ^ for negation. */  negation. */
706    
707  if (c == '{')  if (c == '{')
708    {    {
# Line 587  if (c == '{') Line 711  if (c == '{')
711      *negptr = TRUE;      *negptr = TRUE;
712      ptr++;      ptr++;
713      }      }
714    for (i = 0; i <= 2; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
715      {      {
716      c = *(++ptr);      c = *(++ptr);
717      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
718      if (c == '}') break;      if (c == '}') break;
719      name[i] = c;      name[i] = c;
720      }      }
721    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
722    name[i] = 0;    name[i] = 0;
723    }    }
724    
# Line 619  top = _pcre_utt_size; Line 739  top = _pcre_utt_size;
739    
740  while (bot < top)  while (bot < top)
741    {    {
742    i = (bot + top)/2;    i = (bot + top) >> 1;
743    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
744    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
745        {
746        *dptr = _pcre_utt[i].value;
747        return _pcre_utt[i].type;
748        }
749    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
750    }    }
751    
 UNKNOWN_RETURN:  
752  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
753  *ptrptr = ptr;  *ptrptr = ptr;
754  return -1;  return -1;
# Line 698  read_repeat_counts(const uschar *p, int Line 821  read_repeat_counts(const uschar *p, int
821  int min = 0;  int min = 0;
822  int max = -1;  int max = -1;
823    
824    /* Read the minimum value and do a paranoid check: a negative value indicates
825    an integer overflow. */
826    
827  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
828    if (min < 0 || min > 65535)
829      {
830      *errorcodeptr = ERR5;
831      return p;
832      }
833    
834    /* Read the maximum value if there is one, and again do a paranoid check on its size.
835    Also, max must not be less than min. */
836    
837  if (*p == '}') max = min; else  if (*p == '}') max = min; else
838    {    {
# Line 706  if (*p == '}') max = min; else Line 840  if (*p == '}') max = min; else
840      {      {
841      max = 0;      max = 0;
842      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
843        if (max < 0 || max > 65535)
844          {
845          *errorcodeptr = ERR5;
846          return p;
847          }
848      if (max < min)      if (max < min)
849        {        {
850        *errorcodeptr = ERR4;        *errorcodeptr = ERR4;
# Line 714  if (*p == '}') max = min; else Line 853  if (*p == '}') max = min; else
853      }      }
854    }    }
855    
856  /* Do paranoid checks, then fill in the required variables, and pass back the  /* Fill in the required variables, and pass back the pointer to the terminating
857  pointer to the terminating '}'. */  '}'. */
858    
859  if (min > 65535 || max > 65535)  *minp = min;
860    *errorcodeptr = ERR5;  *maxp = max;
861  else  return p;
862    }
863    
864    
865    
866    /*************************************************
867    *       Find forward referenced subpattern       *
868    *************************************************/
869    
870    /* This function scans along a pattern's text looking for capturing
871    subpatterns, and counting them. If it finds a named pattern that matches the
872    name it is given, it returns its number. Alternatively, if the name is NULL, it
873    returns when it reaches a given numbered subpattern. This is used for forward
874    references to subpatterns. We know that if (?P< is encountered, the name will
875    be terminated by '>' because that is checked in the first pass.
876    
877    Arguments:
878      ptr          current position in the pattern
879      count        current count of capturing parens so far encountered
880      name         name to seek, or NULL if seeking a numbered subpattern
881      lorn         name length, or subpattern number if name is NULL
882      xmode        TRUE if we are in /x mode
883    
884    Returns:       the number of the named subpattern, or -1 if not found
885    */
886    
887    static int
888    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
889      BOOL xmode)
890    {
891    const uschar *thisname;
892    
893    for (; *ptr != 0; ptr++)
894    {    {
895    *minp = min;    int term;
896    *maxp = max;  
897      /* Skip over backslashed characters and also entire \Q...\E */
898    
899      if (*ptr == '\\')
900        {
901        if (*(++ptr) == 0) return -1;
902        if (*ptr == 'Q') for (;;)
903          {
904          while (*(++ptr) != 0 && *ptr != '\\');
905          if (*ptr == 0) return -1;
906          if (*(++ptr) == 'E') break;
907          }
908        continue;
909        }
910    
911      /* Skip over character classes */
912    
913      if (*ptr == '[')
914        {
915        while (*(++ptr) != ']')
916          {
917          if (*ptr == '\\')
918            {
919            if (*(++ptr) == 0) return -1;
920            if (*ptr == 'Q') for (;;)
921              {
922              while (*(++ptr) != 0 && *ptr != '\\');
923              if (*ptr == 0) return -1;
924              if (*(++ptr) == 'E') break;
925              }
926            continue;
927            }
928          }
929        continue;
930        }
931    
932      /* Skip comments in /x mode */
933    
934      if (xmode && *ptr == '#')
935        {
936        while (*(++ptr) != 0 && *ptr != '\n');
937        if (*ptr == 0) return -1;
938        continue;
939        }
940    
941      /* An opening parens must now be a real metacharacter */
942    
943      if (*ptr != '(') continue;
944      if (ptr[1] != '?')
945        {
946        count++;
947        if (name == NULL && count == lorn) return count;
948        continue;
949        }
950    
951      ptr += 2;
952      if (*ptr == 'P') ptr++;                      /* Allow optional P */
953    
954      /* We have to disambiguate (?<! and (?<= from (?<name> */
955    
956      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
957           *ptr != '\'')
958        continue;
959    
960      count++;
961    
962      if (name == NULL && count == lorn) return count;
963      term = *ptr++;
964      if (term == '<') term = '>';
965      thisname = ptr;
966      while (*ptr != term) ptr++;
967      if (name != NULL && lorn == ptr - thisname &&
968          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
969        return count;
970    }    }
971  return p;  
972    return -1;
973  }  }
974    
975    
# Line 778  for (;;) Line 1023  for (;;)
1023    
1024      case OP_CALLOUT:      case OP_CALLOUT:
1025      case OP_CREF:      case OP_CREF:
1026      case OP_BRANUMBER:      case OP_RREF:
1027        case OP_DEF:
1028      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1029      break;      break;
1030    
# Line 823  for (;;) Line 1069  for (;;)
1069    {    {
1070    int d;    int d;
1071    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1072    
1073    switch (op)    switch (op)
1074      {      {
1075        case OP_CBRA:
1076      case OP_BRA:      case OP_BRA:
1077      case OP_ONCE:      case OP_ONCE:
1078      case OP_COND:      case OP_COND:
1079      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1080      if (d < 0) return d;      if (d < 0) return d;
1081      branchlength += d;      branchlength += d;
1082      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 865  for (;;) Line 1111  for (;;)
1111      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1112    
1113      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1114      case OP_CREF:      case OP_CREF:
1115        case OP_RREF:
1116        case OP_DEF:
1117      case OP_OPT:      case OP_OPT:
1118      case OP_CALLOUT:      case OP_CALLOUT:
1119      case OP_SOD:      case OP_SOD:
# Line 884  for (;;) Line 1131  for (;;)
1131    
1132      case OP_CHAR:      case OP_CHAR:
1133      case OP_CHARNC:      case OP_CHARNC:
1134        case OP_NOT:
1135      branchlength++;      branchlength++;
1136      cc += 2;      cc += 2;
1137  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 917  for (;;) Line 1165  for (;;)
1165    
1166      case OP_PROP:      case OP_PROP:
1167      case OP_NOTPROP:      case OP_NOTPROP:
1168      cc++;      cc += 2;
1169      /* Fall through */      /* Fall through */
1170    
1171      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 998  Returns:      pointer to the opcode for Line 1246  Returns:      pointer to the opcode for
1246  static const uschar *  static const uschar *
1247  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1248  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1249  for (;;)  for (;;)
1250    {    {
1251    register int c = *code;    register int c = *code;
1252    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1253    else if (c > OP_BRA)  
1254      /* XCLASS is used for classes that cannot be represented just by a bit
1255      map. This includes negated single high-valued characters. The length in
1256      the table is zero; the actual length is stored in the compiled code. */
1257    
1258      if (c == OP_XCLASS) code += GET(code, 1);
1259    
1260      /* Handle capturing bracket */
1261    
1262      else if (c == OP_CBRA)
1263      {      {
1264      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1265      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1266      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1267      }      }
1268    
1269      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1270      a multi-byte character. The length in the table is a minimum, so we have to
1271      arrange to skip the extra bytes. */
1272    
1273    else    else
1274      {      {
1275      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1276  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1277      if (utf8) switch(c)      if (utf8) switch(c)
1278        {        {
1279        case OP_CHAR:        case OP_CHAR:
# Line 1031  for (;;) Line 1281  for (;;)
1281        case OP_EXACT:        case OP_EXACT:
1282        case OP_UPTO:        case OP_UPTO:
1283        case OP_MINUPTO:        case OP_MINUPTO:
1284          case OP_POSUPTO:
1285        case OP_STAR:        case OP_STAR:
1286        case OP_MINSTAR:        case OP_MINSTAR:
1287          case OP_POSSTAR:
1288        case OP_PLUS:        case OP_PLUS:
1289        case OP_MINPLUS:        case OP_MINPLUS:
1290          case OP_POSPLUS:
1291        case OP_QUERY:        case OP_QUERY:
1292        case OP_MINQUERY:        case OP_MINQUERY:
1293        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1294        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1295        break;        break;
1296        }        }
1297  #endif  #endif
# Line 1072  Returns:      pointer to the opcode for Line 1318  Returns:      pointer to the opcode for
1318  static const uschar *  static const uschar *
1319  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1320  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1321  for (;;)  for (;;)
1322    {    {
1323    register int c = *code;    register int c = *code;
1324    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1325    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1326    else if (c > OP_BRA)  
1327      {    /* XCLASS is used for classes that cannot be represented just by a bit
1328      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1329      }    the table is zero; the actual length is stored in the compiled code. */
1330    
1331      if (c == OP_XCLASS) code += GET(code, 1);
1332    
1333      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1334      that are followed by a character may be followed by a multi-byte character.
1335      The length in the table is a minimum, so we have to arrange to skip the extra
1336      bytes. */
1337    
1338    else    else
1339      {      {
1340      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1341  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1342      if (utf8) switch(c)      if (utf8) switch(c)
1343        {        {
1344        case OP_CHAR:        case OP_CHAR:
# Line 1103  for (;;) Line 1346  for (;;)
1346        case OP_EXACT:        case OP_EXACT:
1347        case OP_UPTO:        case OP_UPTO:
1348        case OP_MINUPTO:        case OP_MINUPTO:
1349          case OP_POSUPTO:
1350        case OP_STAR:        case OP_STAR:
1351        case OP_MINSTAR:        case OP_MINSTAR:
1352          case OP_POSSTAR:
1353        case OP_PLUS:        case OP_PLUS:
1354        case OP_MINPLUS:        case OP_MINPLUS:
1355          case OP_POSPLUS:
1356        case OP_QUERY:        case OP_QUERY:
1357        case OP_MINQUERY:        case OP_MINQUERY:
1358        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1359        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1360        break;        break;
1361        }        }
1362  #endif  #endif
# Line 1132  for (;;) Line 1371  for (;;)
1371  *************************************************/  *************************************************/
1372    
1373  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1374  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1375  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1376  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1377  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1378    struck an inner bracket whose current branch will already have been scanned.
1379    
1380  Arguments:  Arguments:
1381    code        points to start of search    code        points to start of search
# Line 1149  static BOOL Line 1389  static BOOL
1389  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1390  {  {
1391  register int c;  register int c;
1392  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1393       code < endcode;       code < endcode;
1394       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1395    {    {
# Line 1157  for (code = first_significant_code(code Line 1397  for (code = first_significant_code(code
1397    
1398    c = *code;    c = *code;
1399    
1400    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1401    
1402      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1403        {
1404        code += _pcre_OP_lengths[c];
1405        do code += GET(code, 1); while (*code == OP_ALT);
1406        c = *code;
1407        continue;
1408        }
1409    
1410      /* For other groups, scan the branches. */
1411    
1412      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1413      {      {
1414      BOOL empty_branch;      BOOL empty_branch;
1415      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1173  for (code = first_significant_code(code Line 1425  for (code = first_significant_code(code
1425        }        }
1426      while (*code == OP_ALT);      while (*code == OP_ALT);
1427      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1428      c = *code;      c = *code;
1429        continue;
1430      }      }
1431    
1432    else switch (c)    /* Handle the other opcodes */
1433    
1434      switch (c)
1435      {      {
1436      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1437    
# Line 1233  for (code = first_significant_code(code Line 1487  for (code = first_significant_code(code
1487      case OP_NOT:      case OP_NOT:
1488      case OP_PLUS:      case OP_PLUS:
1489      case OP_MINPLUS:      case OP_MINPLUS:
1490        case OP_POSPLUS:
1491      case OP_EXACT:      case OP_EXACT:
1492      case OP_NOTPLUS:      case OP_NOTPLUS:
1493      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1494        case OP_NOTPOSPLUS:
1495      case OP_NOTEXACT:      case OP_NOTEXACT:
1496      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1497      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1498        case OP_TYPEPOSPLUS:
1499      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1500      return FALSE;      return FALSE;
1501    
# Line 1250  for (code = first_significant_code(code Line 1507  for (code = first_significant_code(code
1507      case OP_ALT:      case OP_ALT:
1508      return TRUE;      return TRUE;
1509    
1510      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1511      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1512    
1513  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1514      case OP_STAR:      case OP_STAR:
1515      case OP_MINSTAR:      case OP_MINSTAR:
1516        case OP_POSSTAR:
1517      case OP_QUERY:      case OP_QUERY:
1518      case OP_MINQUERY:      case OP_MINQUERY:
1519        case OP_POSQUERY:
1520      case OP_UPTO:      case OP_UPTO:
1521      case OP_MINUPTO:      case OP_MINUPTO:
1522        case OP_POSUPTO:
1523      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1524      break;      break;
1525  #endif  #endif
# Line 1377  earlier groups that are outside the curr Line 1637  earlier groups that are outside the curr
1637  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1638  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1639  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1640  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1641  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1642    
1643    This function has been extended with the possibility of forward references for
1644    recursions and subroutine calls. It must also check the list of such references
1645    for the group we are dealing with. If it finds that one of the recursions in
1646    the current group is on this list, it adjusts the offset in the list, not the
1647    value in the reference (which is a group number).
1648    
1649  Arguments:  Arguments:
1650    group      points to the start of the group    group      points to the start of the group
1651    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1652    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1653    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1654      save_hwm   the hwm forward reference pointer at the start of the group
1655    
1656  Returns:     nothing  Returns:     nothing
1657  */  */
1658    
1659  static void  static void
1660  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1661      uschar *save_hwm)
1662  {  {
1663  uschar *ptr = group;  uschar *ptr = group;
1664  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1665    {    {
1666    int offset = GET(ptr, 1);    int offset;
1667    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1668    
1669      /* See if this recursion is on the forward reference list. If so, adjust the
1670      reference. */
1671    
1672      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1673        {
1674        offset = GET(hc, 0);
1675        if (cd->start_code + offset == ptr + 1)
1676          {
1677          PUT(hc, 0, offset + adjust);
1678          break;
1679          }
1680        }
1681    
1682      /* Otherwise, adjust the recursion offset if it's after the start of this
1683      group. */
1684    
1685      if (hc >= cd->hwm)
1686        {
1687        offset = GET(ptr, 1);
1688        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1689        }
1690    
1691    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1692    }    }
1693  }  }
# Line 1475  Yield:        TRUE when range returned; Line 1766  Yield:        TRUE when range returned;
1766  */  */
1767    
1768  static BOOL  static BOOL
1769  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1770      unsigned int *odptr)
1771  {  {
1772  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1773    
1774  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1775    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1776    
1777  if (c > d) return FALSE;  if (c > d) return FALSE;
1778    
# Line 1492  next = othercase + 1; Line 1781  next = othercase + 1;
1781    
1782  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1783    {    {
1784    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1785    next++;    next++;
1786    }    }
1787    
# Line 1506  return TRUE; Line 1793  return TRUE;
1793  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1794    
1795    
1796    
1797  /*************************************************  /*************************************************
1798  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1799  *************************************************/  *************************************************/
1800    
1801  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1802  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1803  bits.  sense to automatically possessify the repeated item.
1804    
1805  Arguments:  Arguments:
1806    optionsptr     pointer to the option bits    op_code       the repeated op code
1807    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1808    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1809    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1810    errorcodeptr   points to error code variable    ptr           next character in pattern
1811    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1812    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1813    
1814  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1815  */  */
1816    
1817  static BOOL  static BOOL
1818  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1819    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1820  {  {
1821  int repeat_type, op_type;  int next;
1822  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
1823  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
1824  int greedy_default, greedy_non_default;  
1825  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
1826  int zeroreqbyte, zerofirstbyte;    {
1827  int req_caseopt, reqvary, tempreqvary;    for (;;)
1828  int condcount = 0;      {
1829  int options = *optionsptr;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1830  int after_manual_callout = 0;      if (*ptr == '#')
1831  register int c;        {
1832  register uschar *code = *codeptr;        while (*(++ptr) != 0)
1833  uschar *tempcode;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1834  BOOL inescq = FALSE;        }
1835  BOOL groupsetfirstbyte = FALSE;      else break;
1836  const uschar *ptr = *ptrptr;      }
1837  const uschar *tempptr;    }
1838  uschar *previous = NULL;  
1839  uschar *previous_callout = NULL;  /* If the next item is one that we can handle, get its value. A non-negative
1840  uschar classbits[32];  value is a character, a negative value is an escape value. */
1841    
1842    if (*ptr == '\\')
1843      {
1844      int temperrorcode = 0;
1845      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1846      if (temperrorcode != 0) return FALSE;
1847      ptr++;    /* Point after the escape sequence */
1848      }
1849    
1850    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1851      {
1852  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1853  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1854  #endif  #endif
1855      next = *ptr++;
1856      }
1857    
1858  /* Set up the default and non-default settings for greediness */  else return FALSE;
1859    
1860  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
1861    
1862  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1863  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1864  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1865  find one.      {
1866        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867        if (*ptr == '#')
1868          {
1869          while (*(++ptr) != 0)
1870            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871          }
1872        else break;
1873        }
1874      }
1875    
1876  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1877    
1878  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1879      return FALSE;
1880    
1881  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
1882  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
1883  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
1884  case status of the value. This is used only for ASCII characters. */  utf8_char. */
1885    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
1886    
1887  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
1888    
1889  for (;; ptr++)  if (next >= 0) switch(op_code)
1890    {    {
1891    BOOL negate_class;    case OP_CHAR:
1892    #ifdef SUPPORT_UTF8
1893      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894    #endif
1895      return item != next;
1896    
1897      /* For CHARNC (caseless character) we must check the other case. If we have
1898      Unicode property support, we can use it to test the other case of
1899      high-valued characters. */
1900    
1901      case OP_CHARNC:
1902    #ifdef SUPPORT_UTF8
1903      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1904    #endif
1905      if (item == next) return FALSE;
1906    #ifdef SUPPORT_UTF8
1907      if (utf8)
1908        {
1909        unsigned int othercase;
1910        if (next < 128) othercase = cd->fcc[next]; else
1911    #ifdef SUPPORT_UCP
1912        othercase = _pcre_ucp_othercase((unsigned int)next);
1913    #else
1914        othercase = NOTACHAR;
1915    #endif
1916        return (unsigned int)item != othercase;
1917        }
1918      else
1919    #endif  /* SUPPORT_UTF8 */
1920      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1921    
1922      /* For OP_NOT, "item" must be a single-byte character. */
1923    
1924      case OP_NOT:
1925      if (next < 0) return FALSE;  /* Not a character */
1926      if (item == next) return TRUE;
1927      if ((options & PCRE_CASELESS) == 0) return FALSE;
1928    #ifdef SUPPORT_UTF8
1929      if (utf8)
1930        {
1931        unsigned int othercase;
1932        if (next < 128) othercase = cd->fcc[next]; else
1933    #ifdef SUPPORT_UCP
1934        othercase = _pcre_ucp_othercase(next);
1935    #else
1936        othercase = NOTACHAR;
1937    #endif
1938        return (unsigned int)item == othercase;
1939        }
1940      else
1941    #endif  /* SUPPORT_UTF8 */
1942      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1943    
1944      case OP_DIGIT:
1945      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1946    
1947      case OP_NOT_DIGIT:
1948      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1949    
1950      case OP_WHITESPACE:
1951      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1952    
1953      case OP_NOT_WHITESPACE:
1954      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1955    
1956      case OP_WORDCHAR:
1957      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1958    
1959      case OP_NOT_WORDCHAR:
1960      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1961    
1962      case OP_HSPACE:
1963      case OP_NOT_HSPACE:
1964      switch(next)
1965        {
1966        case 0x09:
1967        case 0x20:
1968        case 0xa0:
1969        case 0x1680:
1970        case 0x180e:
1971        case 0x2000:
1972        case 0x2001:
1973        case 0x2002:
1974        case 0x2003:
1975        case 0x2004:
1976        case 0x2005:
1977        case 0x2006:
1978        case 0x2007:
1979        case 0x2008:
1980        case 0x2009:
1981        case 0x200A:
1982        case 0x202f:
1983        case 0x205f:
1984        case 0x3000:
1985        return op_code != OP_HSPACE;
1986        default:
1987        return op_code == OP_HSPACE;
1988        }
1989    
1990      case OP_VSPACE:
1991      case OP_NOT_VSPACE:
1992      switch(next)
1993        {
1994        case 0x0a:
1995        case 0x0b:
1996        case 0x0c:
1997        case 0x0d:
1998        case 0x85:
1999        case 0x2028:
2000        case 0x2029:
2001        return op_code != OP_VSPACE;
2002        default:
2003        return op_code == OP_VSPACE;
2004        }
2005    
2006      default:
2007      return FALSE;
2008      }
2009    
2010    
2011    /* Handle the case when the next item is \d, \s, etc. */
2012    
2013    switch(op_code)
2014      {
2015      case OP_CHAR:
2016      case OP_CHARNC:
2017    #ifdef SUPPORT_UTF8
2018      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2019    #endif
2020      switch(-next)
2021        {
2022        case ESC_d:
2023        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2024    
2025        case ESC_D:
2026        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2027    
2028        case ESC_s:
2029        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2030    
2031        case ESC_S:
2032        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2033    
2034        case ESC_w:
2035        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2036    
2037        case ESC_W:
2038        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2039    
2040        case ESC_h:
2041        case ESC_H:
2042        switch(item)
2043          {
2044          case 0x09:
2045          case 0x20:
2046          case 0xa0:
2047          case 0x1680:
2048          case 0x180e:
2049          case 0x2000:
2050          case 0x2001:
2051          case 0x2002:
2052          case 0x2003:
2053          case 0x2004:
2054          case 0x2005:
2055          case 0x2006:
2056          case 0x2007:
2057          case 0x2008:
2058          case 0x2009:
2059          case 0x200A:
2060          case 0x202f:
2061          case 0x205f:
2062          case 0x3000:
2063          return -next != ESC_h;
2064          default:
2065          return -next == ESC_h;
2066          }
2067    
2068        case ESC_v:
2069        case ESC_V:
2070        switch(item)
2071          {
2072          case 0x0a:
2073          case 0x0b:
2074          case 0x0c:
2075          case 0x0d:
2076          case 0x85:
2077          case 0x2028:
2078          case 0x2029:
2079          return -next != ESC_v;
2080          default:
2081          return -next == ESC_v;
2082          }
2083    
2084        default:
2085        return FALSE;
2086        }
2087    
2088      case OP_DIGIT:
2089      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2090             next == -ESC_h || next == -ESC_v;
2091    
2092      case OP_NOT_DIGIT:
2093      return next == -ESC_d;
2094    
2095      case OP_WHITESPACE:
2096      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2097    
2098      case OP_NOT_WHITESPACE:
2099      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2100    
2101      case OP_HSPACE:
2102      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2103    
2104      case OP_NOT_HSPACE:
2105      return next == -ESC_h;
2106    
2107      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2108      case OP_VSPACE:
2109      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2110    
2111      case OP_NOT_VSPACE:
2112      return next == -ESC_v;
2113    
2114      case OP_WORDCHAR:
2115      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2116    
2117      case OP_NOT_WORDCHAR:
2118      return next == -ESC_w || next == -ESC_d;
2119    
2120      default:
2121      return FALSE;
2122      }
2123    
2124    /* Control does not reach here */
2125    }
2126    
2127    
2128    
2129    /*************************************************
2130    *           Compile one branch                   *
2131    *************************************************/
2132    
2133    /* Scan the pattern, compiling it into a vector. If the options are
2134    changed during the branch, the pointer is used to change the external options
2135    bits. This function is used during the pre-compile phase when we are trying
2136    to find out the amount of memory needed, as well as during the real compile
2137    phase. The value of lengthptr distinguishes the two phases.
2138    
2139    Arguments:
2140      optionsptr     pointer to the option bits
2141      codeptr        points to the pointer to the current code point
2142      ptrptr         points to the current pattern pointer
2143      errorcodeptr   points to error code variable
2144      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2145      reqbyteptr     set to the last literal character required, else < 0
2146      bcptr          points to current branch chain
2147      cd             contains pointers to tables etc.
2148      lengthptr      NULL during the real compile phase
2149                     points to length accumulator during pre-compile phase
2150    
2151    Returns:         TRUE on success
2152                     FALSE, with *errorcodeptr set non-zero on error
2153    */
2154    
2155    static BOOL
2156    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2157      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2158      compile_data *cd, int *lengthptr)
2159    {
2160    int repeat_type, op_type;
2161    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2162    int bravalue = 0;
2163    int greedy_default, greedy_non_default;
2164    int firstbyte, reqbyte;
2165    int zeroreqbyte, zerofirstbyte;
2166    int req_caseopt, reqvary, tempreqvary;
2167    int options = *optionsptr;
2168    int after_manual_callout = 0;
2169    int length_prevgroup = 0;
2170    register int c;
2171    register uschar *code = *codeptr;
2172    uschar *last_code = code;
2173    uschar *orig_code = code;
2174    uschar *tempcode;
2175    BOOL inescq = FALSE;
2176    BOOL groupsetfirstbyte = FALSE;
2177    const uschar *ptr = *ptrptr;
2178    const uschar *tempptr;
2179    uschar *previous = NULL;
2180    uschar *previous_callout = NULL;
2181    uschar *save_hwm = NULL;
2182    uschar classbits[32];
2183    
2184    #ifdef SUPPORT_UTF8
2185    BOOL class_utf8;
2186    BOOL utf8 = (options & PCRE_UTF8) != 0;
2187    uschar *class_utf8data;
2188    uschar utf8_char[6];
2189    #else
2190    BOOL utf8 = FALSE;
2191    uschar *utf8_char = NULL;
2192    #endif
2193    
2194    #ifdef DEBUG
2195    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2196    #endif
2197    
2198    /* Set up the default and non-default settings for greediness */
2199    
2200    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2201    greedy_non_default = greedy_default ^ 1;
2202    
2203    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2204    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2205    matches a non-fixed char first char; reqbyte just remains unset if we never
2206    find one.
2207    
2208    When we hit a repeat whose minimum is zero, we may have to adjust these values
2209    to take the zero repeat into account. This is implemented by setting them to
2210    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2211    item types that can be repeated set these backoff variables appropriately. */
2212    
2213    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2214    
2215    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2216    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2217    value > 255. It is added into the firstbyte or reqbyte variables to record the
2218    case status of the value. This is used only for ASCII characters. */
2219    
2220    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2221    
2222    /* Switch on next character until the end of the branch */
2223    
2224    for (;; ptr++)
2225      {
2226      BOOL negate_class;
2227    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2228    BOOL is_quantifier;    BOOL is_quantifier;
2229      BOOL is_recurse;
2230      BOOL reset_bracount;
2231    int class_charcount;    int class_charcount;
2232    int class_lastchar;    int class_lastchar;
2233    int newoptions;    int newoptions;
2234    int recno;    int recno;
2235      int refsign;
2236    int skipbytes;    int skipbytes;
2237    int subreqbyte;    int subreqbyte;
2238    int subfirstbyte;    int subfirstbyte;
2239      int terminator;
2240    int mclength;    int mclength;
2241    uschar mcbuffer[8];    uschar mcbuffer[8];
2242    
2243    /* Next byte in the pattern */    /* Get next byte in the pattern */
2244    
2245    c = *ptr;    c = *ptr;
2246    
2247      /* If we are in the pre-compile phase, accumulate the length used for the
2248      previous cycle of this loop. */
2249    
2250      if (lengthptr != NULL)
2251        {
2252    #ifdef DEBUG
2253        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2254    #endif
2255        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2256          {
2257          *errorcodeptr = ERR52;
2258          goto FAILED;
2259          }
2260    
2261        /* There is at least one situation where code goes backwards: this is the
2262        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2263        the class is simply eliminated. However, it is created first, so we have to
2264        allow memory for it. Therefore, don't ever reduce the length at this point.
2265        */
2266    
2267        if (code < last_code) code = last_code;
2268    
2269        /* Paranoid check for integer overflow */
2270    
2271        if (OFLOW_MAX - *lengthptr < code - last_code)
2272          {
2273          *errorcodeptr = ERR20;
2274          goto FAILED;
2275          }
2276    
2277        *lengthptr += code - last_code;
2278        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2279    
2280        /* If "previous" is set and it is not at the start of the work space, move
2281        it back to there, in order to avoid filling up the work space. Otherwise,
2282        if "previous" is NULL, reset the current code pointer to the start. */
2283    
2284        if (previous != NULL)
2285          {
2286          if (previous > orig_code)
2287            {
2288            memmove(orig_code, previous, code - previous);
2289            code -= previous - orig_code;
2290            previous = orig_code;
2291            }
2292          }
2293        else code = orig_code;
2294    
2295        /* Remember where this code item starts so we can pick up the length
2296        next time round. */
2297    
2298        last_code = code;
2299        }
2300    
2301      /* In the real compile phase, just check the workspace used by the forward
2302      reference list. */
2303    
2304      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2305        {
2306        *errorcodeptr = ERR52;
2307        goto FAILED;
2308        }
2309    
2310    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2311    
2312    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1623  for (;; ptr++) Line 2321  for (;; ptr++)
2321        {        {
2322        if (previous_callout != NULL)        if (previous_callout != NULL)
2323          {          {
2324          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2325              complete_callout(previous_callout, ptr, cd);
2326          previous_callout = NULL;          previous_callout = NULL;
2327          }          }
2328        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1644  for (;; ptr++) Line 2343  for (;; ptr++)
2343    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2344         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2345      {      {
2346      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2347          complete_callout(previous_callout, ptr, cd);
2348      previous_callout = NULL;      previous_callout = NULL;
2349      }      }
2350    
# Line 1655  for (;; ptr++) Line 2355  for (;; ptr++)
2355      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2356      if (c == '#')      if (c == '#')
2357        {        {
2358        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2359        on the Macintosh. */          {
2360        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2361        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2362          if (*ptr != 0) continue;
2363    
2364          /* Else fall through to handle end of string */
2365          c = 0;
2366        }        }
2367      }      }
2368    
# Line 1672  for (;; ptr++) Line 2376  for (;; ptr++)
2376    
2377    switch(c)    switch(c)
2378      {      {
2379      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2380        case 0:                        /* The branch terminates at string end */
2381      case 0:      case '|':                      /* or | or ) */
     case '|':  
2382      case ')':      case ')':
2383      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2384      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2385      *codeptr = code;      *codeptr = code;
2386      *ptrptr = ptr;      *ptrptr = ptr;
2387        if (lengthptr != NULL)
2388          {
2389          if (OFLOW_MAX - *lengthptr < code - last_code)
2390            {
2391            *errorcodeptr = ERR20;
2392            goto FAILED;
2393            }
2394          *lengthptr += code - last_code;   /* To include callout length */
2395          DPRINTF((">> end branch\n"));
2396          }
2397      return TRUE;      return TRUE;
2398    
2399    
2400        /* ===================================================================*/
2401      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2402      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2403    
# Line 1711  for (;; ptr++) Line 2426  for (;; ptr++)
2426      *code++ = OP_ANY;      *code++ = OP_ANY;
2427      break;      break;
2428    
2429      /* Character classes. If the included characters are all < 255 in value, we  
2430      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2431      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2432      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2433      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2434        map as usual, then invert it at the end. However, we use a different opcode
2435        so that data characters > 255 can be handled correctly.
2436    
2437      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2438      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1736  for (;; ptr++) Line 2453  for (;; ptr++)
2453        goto FAILED;        goto FAILED;
2454        }        }
2455    
2456      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2457        if the first few characters (either before or after ^) are \Q\E or \E we
2458        skip them too. This makes for compatibility with Perl. */
2459    
2460      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2461        for (;;)
2462        {        {
       negate_class = TRUE;  
2463        c = *(++ptr);        c = *(++ptr);
2464        }        if (c == '\\')
2465      else          {
2466        {          if (ptr[1] == 'E') ptr++;
2467        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2468                else break;
2469            }
2470          else if (!negate_class && c == '^')
2471            negate_class = TRUE;
2472          else break;
2473        }        }
2474    
2475      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2476      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2477      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2478    
2479      class_charcount = 0;      class_charcount = 0;
2480      class_lastchar = -1;      class_lastchar = -1;
2481    
2482        /* Initialize the 32-char bit map to all zeros. We build the map in a
2483        temporary bit of memory, in case the class contains only 1 character (less
2484        than 256), because in that case the compiled code doesn't use the bit map.
2485        */
2486    
2487        memset(classbits, 0, 32 * sizeof(uschar));
2488    
2489  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2490      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2491      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2492  #endif  #endif
2493    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2494      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2495      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2496      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2497    
2498      do      if (c != 0) do
2499        {        {
2500          const uschar *oldptr;
2501    
2502  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2503        if (utf8 && c > 127)        if (utf8 && c > 127)
2504          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1786  for (;; ptr++) Line 2510  for (;; ptr++)
2510    
2511        if (inescq)        if (inescq)
2512          {          {
2513          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2514            {            {
2515            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2516            ptr++;            ptr++;                            /* Skip the 'E' */
2517            continue;            continue;                         /* Carry on with next */
2518            }            }
2519          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2520          }          }
2521    
2522        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1806  for (;; ptr++) Line 2530  for (;; ptr++)
2530            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2531          {          {
2532          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2533          int posix_class, i;          int posix_class, taboffset, tabopt;
2534          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2535            uschar pbits[32];
2536    
2537          if (ptr[1] != ':')          if (ptr[1] != ':')
2538            {            {
# Line 1836  for (;; ptr++) Line 2561  for (;; ptr++)
2561          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2562            posix_class = 0;            posix_class = 0;
2563    
2564          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2565          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2566          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2567          white space chars afterwards. */          result into the bit map that is being built. */
2568    
2569          posix_class *= 3;          posix_class *= 3;
2570          for (i = 0; i < 3; i++)  
2571            /* Copy in the first table (always present) */
2572    
2573            memcpy(pbits, cbits + posix_class_maps[posix_class],
2574              32 * sizeof(uschar));
2575    
2576            /* If there is a second table, add or remove it as required. */
2577    
2578            taboffset = posix_class_maps[posix_class + 1];
2579            tabopt = posix_class_maps[posix_class + 2];
2580    
2581            if (taboffset >= 0)
2582            {            {
2583            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2584            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2585            else            else
2586              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2587            }            }
2588    
2589            /* Now see if we need to remove any special characters. An option
2590            value of 1 removes vertical space and 2 removes underscore. */
2591    
2592            if (tabopt < 0) tabopt = -tabopt;
2593            if (tabopt == 1) pbits[1] &= ~0x3c;
2594              else if (tabopt == 2) pbits[11] &= 0x7f;
2595    
2596            /* Add the POSIX table or its complement into the main table that is
2597            being built and we are done. */
2598    
2599            if (local_negate)
2600              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2601            else
2602              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2603    
2604          ptr = tempptr + 1;          ptr = tempptr + 1;
2605          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2606          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2607          }          }
2608    
2609        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2610        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2611        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2612        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2613        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2614        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2615    
2616        if (c == '\\')        if (c == '\\')
2617          {          {
2618          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2619            if (*errorcodeptr != 0) goto FAILED;
2620    
2621          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
2622          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2623            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2624          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2625            {            {
2626            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1895  for (;; ptr++) Line 2635  for (;; ptr++)
2635            {            {
2636            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2637            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2638            switch (-c)  
2639              /* Save time by not doing this in the pre-compile phase. */
2640    
2641              if (lengthptr == NULL) switch (-c)
2642              {              {
2643              case ESC_d:              case ESC_d:
2644              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1923  for (;; ptr++) Line 2666  for (;; ptr++)
2666              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2667              continue;              continue;
2668    
2669  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
             case ESC_p:  
             case ESC_P:  
               {  
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
               class_utf8 = TRUE;  
               *class_utf8data++ = ((-c == ESC_p) != negated)?  
                 XCL_PROP : XCL_NOTPROP;  
               *class_utf8data++ = property;  
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2670              continue;              continue;
 #endif  
   
             /* Unrecognized escapes are faulted if PCRE is running in its  
             strict mode. By default, for compatibility with Perl, they are  
             treated as literals. */  
2671    
2672              default:              default:    /* Not recognized; fall through */
2673              if ((options & PCRE_EXTRA) != 0)              break;      /* Need "default" setting to stop compiler warning. */
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2674              }              }
           }  
   
         /* Fall through if we have a single character (c >= 0). This may be  
         > 256 in UTF-8 mode. */  
2675    
2676          }   /* End of backslash handling */            /* In the pre-compile phase, just do the recognition. */
2677    
2678        /* A single character may be followed by '-' to form a range. However,            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2679        Perl does not permit ']' to be the end of the range. A '-' character                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
       here is treated as a literal. */  
2680    
2681        if (ptr[1] == '-' && ptr[2] != ']')            /* We need to deal with \H, \h, \V, and \v in both phases because
2682          {            they use extra memory. */
         int d;  
         ptr += 2;  
2683    
2684              if (-c == ESC_h)
2685                {
2686                SETBIT(classbits, 0x09); /* HT */
2687                SETBIT(classbits, 0x20); /* SPACE */
2688                SETBIT(classbits, 0xa0); /* NBSP */
2689  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2690          if (utf8)              if (utf8)
2691            {                           /* Braces are required because the */                {
2692            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */                class_utf8 = TRUE;
2693            }                *class_utf8data++ = XCL_SINGLE;
2694          else                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2695                  *class_utf8data++ = XCL_SINGLE;
2696                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2697                  *class_utf8data++ = XCL_RANGE;
2698                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2699                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2700                  *class_utf8data++ = XCL_SINGLE;
2701                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2702                  *class_utf8data++ = XCL_SINGLE;
2703                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2704                  *class_utf8data++ = XCL_SINGLE;
2705                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2706                  }
2707    #endif
2708                continue;
2709                }
2710    
2711              if (-c == ESC_H)
2712                {
2713                for (c = 0; c < 32; c++)
2714                  {
2715                  int x = 0xff;
2716                  switch (c)
2717                    {
2718                    case 0x09/8: x ^= 1 << (0x09%8); break;
2719                    case 0x20/8: x ^= 1 << (0x20%8); break;
2720                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2721                    default: break;
2722                    }
2723                  classbits[c] |= x;
2724                  }
2725    
2726    #ifdef SUPPORT_UTF8
2727                if (utf8)
2728                  {
2729                  class_utf8 = TRUE;
2730                  *class_utf8data++ = XCL_RANGE;
2731                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2732                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2733                  *class_utf8data++ = XCL_RANGE;
2734                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2735                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2736                  *class_utf8data++ = XCL_RANGE;
2737                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2738                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2739                  *class_utf8data++ = XCL_RANGE;
2740                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2741                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2742                  *class_utf8data++ = XCL_RANGE;
2743                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2744                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2745                  *class_utf8data++ = XCL_RANGE;
2746                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2747                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2748                  *class_utf8data++ = XCL_RANGE;
2749                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2750                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2751                  }
2752    #endif
2753                continue;
2754                }
2755    
2756              if (-c == ESC_v)
2757                {
2758                SETBIT(classbits, 0x0a); /* LF */
2759                SETBIT(classbits, 0x0b); /* VT */
2760                SETBIT(classbits, 0x0c); /* FF */
2761                SETBIT(classbits, 0x0d); /* CR */
2762                SETBIT(classbits, 0x85); /* NEL */
2763    #ifdef SUPPORT_UTF8
2764                if (utf8)
2765                  {
2766                  class_utf8 = TRUE;
2767                  *class_utf8data++ = XCL_RANGE;
2768                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2769                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2770                  }
2771    #endif
2772                continue;
2773                }
2774    
2775              if (-c == ESC_V)
2776                {
2777                for (c = 0; c < 32; c++)
2778                  {
2779                  int x = 0xff;
2780                  switch (c)
2781                    {
2782                    case 0x0a/8: x ^= 1 << (0x0a%8);
2783                                 x ^= 1 << (0x0b%8);
2784                                 x ^= 1 << (0x0c%8);
2785                                 x ^= 1 << (0x0d%8);
2786                                 break;
2787                    case 0x85/8: x ^= 1 << (0x85%8); break;
2788                    default: break;
2789                    }
2790                  classbits[c] |= x;
2791                  }
2792    
2793    #ifdef SUPPORT_UTF8
2794                if (utf8)
2795                  {
2796                  class_utf8 = TRUE;
2797                  *class_utf8data++ = XCL_RANGE;
2798                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2799                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2800                  *class_utf8data++ = XCL_RANGE;
2801                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2802                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2803                  }
2804    #endif
2805                continue;
2806                }
2807    
2808              /* We need to deal with \P and \p in both phases. */
2809    
2810    #ifdef SUPPORT_UCP
2811              if (-c == ESC_p || -c == ESC_P)
2812                {
2813                BOOL negated;
2814                int pdata;
2815                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2816                if (ptype < 0) goto FAILED;
2817                class_utf8 = TRUE;
2818                *class_utf8data++ = ((-c == ESC_p) != negated)?
2819                  XCL_PROP : XCL_NOTPROP;
2820                *class_utf8data++ = ptype;
2821                *class_utf8data++ = pdata;
2822                class_charcount -= 2;   /* Not a < 256 character */
2823                continue;
2824                }
2825    #endif
2826              /* Unrecognized escapes are faulted if PCRE is running in its
2827              strict mode. By default, for compatibility with Perl, they are
2828              treated as literals. */
2829    
2830              if ((options & PCRE_EXTRA) != 0)
2831                {
2832                *errorcodeptr = ERR7;
2833                goto FAILED;
2834                }
2835    
2836              class_charcount -= 2;  /* Undo the default count from above */
2837              c = *ptr;              /* Get the final character and fall through */
2838              }
2839    
2840            /* Fall through if we have a single character (c >= 0). This may be
2841            greater than 256 in UTF-8 mode. */
2842    
2843            }   /* End of backslash handling */
2844    
2845          /* A single character may be followed by '-' to form a range. However,
2846          Perl does not permit ']' to be the end of the range. A '-' character
2847          at the end is treated as a literal. Perl ignores orphaned \E sequences
2848          entirely. The code for handling \Q and \E is messy. */
2849    
2850          CHECK_RANGE:
2851          while (ptr[1] == '\\' && ptr[2] == 'E')
2852            {
2853            inescq = FALSE;
2854            ptr += 2;
2855            }
2856    
2857          oldptr = ptr;
2858    
2859          if (!inescq && ptr[1] == '-')
2860            {
2861            int d;
2862            ptr += 2;
2863            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2864    
2865            /* If we hit \Q (not followed by \E) at this point, go into escaped
2866            mode. */
2867    
2868            while (*ptr == '\\' && ptr[1] == 'Q')
2869              {
2870              ptr += 2;
2871              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2872              inescq = TRUE;
2873              break;
2874              }
2875    
2876            if (*ptr == 0 || (!inescq && *ptr == ']'))
2877              {
2878              ptr = oldptr;
2879              goto LONE_SINGLE_CHARACTER;
2880              }
2881    
2882    #ifdef SUPPORT_UTF8
2883            if (utf8)
2884              {                           /* Braces are required because the */
2885              GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
2886              }
2887            else
2888  #endif  #endif
2889          d = *ptr;  /* Not UTF-8 mode */          d = *ptr;  /* Not UTF-8 mode */
2890    
# Line 1981  for (;; ptr++) Line 2892  for (;; ptr++)
2892          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2893          in such circumstances. */          in such circumstances. */
2894    
2895          if (d == '\\')          if (!inescq && d == '\\')
2896            {            {
2897            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2898            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2899    
2900            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2901            was literal */            special means the '-' was literal */
2902    
2903            if (d < 0)            if (d < 0)
2904              {              {
2905              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2906              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2907                else if (d == -ESC_R) d = 'R'; else
2908                {                {
2909                ptr = oldptr - 2;                ptr = oldptr;
2910                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2911                }                }
2912              }              }
2913            }            }
2914    
2915          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2916          the pre-pass. Optimize one-character ranges */          one-character ranges */
2917    
2918            if (d < c)
2919              {
2920              *errorcodeptr = ERR8;
2921              goto FAILED;
2922              }
2923    
2924          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2925    
# Line 2022  for (;; ptr++) Line 2940  for (;; ptr++)
2940  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2941            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2942              {              {
2943              int occ, ocd;              unsigned int occ, ocd;
2944              int cc = c;              unsigned int cc = c;
2945              int origd = d;              unsigned int origd = d;
2946              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2947                {                {
2948                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2949                      ocd <= (unsigned int)d)
2950                    continue;                          /* Skip embedded ranges */
2951    
2952                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2953                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2954                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2955                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2956                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2957                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2958                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2959                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2960                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2961                  d = ocd;                  d = ocd;
2962                  continue;                  continue;
# Line 2082  for (;; ptr++) Line 3004  for (;; ptr++)
3004          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3005          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3006    
3007          for (; c <= d; c++)          class_charcount += d - c + 1;
3008            class_lastchar = d;
3009    
3010            /* We can save a bit of time by skipping this in the pre-compile. */
3011    
3012            if (lengthptr == NULL) for (; c <= d; c++)
3013            {            {
3014            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3015            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2090  for (;; ptr++) Line 3017  for (;; ptr++)
3017              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3018              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3019              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3020            }            }
3021    
3022          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2115  for (;; ptr++) Line 3040  for (;; ptr++)
3040  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3041          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3042            {            {
3043            int chartype;            unsigned int othercase;
3044            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3045              {              {
3046              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3047              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2143  for (;; ptr++) Line 3066  for (;; ptr++)
3066          }          }
3067        }        }
3068    
3069      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3070      loop. This "while" is the end of the "do" above. */  
3071        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3072    
3073      while ((c = *(++ptr)) != ']' || inescq);      if (c == 0)                          /* Missing terminating ']' */
3074          {
3075          *errorcodeptr = ERR6;
3076          goto FAILED;
3077          }
3078    
3079      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3080      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2210  for (;; ptr++) Line 3138  for (;; ptr++)
3138    
3139      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3140      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3141      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3142    
3143  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3144      if (class_utf8)      if (class_utf8)
# Line 2220  for (;; ptr++) Line 3148  for (;; ptr++)
3148        code += LINK_SIZE;        code += LINK_SIZE;
3149        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3150    
3151        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3152        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3153    
3154        if (class_charcount > 0)        if (class_charcount > 0)
3155          {          {
3156          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3157            memmove(code + 32, code, class_utf8data - code);
3158          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3159          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3160          }          }
3161          else code = class_utf8data;
3162    
3163        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3164    
# Line 2254  for (;; ptr++) Line 3175  for (;; ptr++)
3175      if (negate_class)      if (negate_class)
3176        {        {
3177        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3178        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3179            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3180        }        }
3181      else      else
3182        {        {
# Line 2264  for (;; ptr++) Line 3186  for (;; ptr++)
3186      code += 32;      code += 32;
3187      break;      break;
3188    
3189    
3190        /* ===================================================================*/
3191      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3192      has been tested above. */      has been tested above. */
3193    
# Line 2331  for (;; ptr++) Line 3255  for (;; ptr++)
3255        }        }
3256      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3257    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3258      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3259      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3260      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2378  for (;; ptr++) Line 3288  for (;; ptr++)
3288          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3289          }          }
3290    
3291          /* If the repetition is unlimited, it pays to see if the next thing on
3292          the line is something that cannot possibly match this character. If so,
3293          automatically possessifying this item gains some performance in the case
3294          where the match fails. */
3295    
3296          if (!possessive_quantifier &&
3297              repeat_max < 0 &&
3298              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3299                options, cd))
3300            {
3301            repeat_type = 0;    /* Force greedy */
3302            possessive_quantifier = TRUE;
3303            }
3304    
3305        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3306        }        }
3307    
3308      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3309      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3310      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3311      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3312        currently used only for single-byte chars. */
3313    
3314      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3315        {        {
3316        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3317        c = previous[1];        c = previous[1];
3318          if (!possessive_quantifier &&
3319              repeat_max < 0 &&
3320              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3321            {
3322            repeat_type = 0;    /* Force greedy */
3323            possessive_quantifier = TRUE;
3324            }
3325        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3326        }        }
3327    
# Line 2403  for (;; ptr++) Line 3335  for (;; ptr++)
3335      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3336        {        {
3337        uschar *oldcode;        uschar *oldcode;
3338        int prop_type;        int prop_type, prop_value;
3339        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3340        c = *previous;        c = *previous;
3341    
3342          if (!possessive_quantifier &&
3343              repeat_max < 0 &&
3344              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3345            {
3346            repeat_type = 0;    /* Force greedy */
3347            possessive_quantifier = TRUE;
3348            }
3349    
3350        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3351        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3352          previous[1] : -1;          {
3353            prop_type = previous[1];
3354            prop_value = previous[2];
3355            }
3356          else prop_type = prop_value = -1;
3357    
3358        oldcode = code;        oldcode = code;
3359        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2443  for (;; ptr++) Line 3387  for (;; ptr++)
3387          }          }
3388    
3389        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3390        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3391        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3392        one less than the maximum. */        one less than the maximum. */
3393    
# Line 2470  for (;; ptr++) Line 3414  for (;; ptr++)
3414    
3415          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3416          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3417          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3418          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3419          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3420    
# Line 2486  for (;; ptr++) Line 3430  for (;; ptr++)
3430  #endif  #endif
3431              {              {
3432              *code++ = c;              *code++ = c;
3433              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3434                  {
3435                  *code++ = prop_type;
3436                  *code++ = prop_value;
3437                  }
3438              }              }
3439            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3440            }            }
3441    
3442          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3443          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3444            UPTO is just for 1 instance, we can use QUERY instead. */
3445    
3446          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3447            {            {
# Line 2505  for (;; ptr++) Line 3454  for (;; ptr++)
3454            else            else
3455  #endif  #endif
3456            *code++ = c;            *code++ = c;
3457            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3458                {
3459                *code++ = prop_type;
3460                *code++ = prop_value;
3461                }
3462            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3463            *code++ = OP_UPTO + repeat_type;  
3464            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3465                {
3466                *code++ = OP_QUERY + repeat_type;
3467                }
3468              else
3469                {
3470                *code++ = OP_UPTO + repeat_type;
3471                PUT2INC(code, 0, repeat_max);
3472                }
3473            }            }
3474          }          }
3475    
# Line 2524  for (;; ptr++) Line 3485  for (;; ptr++)
3485  #endif  #endif
3486        *code++ = c;        *code++ = c;
3487    
3488        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3489        defines the required property. */        define the required property. */
3490    
3491  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3492        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3493            {
3494            *code++ = prop_type;
3495            *code++ = prop_value;
3496            }
3497  #endif  #endif
3498        }        }
3499    
# Line 2571  for (;; ptr++) Line 3536  for (;; ptr++)
3536      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3537      cases. */      cases. */
3538    
3539      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3540               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3541        {        {
3542        register int i;        register int i;
3543        int ketoffset = 0;        int ketoffset = 0;
3544        int len = code - previous;        int len = code - previous;
3545        uschar *bralink = NULL;        uschar *bralink = NULL;
3546    
3547          /* Repeating a DEFINE group is pointless */
3548    
3549          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3550            {
3551            *errorcodeptr = ERR55;
3552            goto FAILED;
3553            }
3554    
3555        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3556        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3557        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2613  for (;; ptr++) Line 3586  for (;; ptr++)
3586          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3587          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3588          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3589          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3590          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3591            doing this. */
3592    
3593          if (repeat_max <= 1)          if (repeat_max <= 1)
3594            {            {
3595            *code = OP_END;            *code = OP_END;
3596            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3597            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3598            code++;            code++;
3599            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2637  for (;; ptr++) Line 3611  for (;; ptr++)
3611            {            {
3612            int offset;            int offset;
3613            *code = OP_END;            *code = OP_END;
3614            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3615            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3616            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3617            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3631  for (;; ptr++)
3631        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3632        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3633        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3634        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3635          forward reference subroutine calls in the group, there will be entries on
3636          the workspace list; replicate these with an appropriate increment. */
3637    
3638        else        else
3639          {          {
3640          if (repeat_min > 1)          if (repeat_min > 1)
3641            {            {
3642            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3643            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3644              potential integer overflow. */
3645    
3646              if (lengthptr != NULL)
3647                {
3648                int delta = (repeat_min - 1)*length_prevgroup;
3649                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3650                                                                (double)INT_MAX ||
3651                    OFLOW_MAX - *lengthptr < delta)
3652                  {
3653                  *errorcodeptr = ERR20;
3654                  goto FAILED;
3655                  }
3656                *lengthptr += delta;
3657                }
3658    
3659              /* This is compiling for real */
3660    
3661              else
3662              {              {
3663              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3664              code += len;              for (i = 1; i < repeat_min; i++)
3665                  {
3666                  uschar *hc;
3667                  uschar *this_hwm = cd->hwm;
3668                  memcpy(code, previous, len);
3669                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3670                    {
3671                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3672                    cd->hwm += LINK_SIZE;
3673                    }
3674                  save_hwm = this_hwm;
3675                  code += len;
3676                  }
3677              }              }
3678            }            }
3679    
3680          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3681          }          }
3682    
# Line 2677  for (;; ptr++) Line 3684  for (;; ptr++)
3684        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3685        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3686        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3687        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3688          replicate entries on the forward reference list. */
3689    
3690        if (repeat_max >= 0)        if (repeat_max >= 0)
3691          {          {
3692          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3693            just adjust the length as if we had. For each repetition we must add 1
3694            to the length for BRAZERO and for all but the last repetition we must
3695            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3696            paranoid checks to avoid integer overflow. */
3697    
3698            if (lengthptr != NULL && repeat_max > 0)
3699              {
3700              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3701                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3702              if ((double)repeat_max *
3703                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3704                      > (double)INT_MAX ||
3705                  OFLOW_MAX - *lengthptr < delta)
3706                {
3707                *errorcodeptr = ERR20;
3708                goto FAILED;
3709                }
3710              *lengthptr += delta;
3711              }
3712    
3713            /* This is compiling for real */
3714    
3715            else for (i = repeat_max - 1; i >= 0; i--)
3716            {            {
3717              uschar *hc;
3718              uschar *this_hwm = cd->hwm;
3719    
3720            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3721    
3722            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2698  for (;; ptr++) Line 3732  for (;; ptr++)
3732              }              }
3733    
3734            memcpy(code, previous, len);            memcpy(code, previous, len);
3735              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3736                {
3737                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3738                cd->hwm += LINK_SIZE;
3739                }
3740              save_hwm = this_hwm;
3741            code += len;            code += len;
3742            }            }
3743    
# Line 2720  for (;; ptr++) Line 3760  for (;; ptr++)
3760        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3761        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3762        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3763        correct offset was computed above. */        correct offset was computed above.
3764    
3765        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
3766          this group is a non-atomic one that could match an empty string. If so,
3767          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3768          that runtime checking can be done. [This check is also applied to
3769          atomic groups at runtime, but in a different way.] */
3770    
3771          else
3772            {
3773            uschar *ketcode = code - ketoffset;
3774            uschar *bracode = ketcode - GET(ketcode, 1);
3775            *ketcode = OP_KETRMAX + repeat_type;
3776            if (lengthptr == NULL && *bracode != OP_ONCE)
3777              {
3778              uschar *scode = bracode;
3779              do
3780                {
3781                if (could_be_empty_branch(scode, ketcode, utf8))
3782                  {
3783                  *bracode += OP_SBRA - OP_BRA;
3784                  break;
3785                  }
3786                scode += GET(scode, 1);
3787                }
3788              while (*scode == OP_ALT);
3789              }
3790            }
3791        }        }
3792    
3793      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2733  for (;; ptr++) Line 3798  for (;; ptr++)
3798        goto FAILED;        goto FAILED;
3799        }        }
3800    
3801      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3802      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3803      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3804      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3805      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3806        but the special opcodes can optimize it a bit. The repeated item starts at
3807        tempcode, not at previous, which might be the first part of a string whose
3808        (former) last char we repeated.
3809    
3810        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3811        an 'upto' may follow. We skip over an 'exact' item, and then test the
3812        length of what remains before proceeding. */
3813    
3814      if (possessive_quantifier)      if (possessive_quantifier)
3815        {        {
3816        int len = code - tempcode;        int len;
3817        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3818        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3819        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3820        tempcode[0] = OP_ONCE;        len = code - tempcode;
3821        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3822        PUTINC(code, 0, len);          {
3823        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3824            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3825            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3826            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3827    
3828            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3829            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3830            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3831            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3832    
3833            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3834            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3835            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3836            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3837    
3838            default:
3839            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3840            code += 1 + LINK_SIZE;
3841            len += 1 + LINK_SIZE;
3842            tempcode[0] = OP_ONCE;
3843            *code++ = OP_KET;
3844            PUTINC(code, 0, len);
3845            PUT(tempcode, 1, len);
3846            break;
3847            }
3848        }        }
3849    
3850      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2761  for (;; ptr++) Line 3857  for (;; ptr++)
3857      break;      break;
3858    
3859    
3860      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3861      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3862      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3863      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3864      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3865      check for syntax errors here.  */      group. */
3866    
3867      case '(':      case '(':
3868      newoptions = options;      newoptions = options;
3869      skipbytes = 0;      skipbytes = 0;
3870        bravalue = OP_CBRA;
3871        save_hwm = cd->hwm;
3872        reset_bracount = FALSE;
3873    
3874      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3875        {        {
3876        int set, unset;        int i, set, unset, namelen;
3877        int *optset;        int *optset;
3878          const uschar *name;
3879          uschar *slot;
3880    
3881        switch (*(++ptr))        switch (*(++ptr))
3882          {          {
3883          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3884          ptr++;          ptr++;
3885          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3886            if (*ptr == 0)
3887              {
3888              *errorcodeptr = ERR18;
3889              goto FAILED;
3890              }
3891          continue;          continue;
3892    
3893          case ':':                 /* Non-extracting bracket */  
3894            /* ------------------------------------------------------------ */
3895            case '|':                 /* Reset capture count for each branch */
3896            reset_bracount = TRUE;
3897            /* Fall through */
3898    
3899            /* ------------------------------------------------------------ */
3900            case ':':                 /* Non-capturing bracket */
3901          bravalue = OP_BRA;          bravalue = OP_BRA;
3902          ptr++;          ptr++;
3903          break;          break;
3904    
3905    
3906            /* ------------------------------------------------------------ */
3907          case '(':          case '(':
3908          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3909    
3910          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3911            group), a name (referring to a named group), or 'R', referring to
3912            recursion. R<digits> and R&name are also permitted for recursion tests.
3913    
3914            There are several syntaxes for testing a named group: (?(name)) is used
3915            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3916    
3917            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3918            be the recursive thing or the name 'R' (and similarly for 'R' followed
3919            by digits), and (b) a number could be a name that consists of digits.
3920            In both cases, we look for a name first; if not found, we try the other
3921            cases. */
3922    
3923            /* For conditions that are assertions, check the syntax, and then exit
3924            the switch. This will take control down to where bracketed groups,
3925            including assertions, are processed. */
3926    
3927            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3928              break;
3929    
3930            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3931            below), and all need to skip 3 bytes at the start of the group. */
3932    
3933          if (ptr[1] == 'R')          code[1+LINK_SIZE] = OP_CREF;
3934            skipbytes = 3;
3935            refsign = -1;
3936    
3937            /* Check for a test for recursion in a named group. */
3938    
3939            if (ptr[1] == 'R' && ptr[2] == '&')
3940            {            {
3941            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
3942            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
3943            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
3944            }            }
3945    
3946          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
3947          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
3948    
3949          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
3950            {            {
3951            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
3952            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
3953            }            }
3954          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
         set bravalue above. */  
         break;  
   
         case '=':                 /* Positive lookahead */  
         bravalue = OP_ASSERT;  
         ptr++;  
         break;  
   
         case '!':                 /* Negative lookahead */  
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
   
         case '<':                 /* Lookbehinds */  
         switch (*(++ptr))  
3955            {            {
3956            case '=':               /* Positive lookbehind */            terminator = '\'';
           bravalue = OP_ASSERTBACK;  
3957            ptr++;            ptr++;
3958            break;            }
3959            else
3960              {
3961              terminator = 0;
3962              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3963              }
3964    
3965            case '!':               /* Negative lookbehind */          /* We now expect to read a name; anything else is an error */
3966            bravalue = OP_ASSERTBACK_NOT;  
3967            ptr++;          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3968            break;            {
3969              ptr += 1;  /* To get the right offset */
3970              *errorcodeptr = ERR28;
3971              goto FAILED;
3972            }            }
         break;  
3973    
3974          case '>':                 /* One-time brackets */          /* Read the name, but also get it as a number if it's all digits */
         bravalue = OP_ONCE;  
         ptr++;  
         break;  
3975    
3976          case 'C':                 /* Callout - may be followed by digits; */          recno = 0;
3977          previous_callout = code;  /* Save for later completion */          name = ++ptr;
3978          after_manual_callout = 1; /* Skip one item before completing */          while ((cd->ctypes[*ptr] & ctype_word) != 0)
3979          *code++ = OP_CALLOUT;     /* Already checked that the terminating */            {
3980            {                       /* closing parenthesis is present. */            if (recno >= 0)
3981            int n = 0;              recno = ((digitab[*ptr] & ctype_digit) != 0)?
3982            while ((digitab[*(++ptr)] & ctype_digit) != 0)                recno * 10 + *ptr - '0' : -1;
3983              n = n * 10 + *ptr - '0';            ptr++;
           if (n > 255)  
             {  
             *errorcodeptr = ERR38;  
             goto FAILED;  
             }  
           *code++ = n;  
           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */  
           PUT(code, LINK_SIZE, 0);                    /* Default length */  
           code += 2 * LINK_SIZE;  
3984            }            }
3985          previous = NULL;          namelen = ptr - name;
         continue;  
3986    
3987          case 'P':                 /* Named subpattern handling */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
         if (*(++ptr) == '<')      /* Definition */  
3988            {            {
3989            int i, namelen;            ptr--;      /* Error offset */
3990            uschar *slot = cd->name_table;            *errorcodeptr = ERR26;
3991            const uschar *name;     /* Don't amalgamate; some compilers */            goto FAILED;
3992            name = ++ptr;           /* grumble at autoincrement in declaration */            }
3993    
3994            while (*ptr++ != '>');          /* Do no further checking in the pre-compile phase. */
           namelen = ptr - name - 1;  
3995    
3996            for (i = 0; i < cd->names_found; i++)          if (lengthptr != NULL) break;
3997              {  
3998              int crc = memcmp(name, slot+2, namelen);          /* In the real compile we do the work of looking for the actual
3999              if (crc == 0)          reference. If the string started with "+" or "-" we require the rest to
4000                {          be digits, in which case recno will be set. */
4001                if (slot[2+namelen] == 0)  
4002            if (refsign > 0)
4003              {
4004              if (recno <= 0)
4005                {
4006                *errorcodeptr = ERR58;
4007                goto FAILED;
4008                }
4009              if (refsign == '-')
4010                {
4011                recno = cd->bracount - recno + 1;
4012                if (recno <= 0)
4013                  {
4014                  *errorcodeptr = ERR15;
4015                  goto FAILED;
4016                  }
4017                }
4018              else recno += cd->bracount;
4019              PUT2(code, 2+LINK_SIZE, recno);
4020              break;
4021              }
4022    
4023            /* Otherwise (did not start with "+" or "-"), start by looking for the
4024            name. */
4025    
4026            slot = cd->name_table;
4027            for (i = 0; i < cd->names_found; i++)
4028              {
4029              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4030              slot += cd->name_entry_size;
4031              }
4032    
4033            /* Found a previous named subpattern */
4034    
4035            if (i < cd->names_found)
4036              {
4037              recno = GET2(slot, 0);
4038              PUT2(code, 2+LINK_SIZE, recno);
4039              }
4040    
4041            /* Search the pattern for a forward reference */
4042    
4043            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4044                            (options & PCRE_EXTENDED) != 0)) > 0)
4045              {
4046              PUT2(code, 2+LINK_SIZE, i);
4047              }
4048    
4049            /* If terminator == 0 it means that the name followed directly after
4050            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4051            some further alternatives to try. For the cases where terminator != 0
4052            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4053            now checked all the possibilities, so give an error. */
4054    
4055            else if (terminator != 0)
4056              {
4057              *errorcodeptr = ERR15;
4058              goto FAILED;
4059              }
4060    
4061            /* Check for (?(R) for recursion. Allow digits after R to specify a
4062            specific group number. */
4063    
4064            else if (*name == 'R')
4065              {
4066              recno = 0;
4067              for (i = 1; i < namelen; i++)
4068                {
4069                if ((digitab[name[i]] & ctype_digit) == 0)
4070                  {
4071                  *errorcodeptr = ERR15;
4072                  goto FAILED;
4073                  }
4074                recno = recno * 10 + name[i] - '0';
4075                }
4076              if (recno == 0) recno = RREF_ANY;
4077              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4078              PUT2(code, 2+LINK_SIZE, recno);
4079              }
4080    
4081            /* Similarly, check for the (?(DEFINE) "condition", which is always
4082            false. */
4083    
4084            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4085              {
4086              code[1+LINK_SIZE] = OP_DEF;
4087              skipbytes = 1;
4088              }
4089    
4090            /* Check for the "name" actually being a subpattern number. */
4091    
4092            else if (recno > 0)
4093              {
4094              PUT2(code, 2+LINK_SIZE, recno);
4095              }
4096    
4097            /* Either an unidentified subpattern, or a reference to (?(0) */
4098    
4099            else
4100              {
4101              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4102              goto FAILED;
4103              }
4104            break;
4105    
4106    
4107            /* ------------------------------------------------------------ */
4108            case '=':                 /* Positive lookahead */
4109            bravalue = OP_ASSERT;
4110            ptr++;
4111            break;
4112    
4113    
4114            /* ------------------------------------------------------------ */
4115            case '!':                 /* Negative lookahead */
4116            bravalue = OP_ASSERT_NOT;
4117            ptr++;
4118            break;
4119    
4120    
4121            /* ------------------------------------------------------------ */
4122            case '<':                 /* Lookbehind or named define */
4123            switch (ptr[1])
4124              {
4125              case '=':               /* Positive lookbehind */
4126              bravalue = OP_ASSERTBACK;
4127              ptr += 2;
4128              break;
4129    
4130              case '!':               /* Negative lookbehind */
4131              bravalue = OP_ASSERTBACK_NOT;
4132              ptr += 2;
4133              break;
4134    
4135              default:                /* Could be name define, else bad */
4136              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4137              ptr++;                  /* Correct offset for error */
4138              *errorcodeptr = ERR24;
4139              goto FAILED;
4140              }
4141            break;
4142    
4143    
4144            /* ------------------------------------------------------------ */
4145            case '>':                 /* One-time brackets */
4146            bravalue = OP_ONCE;
4147            ptr++;
4148            break;
4149    
4150    
4151            /* ------------------------------------------------------------ */
4152            case 'C':                 /* Callout - may be followed by digits; */
4153            previous_callout = code;  /* Save for later completion */
4154            after_manual_callout = 1; /* Skip one item before completing */
4155            *code++ = OP_CALLOUT;
4156              {
4157              int n = 0;
4158              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4159                n = n * 10 + *ptr - '0';
4160              if (*ptr != ')')
4161                {
4162                *errorcodeptr = ERR39;
4163                goto FAILED;
4164                }
4165              if (n > 255)
4166                {
4167                *errorcodeptr = ERR38;
4168                goto FAILED;
4169                }
4170              *code++ = n;
4171              PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4172              PUT(code, LINK_SIZE, 0);                    /* Default length */
4173              code += 2 * LINK_SIZE;
4174              }
4175            previous = NULL;
4176            continue;
4177    
4178    
4179            /* ------------------------------------------------------------ */
4180            case 'P':                 /* Python-style named subpattern handling */
4181            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4182              {
4183              is_recurse = *ptr == '>';
4184              terminator = ')';
4185              goto NAMED_REF_OR_RECURSE;
4186              }
4187            else if (*ptr != '<')    /* Test for Python-style definition */
4188              {
4189              *errorcodeptr = ERR41;
4190              goto FAILED;
4191              }
4192            /* Fall through to handle (?P< as (?< is handled */
4193    
4194    
4195            /* ------------------------------------------------------------ */
4196            DEFINE_NAME:    /* Come here from (?< handling */
4197            case '\'':
4198              {
4199              terminator = (*ptr == '<')? '>' : '\'';
4200              name = ++ptr;
4201    
4202              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4203              namelen = ptr - name;
4204    
4205              /* In the pre-compile phase, just do a syntax check. */
4206    
4207              if (lengthptr != NULL)
4208                {
4209                if (*ptr != terminator)
4210                  {
4211                  *errorcodeptr = ERR42;
4212                  goto FAILED;
4213                  }
4214                if (cd->names_found >= MAX_NAME_COUNT)
4215                  {
4216                  *errorcodeptr = ERR49;
4217                  goto FAILED;
4218                  }
4219                if (namelen + 3 > cd->name_entry_size)
4220                  {
4221                  cd->name_entry_size = namelen + 3;
4222                  if (namelen > MAX_NAME_SIZE)
4223                  {                  {
4224                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
4225                  goto FAILED;                  goto FAILED;
4226                  }                  }
               crc = -1;             /* Current name is substring */  
4227                }                }
4228              if (crc < 0)              }
4229    
4230              /* In the real compile, create the entry in the table */
4231    
4232              else
4233                {
4234                slot = cd->name_table;
4235                for (i = 0; i < cd->names_found; i++)
4236                {                {
4237                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
4238                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
4239                break;                  {
4240                    if (slot[2+namelen] == 0)
4241                      {
4242                      if ((options & PCRE_DUPNAMES) == 0)
4243                        {
4244                        *errorcodeptr = ERR43;
4245                        goto FAILED;
4246                        }
4247                      }
4248                    else crc = -1;      /* Current name is substring */
4249                    }
4250                  if (crc < 0)
4251                    {
4252                    memmove(slot + cd->name_entry_size, slot,
4253                      (cd->names_found - i) * cd->name_entry_size);
4254                    break;
4255                    }
4256                  slot += cd->name_entry_size;
4257                }                }
             slot += cd->name_entry_size;  
             }  
4258    
4259            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
4260            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4261            slot[2+namelen] = 0;              slot[2+namelen] = 0;
4262            cd->names_found++;              }
           goto NUMBERED_GROUP;  
4263            }            }
4264    
4265          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4266    
4267            ptr++;                    /* Move past > or ' */
4268            cd->names_found++;
4269            goto NUMBERED_GROUP;
4270    
4271    
4272            /* ------------------------------------------------------------ */
4273            case '&':                 /* Perl recursion/subroutine syntax */
4274            terminator = ')';
4275            is_recurse = TRUE;
4276            /* Fall through */
4277    
4278            /* We come here from the Python syntax above that handles both
4279            references (?P=name) and recursion (?P>name), as well as falling
4280            through from the Perl recursion syntax (?&name). */
4281    
4282            NAMED_REF_OR_RECURSE:
4283            name = ++ptr;
4284            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4285            namelen = ptr - name;
4286    
4287            /* In the pre-compile phase, do a syntax check and set a dummy
4288            reference number. */
4289    
4290            if (lengthptr != NULL)
4291            {            {
4292            int i, namelen;            if (*ptr != terminator)
4293            int type = *ptr++;              {
4294            const uschar *name = ptr;              *errorcodeptr = ERR42;
4295            uschar *slot = cd->name_table;              goto FAILED;
4296                }
4297              if (namelen > MAX_NAME_SIZE)
4298                {
4299                *errorcodeptr = ERR48;
4300                goto FAILED;
4301                }
4302              recno = 0;
4303              }
4304    
4305            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4306    
4307            else
4308              {
4309              slot = cd->name_table;
4310            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4311              {              {
4312              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4313              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4314              }              }
4315            if (i >= cd->names_found)  
4316              if (i < cd->names_found)         /* Back reference */
4317                {
4318                recno = GET2(slot, 0);
4319                }
4320              else if ((recno =                /* Forward back reference */
4321                        find_parens(ptr, cd->bracount, name, namelen,
4322                          (options & PCRE_EXTENDED) != 0)) <= 0)
4323              {              {
4324              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4325              goto FAILED;              goto FAILED;
4326              }              }
4327              }
4328    
4329            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4330            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4331    
4332            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4333            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4334    
         /* Should never happen */  
         break;  
4335    
4336          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4337            case 'R':                 /* Recursion */
4338          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4339          /* Fall through */          /* Fall through */
4340    
         /* Recursion or "subroutine" call */  
4341    
4342          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4343          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4344            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4345            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4346            {            {
4347            const uschar *called;            const uschar *called;
4348    
4349              if ((refsign = *ptr) == '+') ptr++;
4350              else if (refsign == '-')
4351                {
4352                if ((digitab[ptr[1]] & ctype_digit) == 0)
4353                  goto OTHER_CHAR_AFTER_QUERY;
4354                ptr++;
4355                }
4356    
4357            recno = 0;            recno = 0;
4358            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4359              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4360    
4361              if (*ptr != ')')
4362                {
4363                *errorcodeptr = ERR29;
4364                goto FAILED;
4365                }
4366    
4367              if (refsign == '-')
4368                {
4369                if (recno == 0)
4370                  {
4371                  *errorcodeptr = ERR58;
4372                  goto FAILED;
4373                  }
4374                recno = cd->bracount - recno + 1;
4375                if (recno <= 0)
4376                  {
4377                  *errorcodeptr = ERR15;
4378                  goto FAILED;
4379                  }
4380                }
4381              else if (refsign == '+')
4382                {
4383                if (recno == 0)
4384                  {
4385                  *errorcodeptr = ERR58;
4386                  goto FAILED;
4387                  }
4388                recno += cd->bracount;
4389                }
4390    
4391            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4392    
4393            HANDLE_RECURSION:            HANDLE_RECURSION:
4394    
4395            previous = code;            previous = code;
4396              called = cd->start_code;
4397    
4398            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4399            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4400              this point. If we end up with a forward reference, first check that
4401            *code = OP_END;            the bracket does occur later so we can give the error (and position)
4402            called = (recno == 0)?            now. Then remember this forward reference in the workspace so it can
4403              cd->start_code : find_bracket(cd->start_code, utf8, recno);            be filled in at the end. */
4404    
4405            if (called == NULL)            if (lengthptr == NULL)
4406              {              {
4407              *errorcodeptr = ERR15;              *code = OP_END;
4408              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4409    
4410            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4411    
4412            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4413              {                {
4414              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4415              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4416                    {
4417                    *errorcodeptr = ERR15;
4418                    goto FAILED;
4419                    }
4420                  called = cd->start_code + recno;
4421                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4422                  }
4423    
4424                /* If not a forward reference, and the subpattern is still open,
4425                this is a recursive call. We check to see if this is a left
4426                recursion that could loop for ever, and diagnose that case. */
4427    
4428                else if (GET(called, 1) == 0 &&
4429                         could_be_empty(called, code, bcptr, utf8))
4430                  {
4431                  *errorcodeptr = ERR40;
4432                  goto FAILED;
4433                  }
4434              }              }
4435    
4436            /* Insert the recursion/subroutine item */            /* Insert the recursion/subroutine item, automatically wrapped inside
4437              "once" brackets. Set up a "previous group" length so that a
4438              subsequent quantifier will work. */
4439    
4440              *code = OP_ONCE;
4441              PUT(code, 1, 2 + 2*LINK_SIZE);
4442              code += 1 + LINK_SIZE;
4443    
4444            *code = OP_RECURSE;            *code = OP_RECURSE;
4445            PUT(code, 1, called - cd->start_code);            PUT(code, 1, called - cd->start_code);
4446            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4447    
4448              *code = OP_KET;
4449              PUT(code, 1, 2 + 2*LINK_SIZE);
4450              code += 1 + LINK_SIZE;
4451    
4452              length_prevgroup = 3 + 3*LINK_SIZE;
4453            }            }
4454    
4455            /* Can't determine a first byte now */
4456    
4457            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4458          continue;          continue;
4459<