/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

<
revision 77 by nigel, Sat Feb 24 21:40:45 2007 UTC | revision 200 by ph10, Wed Aug 1 09:10:40 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56    /* When DEBUG is defined, we need the pcre_printint() function, which is also
57    used by pcretest. DEBUG is not defined when building a production library. */
58    
59    #ifdef DEBUG
60    #include "pcre_printint.src"
61    #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    
69  /*************************************************  /*************************************************
70  *      Code parameters and static tables         *  *      Code parameters and static tables         *
71  *************************************************/  *************************************************/
72    
73  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
74  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
75  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
76  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
77  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
78    so this number is very generous.
79    
80    The same workspace is used during the second, actual compile phase for
81    remembering forward references to groups so that they can be filled in at the
82    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
83    is 4 there is plenty of room. */
84    
85  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
86    
87    
88  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 90  are simple data values; negative values
90  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
91  is invalid. */  is invalid. */
92    
93  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
94  static const short int escapes[] = {  static const short int escapes[] = {
95       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
96       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
97     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
98       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
99  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
100  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
101     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
102       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
103  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
104       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
105  };  };
106    
107  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
108  static const short int escapes[] = {  static const short int escapes[] = {
109  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
110  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 87  static const short int escapes[] = { Line 114  static const short int escapes[] = {
114  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
115  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
116  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
117  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
118  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
119  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
120  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
121  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
122  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
123  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
124  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
125  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
126  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
127  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
128  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
129  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
130  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
131  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 107  static const short int escapes[] = { Line 134  static const short int escapes[] = {
134    
135    
136  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
137  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
138  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
139    
140  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 118  static const char *const posix_names[] = Line 145  static const char *const posix_names[] =
145  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
146    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
147    
148  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
149  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
150  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
151    characters are removed, and for [:alpha:] and [:alnum:] the underscore
152    character is removed. The triples in the table consist of the base map offset,
153    second map offset or -1 if no second map, and a non-negative value for map
154    addition or a negative value for map subtraction (if there are two maps). The
155    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
156    remove vertical space characters, 2 => remove underscore. */
157    
158  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
159    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
160    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
161    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
162    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
163    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
164    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
165    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
166    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
167    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
168    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
169    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
170    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
171    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
172    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
173  };  };
174    
175    
176    #define STRING(a)  # a
177    #define XSTRING(s) STRING(s)
178    
179  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
180  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
181    they are documented. Always add a new error instead. Messages marked DEAD below
182    are no longer used. */
183    
184  static const char *error_texts[] = {  static const char *error_texts[] = {
185    "no error",    "no error",
# Line 156  static const char *error_texts[] = { Line 194  static const char *error_texts[] = {
194    "range out of order in character class",    "range out of order in character class",
195    "nothing to repeat",    "nothing to repeat",
196    /* 10 */    /* 10 */
197    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
198    "internal error: unexpected repeat",    "internal error: unexpected repeat",
199    "unrecognized character after (?",    "unrecognized character after (?",
200    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 166  static const char *error_texts[] = { Line 204  static const char *error_texts[] = {
204    "erroffset passed as NULL",    "erroffset passed as NULL",
205    "unknown option bit(s) set",    "unknown option bit(s) set",
206    "missing ) after comment",    "missing ) after comment",
207    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
208    /* 20 */    /* 20 */
209    "regular expression too large",    "regular expression too large",
210    "failed to get memory",    "failed to get memory",
# Line 175  static const char *error_texts[] = { Line 213  static const char *error_texts[] = {
213    "unrecognized character after (?<",    "unrecognized character after (?<",
214    /* 25 */    /* 25 */
215    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
216    "malformed number after (?(",    "malformed number or name after (?(",
217    "conditional group contains more than two branches",    "conditional group contains more than two branches",
218    "assertion expected after (?(",    "assertion expected after (?(",
219    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
220    /* 30 */    /* 30 */
221    "unknown POSIX class name",    "unknown POSIX class name",
222    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
223    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
224    "spare error",    "spare error",  /** DEAD **/
225    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
226    /* 35 */    /* 35 */
227    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 194  static const char *error_texts[] = { Line 232  static const char *error_texts[] = {
232    /* 40 */    /* 40 */
233    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
234    "unrecognized character after (?P",    "unrecognized character after (?P",
235    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
236    "two named groups have the same name",    "two named subpatterns have the same name",
237    "invalid UTF-8 string",    "invalid UTF-8 string",
238    /* 45 */    /* 45 */
239    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
240    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
241    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
242      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
243      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
244      /* 50 */
245      "repeated subpattern is too long",
246      "octal value is greater than \\377 (not in UTF-8 mode)",
247      "internal error: overran compiling workspace",
248      "internal error: previously-checked referenced subpattern not found",
249      "DEFINE group contains more than one branch",
250      /* 55 */
251      "repeating a DEFINE group is not allowed",
252      "inconsistent NEWLINE options",
253      "\\g is not followed by a braced name or an optionally braced non-zero number",
254      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
255  };  };
256    
257    
# Line 220  For convenience, we use the same bit def Line 271  For convenience, we use the same bit def
271    
272  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
273    
274  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
275  static const unsigned char digitab[] =  static const unsigned char digitab[] =
276    {    {
277    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 307  static const unsigned char digitab[] =
307    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
308    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
309    
310  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
311  static const unsigned char digitab[] =  static const unsigned char digitab[] =
312    {    {
313    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 321  static const unsigned char digitab[] =
321    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
322    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
323    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
324    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
325    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
326    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
327    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 355  static const unsigned char ebcdic_charta
355    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
356    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
357    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
358    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
359    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
360    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
361    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 382  static const unsigned char ebcdic_charta
382  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
383    
384  static BOOL  static BOOL
385    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
386      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
387    
388    
389    
# Line 342  static BOOL Line 393  static BOOL
393    
394  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
395  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
396  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
397  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
398  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
399    ptr is pointing at the \. On exit, it is on the final character of the escape
400    sequence.
401    
402  Arguments:  Arguments:
403    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 362  static int Line 415  static int
415  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
416    int options, BOOL isclass)    int options, BOOL isclass)
417  {  {
418  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
419    const uschar *ptr = *ptrptr + 1;
420  int c, i;  int c, i;
421    
422    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
423    ptr--;                            /* Set pointer back to the last byte */
424    
425  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
426    
 c = *(++ptr);  
427  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
428    
429  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
430  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
431  Otherwise further processing may be required. */  Otherwise further processing may be required. */
432    
433  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
434  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
435  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
436    
437  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
438  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
439  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
440  #endif  #endif
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 444  else if ((i = escapes[c - 0x48]) != 0)
444  else  else
445    {    {
446    const uschar *oldptr;    const uschar *oldptr;
447      BOOL braced, negated;
448    
449    switch (c)    switch (c)
450      {      {
451      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 459  else
459      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
460      break;      break;
461    
462        /* \g must be followed by a number, either plain or braced. If positive, it
463        is an absolute backreference. If negative, it is a relative backreference.
464        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
465        reference to a named group. This is part of Perl's movement towards a
466        unified syntax for back references. As this is synonymous with \k{name}, we
467        fudge it up by pretending it really was \k. */
468    
469        case 'g':
470        if (ptr[1] == '{')
471          {
472          const uschar *p;
473          for (p = ptr+2; *p != 0 && *p != '}'; p++)
474            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
475          if (*p != 0 && *p != '}')
476            {
477            c = -ESC_k;
478            break;
479            }
480          braced = TRUE;
481          ptr++;
482          }
483        else braced = FALSE;
484    
485        if (ptr[1] == '-')
486          {
487          negated = TRUE;
488          ptr++;
489          }
490        else negated = FALSE;
491    
492        c = 0;
493        while ((digitab[ptr[1]] & ctype_digit) != 0)
494          c = c * 10 + *(++ptr) - '0';
495    
496        if (c == 0 || (braced && *(++ptr) != '}'))
497          {
498          *errorcodeptr = ERR57;
499          return 0;
500          }
501    
502        if (negated)
503          {
504          if (c > bracount)
505            {
506            *errorcodeptr = ERR15;
507            return 0;
508            }
509          c = bracount - (c - 1);
510          }
511    
512        c = -(ESC_REF + c);
513        break;
514    
515      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
516      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
517      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 442  else Line 553  else
553        }        }
554    
555      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
556      larger first octal digit. */      larger first octal digit. The original code used just to take the least
557        significant 8 bits of octal numbers (I think this is what early Perls used
558        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
559        than 3 octal digits. */
560    
561      case '0':      case '0':
562      c -= '0';      c -= '0';
563      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
564          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
565      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
566      break;      break;
567    
568      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
569      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
570        treated as a data character. */
571    
572      case 'x':      case 'x':
573  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
574        {        {
575        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
576        register int count = 0;        int count = 0;
577    
578        c = 0;        c = 0;
579        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
580          {          {
581          int cc = *pt++;          register int cc = *pt++;
582            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
583          count++;          count++;
584  #if !EBCDIC    /* ASCII coding */  
585    #ifndef EBCDIC  /* ASCII coding */
586          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
587          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
588  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
589          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
590          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
591  #endif  #endif
592          }          }
593    
594        if (*pt == '}')        if (*pt == '}')
595          {          {
596          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
597          ptr = pt;          ptr = pt;
598          break;          break;
599          }          }
600    
601        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
602        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
603        }        }
 #endif  
604    
605      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
606    
607      c = 0;      c = 0;
608      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
609        {        {
610        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
611        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
612  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
613        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
614        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
615  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
616        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
617        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
618  #endif  #endif
619        }        }
620      break;      break;
621    
622      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
623        This coding is ASCII-specific, but then the whole concept of \cx is
624        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
625    
626      case 'c':      case 'c':
627      c = *(++ptr);      c = *(++ptr);
# Line 511  else Line 631  else
631        return 0;        return 0;
632        }        }
633    
634      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
635      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
636      c ^= 0x40;      c ^= 0x40;
637  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
638      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
639      c ^= 0xC0;      c ^= 0xC0;
640  #endif  #endif
# Line 560  escape sequence. Line 676  escape sequence.
676  Argument:  Argument:
677    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
678    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
679      dptr           points to an int that is set to the detailed property value
680    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
681    
682  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
683  */  */
684    
685  static int  static int
686  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
687  {  {
688  int c, i, bot, top;  int c, i, bot, top;
689  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
690  char name[4];  char name[32];
691    
692  c = *(++ptr);  c = *(++ptr);
693  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
694    
695  *negptr = FALSE;  *negptr = FALSE;
696    
697  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
698  preceded by ^ for negation. */  negation. */
699    
700  if (c == '{')  if (c == '{')
701    {    {
# Line 587  if (c == '{') Line 704  if (c == '{')
704      *negptr = TRUE;      *negptr = TRUE;
705      ptr++;      ptr++;
706      }      }
707    for (i = 0; i <= 2; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
708      {      {
709      c = *(++ptr);      c = *(++ptr);
710      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
711      if (c == '}') break;      if (c == '}') break;
712      name[i] = c;      name[i] = c;
713      }      }
714    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
715    name[i] = 0;    name[i] = 0;
716    }    }
717    
# Line 619  top = _pcre_utt_size; Line 732  top = _pcre_utt_size;
732    
733  while (bot < top)  while (bot < top)
734    {    {
735    i = (bot + top)/2;    i = (bot + top) >> 1;
736    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
737    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
738        {
739        *dptr = _pcre_utt[i].value;
740        return _pcre_utt[i].type;
741        }
742    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
743    }    }
744    
 UNKNOWN_RETURN:  
745  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
746  *ptrptr = ptr;  *ptrptr = ptr;
747  return -1;  return -1;
# Line 698  read_repeat_counts(const uschar *p, int Line 814  read_repeat_counts(const uschar *p, int
814  int min = 0;  int min = 0;
815  int max = -1;  int max = -1;
816    
817    /* Read the minimum value and do a paranoid check: a negative value indicates
818    an integer overflow. */
819    
820  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
821    if (min < 0 || min > 65535)
822      {
823      *errorcodeptr = ERR5;
824      return p;
825      }
826    
827    /* Read the maximum value if there is one, and again do a paranoid check on
828    its size. Also, max must not be less than min. */
829    
830  if (*p == '}') max = min; else  if (*p == '}') max = min; else
831    {    {
# Line 706  if (*p == '}') max = min; else Line 833  if (*p == '}') max = min; else
833      {      {
834      max = 0;      max = 0;
835      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
836        if (max < 0 || max > 65535)
837          {
838          *errorcodeptr = ERR5;
839          return p;
840          }
841      if (max < min)      if (max < min)
842        {        {
843        *errorcodeptr = ERR4;        *errorcodeptr = ERR4;
# Line 714  if (*p == '}') max = min; else Line 846  if (*p == '}') max = min; else
846      }      }
847    }    }
848    
849  /* Do paranoid checks, then fill in the required variables, and pass back the  /* Fill in the required variables, and pass back the pointer to the terminating
850  pointer to the terminating '}'. */  '}'. */
851    
852  if (min > 65535 || max > 65535)  *minp = min;
853    *errorcodeptr = ERR5;  *maxp = max;
854  else  return p;
855    }
856    
857    
858    
859    /*************************************************
860    *       Find forward referenced subpattern       *
861    *************************************************/
862    
863    /* This function scans along a pattern's text looking for capturing
864    subpatterns, and counting them. If it finds a named pattern that matches the
865    name it is given, it returns its number. Alternatively, if the name is NULL, it
866    returns when it reaches a given numbered subpattern. This is used for forward
867    references to subpatterns. We know that if (?P< is encountered, the name will
868    be terminated by '>' because that is checked in the first pass.
869    
870    Arguments:
871      ptr          current position in the pattern
872      count        current count of capturing parens so far encountered
873      name         name to seek, or NULL if seeking a numbered subpattern
874      lorn         name length, or subpattern number if name is NULL
875      xmode        TRUE if we are in /x mode
876    
877    Returns:       the number of the subpattern sought, or -1 if not found
878    */
879    
880    static int
881    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
882      BOOL xmode)
883    {
884    const uschar *thisname;
885    
886    for (; *ptr != 0; ptr++)
887    {    {
888    *minp = min;    int term;
889    *maxp = max;  
890      /* Skip over backslashed characters and also entire \Q...\E */
891    
892      if (*ptr == '\\')
893        {
894        if (*(++ptr) == 0) return -1;
895        if (*ptr == 'Q') for (;;)
896          {
897          while (*(++ptr) != 0 && *ptr != '\\');
898          if (*ptr == 0) return -1;
899          if (*(++ptr) == 'E') break;
900          }
901        continue;
902        }
903    
904      /* Skip over character classes */
905    
906      if (*ptr == '[')
907        {
908        while (*(++ptr) != ']')
909          {
910          if (*ptr == '\\')
911            {
912            if (*(++ptr) == 0) return -1;
913            if (*ptr == 'Q') for (;;)
914              {
915              while (*(++ptr) != 0 && *ptr != '\\');
916              if (*ptr == 0) return -1;
917              if (*(++ptr) == 'E') break;
918              }
919            continue;
920            }
921          }
922        continue;
923        }
924    
925      /* Skip comments in /x mode */
926    
927      if (xmode && *ptr == '#')
928        {
929        while (*(++ptr) != 0 && *ptr != '\n');
930        if (*ptr == 0) return -1;
931        continue;
932        }
933    
934      /* An opening parens must now be a real metacharacter */
935    
936      if (*ptr != '(') continue;
937      if (ptr[1] != '?')
938        {
939        count++;
940        if (name == NULL && count == lorn) return count;
941        continue;
942        }
943    
944      ptr += 2;
945      if (*ptr == 'P') ptr++;                      /* Allow optional P */
946    
947      /* We have to disambiguate (?<! and (?<= from (?<name> */
948    
949      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
950           *ptr != '\'')
951        continue;
952    
953      count++;
954    
955      if (name == NULL && count == lorn) return count;
956      term = *ptr++;
957      if (term == '<') term = '>';
958      thisname = ptr;
959      while (*ptr != term) ptr++;
960      if (name != NULL && lorn == ptr - thisname &&
961          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
962        return count;
963    }    }
964  return p;  
965    return -1;
966  }  }
967    
968    
# Line 778  for (;;) Line 1016  for (;;)
1016    
1017      case OP_CALLOUT:      case OP_CALLOUT:
1018      case OP_CREF:      case OP_CREF:
1019      case OP_BRANUMBER:      case OP_RREF:
1020        case OP_DEF:
1021      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1022      break;      break;
1023    
# Line 823  for (;;) Line 1062  for (;;)
1062    {    {
1063    int d;    int d;
1064    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1065    
1066    switch (op)    switch (op)
1067      {      {
1068        case OP_CBRA:
1069      case OP_BRA:      case OP_BRA:
1070      case OP_ONCE:      case OP_ONCE:
1071      case OP_COND:      case OP_COND:
1072      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1073      if (d < 0) return d;      if (d < 0) return d;
1074      branchlength += d;      branchlength += d;
1075      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 865  for (;;) Line 1104  for (;;)
1104      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1105    
1106      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1107      case OP_CREF:      case OP_CREF:
1108        case OP_RREF:
1109        case OP_DEF:
1110      case OP_OPT:      case OP_OPT:
1111      case OP_CALLOUT:      case OP_CALLOUT:
1112      case OP_SOD:      case OP_SOD:
# Line 884  for (;;) Line 1124  for (;;)
1124    
1125      case OP_CHAR:      case OP_CHAR:
1126      case OP_CHARNC:      case OP_CHARNC:
1127        case OP_NOT:
1128      branchlength++;      branchlength++;
1129      cc += 2;      cc += 2;
1130  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 917  for (;;) Line 1158  for (;;)
1158    
1159      case OP_PROP:      case OP_PROP:
1160      case OP_NOTPROP:      case OP_NOTPROP:
1161      cc++;      cc += 2;
1162      /* Fall through */      /* Fall through */
1163    
1164      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 998  Returns:      pointer to the opcode for Line 1239  Returns:      pointer to the opcode for
1239  static const uschar *  static const uschar *
1240  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1241  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1242  for (;;)  for (;;)
1243    {    {
1244    register int c = *code;    register int c = *code;
1245    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1246    else if (c > OP_BRA)  
1247      /* XCLASS is used for classes that cannot be represented just by a bit
1248      map. This includes negated single high-valued characters. The length in
1249      the table is zero; the actual length is stored in the compiled code. */
1250    
1251      if (c == OP_XCLASS) code += GET(code, 1);
1252    
1253      /* Handle capturing bracket */
1254    
1255      else if (c == OP_CBRA)
1256      {      {
1257      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1258      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1259      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1260      }      }
1261    
1262      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1263      a multi-byte character. The length in the table is a minimum, so we have to
1264      arrange to skip the extra bytes. */
1265    
1266    else    else
1267      {      {
1268      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1269  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1270      if (utf8) switch(c)      if (utf8) switch(c)
1271        {        {
1272        case OP_CHAR:        case OP_CHAR:
# Line 1031  for (;;) Line 1274  for (;;)
1274        case OP_EXACT:        case OP_EXACT:
1275        case OP_UPTO:        case OP_UPTO:
1276        case OP_MINUPTO:        case OP_MINUPTO:
1277          case OP_POSUPTO:
1278        case OP_STAR:        case OP_STAR:
1279        case OP_MINSTAR:        case OP_MINSTAR:
1280          case OP_POSSTAR:
1281        case OP_PLUS:        case OP_PLUS:
1282        case OP_MINPLUS:        case OP_MINPLUS:
1283          case OP_POSPLUS:
1284        case OP_QUERY:        case OP_QUERY:
1285        case OP_MINQUERY:        case OP_MINQUERY:
1286        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1287        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1288        break;        break;
1289        }        }
1290  #endif  #endif
# Line 1072  Returns:      pointer to the opcode for Line 1311  Returns:      pointer to the opcode for
1311  static const uschar *  static const uschar *
1312  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1313  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1314  for (;;)  for (;;)
1315    {    {
1316    register int c = *code;    register int c = *code;
1317    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1318    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1319    else if (c > OP_BRA)  
1320      {    /* XCLASS is used for classes that cannot be represented just by a bit
1321      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1322      }    the table is zero; the actual length is stored in the compiled code. */
1323    
1324      if (c == OP_XCLASS) code += GET(code, 1);
1325    
1326      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1327      that are followed by a character may be followed by a multi-byte character.
1328      The length in the table is a minimum, so we have to arrange to skip the extra
1329      bytes. */
1330    
1331    else    else
1332      {      {
1333      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1334  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1335      if (utf8) switch(c)      if (utf8) switch(c)
1336        {        {
1337        case OP_CHAR:        case OP_CHAR:
# Line 1103  for (;;) Line 1339  for (;;)
1339        case OP_EXACT:        case OP_EXACT:
1340        case OP_UPTO:        case OP_UPTO:
1341        case OP_MINUPTO:        case OP_MINUPTO:
1342          case OP_POSUPTO:
1343        case OP_STAR:        case OP_STAR:
1344        case OP_MINSTAR:        case OP_MINSTAR:
1345          case OP_POSSTAR:
1346        case OP_PLUS:        case OP_PLUS:
1347        case OP_MINPLUS:        case OP_MINPLUS:
1348          case OP_POSPLUS:
1349        case OP_QUERY:        case OP_QUERY:
1350        case OP_MINQUERY:        case OP_MINQUERY:
1351        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1352        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1353        break;        break;
1354        }        }
1355  #endif  #endif
# Line 1132  for (;;) Line 1364  for (;;)
1364  *************************************************/  *************************************************/
1365    
1366  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1367  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1368  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1369  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1370  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1371    struck an inner bracket whose current branch will already have been scanned.
1372    
1373  Arguments:  Arguments:
1374    code        points to start of search    code        points to start of search
# Line 1149  static BOOL Line 1382  static BOOL
1382  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1383  {  {
1384  register int c;  register int c;
1385  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1386       code < endcode;       code < endcode;
1387       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1388    {    {
# Line 1157  for (code = first_significant_code(code Line 1390  for (code = first_significant_code(code
1390    
1391    c = *code;    c = *code;
1392    
1393    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1394    
1395      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1396        {
1397        code += _pcre_OP_lengths[c];
1398        do code += GET(code, 1); while (*code == OP_ALT);
1399        c = *code;
1400        continue;
1401        }
1402    
1403      /* For other groups, scan the branches. */
1404    
1405      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1406      {      {
1407      BOOL empty_branch;      BOOL empty_branch;
1408      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1173  for (code = first_significant_code(code Line 1418  for (code = first_significant_code(code
1418        }        }
1419      while (*code == OP_ALT);      while (*code == OP_ALT);
1420      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1421      c = *code;      c = *code;
1422        continue;
1423      }      }
1424    
1425    else switch (c)    /* Handle the other opcodes */
1426    
1427      switch (c)
1428      {      {
1429      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1430    
# Line 1233  for (code = first_significant_code(code Line 1480  for (code = first_significant_code(code
1480      case OP_NOT:      case OP_NOT:
1481      case OP_PLUS:      case OP_PLUS:
1482      case OP_MINPLUS:      case OP_MINPLUS:
1483        case OP_POSPLUS:
1484      case OP_EXACT:      case OP_EXACT:
1485      case OP_NOTPLUS:      case OP_NOTPLUS:
1486      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1487        case OP_NOTPOSPLUS:
1488      case OP_NOTEXACT:      case OP_NOTEXACT:
1489      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1490      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1491        case OP_TYPEPOSPLUS:
1492      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1493      return FALSE;      return FALSE;
1494    
# Line 1250  for (code = first_significant_code(code Line 1500  for (code = first_significant_code(code
1500      case OP_ALT:      case OP_ALT:
1501      return TRUE;      return TRUE;
1502    
1503      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1504      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1505    
1506  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1507      case OP_STAR:      case OP_STAR:
1508      case OP_MINSTAR:      case OP_MINSTAR:
1509        case OP_POSSTAR:
1510      case OP_QUERY:      case OP_QUERY:
1511      case OP_MINQUERY:      case OP_MINQUERY:
1512        case OP_POSQUERY:
1513      case OP_UPTO:      case OP_UPTO:
1514      case OP_MINUPTO:      case OP_MINUPTO:
1515        case OP_POSUPTO:
1516      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1517      break;      break;
1518  #endif  #endif
# Line 1377  earlier groups that are outside the curr Line 1630  earlier groups that are outside the curr
1630  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1631  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1632  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1633  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1634  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1635    
1636    This function has been extended with the possibility of forward references for
1637    recursions and subroutine calls. It must also check the list of such references
1638    for the group we are dealing with. If it finds that one of the recursions in
1639    the current group is on this list, it adjusts the offset in the list, not the
1640    value in the reference (which is a group number).
1641    
1642  Arguments:  Arguments:
1643    group      points to the start of the group    group      points to the start of the group
1644    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1645    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1646    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1647      save_hwm   the hwm forward reference pointer at the start of the group
1648    
1649  Returns:     nothing  Returns:     nothing
1650  */  */
1651    
1652  static void  static void
1653  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1654      uschar *save_hwm)
1655  {  {
1656  uschar *ptr = group;  uschar *ptr = group;
1657  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1658    {    {
1659    int offset = GET(ptr, 1);    int offset;
1660    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1661    
1662      /* See if this recursion is on the forward reference list. If so, adjust the
1663      reference. */
1664    
1665      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1666        {
1667        offset = GET(hc, 0);
1668        if (cd->start_code + offset == ptr + 1)
1669          {
1670          PUT(hc, 0, offset + adjust);
1671          break;
1672          }
1673        }
1674    
1675      /* Otherwise, adjust the recursion offset if it's after the start of this
1676      group. */
1677    
1678      if (hc >= cd->hwm)
1679        {
1680        offset = GET(ptr, 1);
1681        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1682        }
1683    
1684    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1685    }    }
1686  }  }
# Line 1475  Yield:        TRUE when range returned; Line 1759  Yield:        TRUE when range returned;
1759  */  */
1760    
1761  static BOOL  static BOOL
1762  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1763      unsigned int *odptr)
1764  {  {
1765  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1766    
1767  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1768    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1769    
1770  if (c > d) return FALSE;  if (c > d) return FALSE;
1771    
# Line 1492  next = othercase + 1; Line 1774  next = othercase + 1;
1774    
1775  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1776    {    {
1777    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1778    next++;    next++;
1779    }    }
1780    
# Line 1506  return TRUE; Line 1786  return TRUE;
1786  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1787    
1788    
1789    
1790  /*************************************************  /*************************************************
1791  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1792  *************************************************/  *************************************************/
1793    
1794  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1795  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1796  bits.  sense to automatically possessify the repeated item.
1797    
1798  Arguments:  Arguments:
1799    optionsptr     pointer to the option bits    op_code       the repeated op code
1800    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1801    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1802    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1803    errorcodeptr   points to error code variable    ptr           next character in pattern
1804    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1805    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1806    
1807  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1808  */  */
1809    
1810  static BOOL  static BOOL
1811  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1812    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1813  {  {
1814  int repeat_type, op_type;  int next;
 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
 int bravalue = 0;  
 int greedy_default, greedy_non_default;  
 int firstbyte, reqbyte;  
 int zeroreqbyte, zerofirstbyte;  
 int req_caseopt, reqvary, tempreqvary;  
 int condcount = 0;  
 int options = *optionsptr;  
 int after_manual_callout = 0;  
 register int c;  
 register uschar *code = *codeptr;  
 uschar *tempcode;  
 BOOL inescq = FALSE;  
 BOOL groupsetfirstbyte = FALSE;  
 const uschar *ptr = *ptrptr;  
 const uschar *tempptr;  
 uschar *previous = NULL;  
 uschar *previous_callout = NULL;  
 uschar classbits[32];  
1815    
1816    /* Skip whitespace and comments in extended mode */
1817    
1818    if ((options & PCRE_EXTENDED) != 0)
1819      {
1820      for (;;)
1821        {
1822        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1823        if (*ptr == '#')
1824          {
1825          while (*(++ptr) != 0)
1826            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1827          }
1828        else break;
1829        }
1830      }
1831    
1832    /* If the next item is one that we can handle, get its value. A non-negative
1833    value is a character, a negative value is an escape value. */
1834    
1835    if (*ptr == '\\')
1836      {
1837      int temperrorcode = 0;
1838      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1839      if (temperrorcode != 0) return FALSE;
1840      ptr++;    /* Point after the escape sequence */
1841      }
1842    
1843    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1844      {
1845  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1846  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1847  #endif  #endif
1848      next = *ptr++;
1849      }
1850    
1851  /* Set up the default and non-default settings for greediness */  else return FALSE;
1852    
1853  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
1854    
1855  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1856  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1857  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1858  find one.      {
1859        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1860        if (*ptr == '#')
1861          {
1862          while (*(++ptr) != 0)
1863            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1864          }
1865        else break;
1866        }
1867      }
1868    
1869  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1870    
1871  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1872      return FALSE;
1873    
1874  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
1875  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
1876  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
1877  case status of the value. This is used only for ASCII characters. */  utf8_char. */
1878    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
1879    
1880  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
1881    
1882  for (;; ptr++)  if (next >= 0) switch(op_code)
1883    {    {
1884    BOOL negate_class;    case OP_CHAR:
1885    BOOL possessive_quantifier;  #ifdef SUPPORT_UTF8
1886    BOOL is_quantifier;    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1887    #endif
1888      return item != next;
1889    
1890      /* For CHARNC (caseless character) we must check the other case. If we have
1891      Unicode property support, we can use it to test the other case of
1892      high-valued characters. */
1893    
1894      case OP_CHARNC:
1895    #ifdef SUPPORT_UTF8
1896      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1897    #endif
1898      if (item == next) return FALSE;
1899    #ifdef SUPPORT_UTF8
1900      if (utf8)
1901        {
1902        unsigned int othercase;
1903        if (next < 128) othercase = cd->fcc[next]; else
1904    #ifdef SUPPORT_UCP
1905        othercase = _pcre_ucp_othercase((unsigned int)next);
1906    #else
1907        othercase = NOTACHAR;
1908    #endif
1909        return (unsigned int)item != othercase;
1910        }
1911      else
1912    #endif  /* SUPPORT_UTF8 */
1913      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1914    
1915      /* For OP_NOT, "item" must be a single-byte character. */
1916    
1917      case OP_NOT:
1918      if (next < 0) return FALSE;  /* Not a character */
1919      if (item == next) return TRUE;
1920      if ((options & PCRE_CASELESS) == 0) return FALSE;
1921    #ifdef SUPPORT_UTF8
1922      if (utf8)
1923        {
1924        unsigned int othercase;
1925        if (next < 128) othercase = cd->fcc[next]; else
1926    #ifdef SUPPORT_UCP
1927        othercase = _pcre_ucp_othercase(next);
1928    #else
1929        othercase = NOTACHAR;
1930    #endif
1931        return (unsigned int)item == othercase;
1932        }
1933      else
1934    #endif  /* SUPPORT_UTF8 */
1935      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1936    
1937      case OP_DIGIT:
1938      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1939    
1940      case OP_NOT_DIGIT:
1941      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1942    
1943      case OP_WHITESPACE:
1944      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1945    
1946      case OP_NOT_WHITESPACE:
1947      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1948    
1949      case OP_WORDCHAR:
1950      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1951    
1952      case OP_NOT_WORDCHAR:
1953      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1954    
1955      case OP_HSPACE:
1956      case OP_NOT_HSPACE:
1957      switch(next)
1958        {
1959        case 0x09:
1960        case 0x20:
1961        case 0xa0:
1962        case 0x1680:
1963        case 0x180e:
1964        case 0x2000:
1965        case 0x2001:
1966        case 0x2002:
1967        case 0x2003:
1968        case 0x2004:
1969        case 0x2005:
1970        case 0x2006:
1971        case 0x2007:
1972        case 0x2008:
1973        case 0x2009:
1974        case 0x200A:
1975        case 0x202f:
1976        case 0x205f:
1977        case 0x3000:
1978        return op_code != OP_HSPACE;
1979        default:
1980        return op_code == OP_HSPACE;
1981        }
1982    
1983      case OP_VSPACE:
1984      case OP_NOT_VSPACE:
1985      switch(next)
1986        {
1987        case 0x0a:
1988        case 0x0b:
1989        case 0x0c:
1990        case 0x0d:
1991        case 0x85:
1992        case 0x2028:
1993        case 0x2029:
1994        return op_code != OP_VSPACE;
1995        default:
1996        return op_code == OP_VSPACE;
1997        }
1998    
1999      default:
2000      return FALSE;
2001      }
2002    
2003    
2004    /* Handle the case when the next item is \d, \s, etc. */
2005    
2006    switch(op_code)
2007      {
2008      case OP_CHAR:
2009      case OP_CHARNC:
2010    #ifdef SUPPORT_UTF8
2011      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2012    #endif
2013      switch(-next)
2014        {
2015        case ESC_d:
2016        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2017    
2018        case ESC_D:
2019        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2020    
2021        case ESC_s:
2022        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2023    
2024        case ESC_S:
2025        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2026    
2027        case ESC_w:
2028        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2029    
2030        case ESC_W:
2031        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2032    
2033        case ESC_h:
2034        case ESC_H:
2035        switch(item)
2036          {
2037          case 0x09:
2038          case 0x20:
2039          case 0xa0:
2040          case 0x1680:
2041          case 0x180e:
2042          case 0x2000:
2043          case 0x2001:
2044          case 0x2002:
2045          case 0x2003:
2046          case 0x2004:
2047          case 0x2005:
2048          case 0x2006:
2049          case 0x2007:
2050          case 0x2008:
2051          case 0x2009:
2052          case 0x200A:
2053          case 0x202f:
2054          case 0x205f:
2055          case 0x3000:
2056          return -next != ESC_h;
2057          default:
2058          return -next == ESC_h;
2059          }
2060    
2061        case ESC_v:
2062        case ESC_V:
2063        switch(item)
2064          {
2065          case 0x0a:
2066          case 0x0b:
2067          case 0x0c:
2068          case 0x0d:
2069          case 0x85:
2070          case 0x2028:
2071          case 0x2029:
2072          return -next != ESC_v;
2073          default:
2074          return -next == ESC_v;
2075          }
2076    
2077        default:
2078        return FALSE;
2079        }
2080    
2081      case OP_DIGIT:
2082      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2083             next == -ESC_h || next == -ESC_v;
2084    
2085      case OP_NOT_DIGIT:
2086      return next == -ESC_d;
2087    
2088      case OP_WHITESPACE:
2089      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2090    
2091      case OP_NOT_WHITESPACE:
2092      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2093    
2094      case OP_HSPACE:
2095      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2096    
2097      case OP_NOT_HSPACE:
2098      return next == -ESC_h;
2099    
2100      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2101      case OP_VSPACE:
2102      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2103    
2104      case OP_NOT_VSPACE:
2105      return next == -ESC_v;
2106    
2107      case OP_WORDCHAR:
2108      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2109    
2110      case OP_NOT_WORDCHAR:
2111      return next == -ESC_w || next == -ESC_d;
2112    
2113      default:
2114      return FALSE;
2115      }
2116    
2117    /* Control does not reach here */
2118    }
2119    
2120    
2121    
2122    /*************************************************
2123    *           Compile one branch                   *
2124    *************************************************/
2125    
2126    /* Scan the pattern, compiling it into a vector. If the options are
2127    changed during the branch, the pointer is used to change the external options
2128    bits. This function is used during the pre-compile phase when we are trying
2129    to find out the amount of memory needed, as well as during the real compile
2130    phase. The value of lengthptr distinguishes the two phases.
2131    
2132    Arguments:
2133      optionsptr     pointer to the option bits
2134      codeptr        points to the pointer to the current code point
2135      ptrptr         points to the current pattern pointer
2136      errorcodeptr   points to error code variable
2137      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2138      reqbyteptr     set to the last literal character required, else < 0
2139      bcptr          points to current branch chain
2140      cd             contains pointers to tables etc.
2141      lengthptr      NULL during the real compile phase
2142                     points to length accumulator during pre-compile phase
2143    
2144    Returns:         TRUE on success
2145                     FALSE, with *errorcodeptr set non-zero on error
2146    */
2147    
2148    static BOOL
2149    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2150      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2151      compile_data *cd, int *lengthptr)
2152    {
2153    int repeat_type, op_type;
2154    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2155    int bravalue = 0;
2156    int greedy_default, greedy_non_default;
2157    int firstbyte, reqbyte;
2158    int zeroreqbyte, zerofirstbyte;
2159    int req_caseopt, reqvary, tempreqvary;
2160    int options = *optionsptr;
2161    int after_manual_callout = 0;
2162    int length_prevgroup = 0;
2163    register int c;
2164    register uschar *code = *codeptr;
2165    uschar *last_code = code;
2166    uschar *orig_code = code;
2167    uschar *tempcode;
2168    BOOL inescq = FALSE;
2169    BOOL groupsetfirstbyte = FALSE;
2170    const uschar *ptr = *ptrptr;
2171    const uschar *tempptr;
2172    uschar *previous = NULL;
2173    uschar *previous_callout = NULL;
2174    uschar *save_hwm = NULL;
2175    uschar classbits[32];
2176    
2177    #ifdef SUPPORT_UTF8
2178    BOOL class_utf8;
2179    BOOL utf8 = (options & PCRE_UTF8) != 0;
2180    uschar *class_utf8data;
2181    uschar utf8_char[6];
2182    #else
2183    BOOL utf8 = FALSE;
2184    uschar *utf8_char = NULL;
2185    #endif
2186    
2187    #ifdef DEBUG
2188    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2189    #endif
2190    
2191    /* Set up the default and non-default settings for greediness */
2192    
2193    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2194    greedy_non_default = greedy_default ^ 1;
2195    
2196    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2197    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2198    matches a non-fixed char first char; reqbyte just remains unset if we never
2199    find one.
2200    
2201    When we hit a repeat whose minimum is zero, we may have to adjust these values
2202    to take the zero repeat into account. This is implemented by setting them to
2203    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2204    item types that can be repeated set these backoff variables appropriately. */
2205    
2206    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2207    
2208    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2209    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2210    value > 255. It is added into the firstbyte or reqbyte variables to record the
2211    case status of the value. This is used only for ASCII characters. */
2212    
2213    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2214    
2215    /* Switch on next character until the end of the branch */
2216    
2217    for (;; ptr++)
2218      {
2219      BOOL negate_class;
2220      BOOL possessive_quantifier;
2221      BOOL is_quantifier;
2222      BOOL is_recurse;
2223      BOOL reset_bracount;
2224    int class_charcount;    int class_charcount;
2225    int class_lastchar;    int class_lastchar;
2226    int newoptions;    int newoptions;
2227    int recno;    int recno;
2228      int refsign;
2229    int skipbytes;    int skipbytes;
2230    int subreqbyte;    int subreqbyte;
2231    int subfirstbyte;    int subfirstbyte;
2232      int terminator;
2233    int mclength;    int mclength;
2234    uschar mcbuffer[8];    uschar mcbuffer[8];
2235    
2236    /* Next byte in the pattern */    /* Get next byte in the pattern */
2237    
2238    c = *ptr;    c = *ptr;
2239    
2240      /* If we are in the pre-compile phase, accumulate the length used for the
2241      previous cycle of this loop. */
2242    
2243      if (lengthptr != NULL)
2244        {
2245    #ifdef DEBUG
2246        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2247    #endif
2248        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2249          {
2250          *errorcodeptr = ERR52;
2251          goto FAILED;
2252          }
2253    
2254        /* There is at least one situation where code goes backwards: this is the
2255        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2256        the class is simply eliminated. However, it is created first, so we have to
2257        allow memory for it. Therefore, don't ever reduce the length at this point.
2258        */
2259    
2260        if (code < last_code) code = last_code;
2261        *lengthptr += code - last_code;
2262        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2263    
2264        /* If "previous" is set and it is not at the start of the work space, move
2265        it back to there, in order to avoid filling up the work space. Otherwise,
2266        if "previous" is NULL, reset the current code pointer to the start. */
2267    
2268        if (previous != NULL)
2269          {
2270          if (previous > orig_code)
2271            {
2272            memmove(orig_code, previous, code - previous);
2273            code -= previous - orig_code;
2274            previous = orig_code;
2275            }
2276          }
2277        else code = orig_code;
2278    
2279        /* Remember where this code item starts so we can pick up the length
2280        next time round. */
2281    
2282        last_code = code;
2283        }
2284    
2285      /* In the real compile phase, just check the workspace used by the forward
2286      reference list. */
2287    
2288      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2289        {
2290        *errorcodeptr = ERR52;
2291        goto FAILED;
2292        }
2293    
2294    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2295    
2296    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1623  for (;; ptr++) Line 2305  for (;; ptr++)
2305        {        {
2306        if (previous_callout != NULL)        if (previous_callout != NULL)
2307          {          {
2308          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2309              complete_callout(previous_callout, ptr, cd);
2310          previous_callout = NULL;          previous_callout = NULL;
2311          }          }
2312        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1644  for (;; ptr++) Line 2327  for (;; ptr++)
2327    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2328         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2329      {      {
2330      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2331          complete_callout(previous_callout, ptr, cd);
2332      previous_callout = NULL;      previous_callout = NULL;
2333      }      }
2334    
# Line 1655  for (;; ptr++) Line 2339  for (;; ptr++)
2339      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2340      if (c == '#')      if (c == '#')
2341        {        {
2342        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2343        on the Macintosh. */          {
2344        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2345        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2346          if (*ptr != 0) continue;
2347    
2348          /* Else fall through to handle end of string */
2349          c = 0;
2350        }        }
2351      }      }
2352    
# Line 1672  for (;; ptr++) Line 2360  for (;; ptr++)
2360    
2361    switch(c)    switch(c)
2362      {      {
2363      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2364        case 0:                        /* The branch terminates at string end */
2365      case 0:      case '|':                      /* or | or ) */
     case '|':  
2366      case ')':      case ')':
2367      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2368      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2369      *codeptr = code;      *codeptr = code;
2370      *ptrptr = ptr;      *ptrptr = ptr;
2371        if (lengthptr != NULL)
2372          {
2373          *lengthptr += code - last_code;   /* To include callout length */
2374          DPRINTF((">> end branch\n"));
2375          }
2376      return TRUE;      return TRUE;
2377    
2378    
2379        /* ===================================================================*/
2380      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2381      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2382    
# Line 1711  for (;; ptr++) Line 2405  for (;; ptr++)
2405      *code++ = OP_ANY;      *code++ = OP_ANY;
2406      break;      break;
2407    
2408      /* Character classes. If the included characters are all < 255 in value, we  
2409      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2410      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2411      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2412      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2413        map as usual, then invert it at the end. However, we use a different opcode
2414        so that data characters > 255 can be handled correctly.
2415    
2416      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2417      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1749  for (;; ptr++) Line 2445  for (;; ptr++)
2445        }        }
2446    
2447      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2448      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2449      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2450    
2451      class_charcount = 0;      class_charcount = 0;
2452      class_lastchar = -1;      class_lastchar = -1;
2453    
2454        /* Initialize the 32-char bit map to all zeros. We build the map in a
2455        temporary bit of memory, in case the class contains only 1 character (less
2456        than 256), because in that case the compiled code doesn't use the bit map.
2457        */
2458    
2459        memset(classbits, 0, 32 * sizeof(uschar));
2460    
2461  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2462      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2463      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2464  #endif  #endif
2465    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2466      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2467      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2468      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2469    
2470      do      if (c != 0) do
2471        {        {
2472          const uschar *oldptr;
2473    
2474  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2475        if (utf8 && c > 127)        if (utf8 && c > 127)
2476          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1786  for (;; ptr++) Line 2482  for (;; ptr++)
2482    
2483        if (inescq)        if (inescq)
2484          {          {
2485          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2486            {            {
2487            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2488            ptr++;            ptr++;                            /* Skip the 'E' */
2489            continue;            continue;                         /* Carry on with next */
2490            }            }
2491          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2492          }          }
2493    
2494        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1806  for (;; ptr++) Line 2502  for (;; ptr++)
2502            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2503          {          {
2504          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2505          int posix_class, i;          int posix_class, taboffset, tabopt;
2506          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2507            uschar pbits[32];
2508    
2509          if (ptr[1] != ':')          if (ptr[1] != ':')
2510            {            {
# Line 1836  for (;; ptr++) Line 2533  for (;; ptr++)
2533          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2534            posix_class = 0;            posix_class = 0;
2535    
2536          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2537          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2538          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2539          white space chars afterwards. */          result into the bit map that is being built. */
2540    
2541          posix_class *= 3;          posix_class *= 3;
2542          for (i = 0; i < 3; i++)  
2543            /* Copy in the first table (always present) */
2544    
2545            memcpy(pbits, cbits + posix_class_maps[posix_class],
2546              32 * sizeof(uschar));
2547    
2548            /* If there is a second table, add or remove it as required. */
2549    
2550            taboffset = posix_class_maps[posix_class + 1];
2551            tabopt = posix_class_maps[posix_class + 2];
2552    
2553            if (taboffset >= 0)
2554            {            {
2555            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2556            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2557            else            else
2558              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2559            }            }
2560    
2561            /* Now see if we need to remove any special characters. An option
2562            value of 1 removes vertical space and 2 removes underscore. */
2563    
2564            if (tabopt < 0) tabopt = -tabopt;
2565            if (tabopt == 1) pbits[1] &= ~0x3c;
2566              else if (tabopt == 2) pbits[11] &= 0x7f;
2567    
2568            /* Add the POSIX table or its complement into the main table that is
2569            being built and we are done. */
2570    
2571            if (local_negate)
2572              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2573            else
2574              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2575    
2576          ptr = tempptr + 1;          ptr = tempptr + 1;
2577          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2578          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2579          }          }
2580    
2581        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2582        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2583        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2584        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2585        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2586        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2587    
2588        if (c == '\\')        if (c == '\\')
2589          {          {
2590          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2591            if (*errorcodeptr != 0) goto FAILED;
2592    
2593          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2594          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2595            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2596          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2597            {            {
2598            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1895  for (;; ptr++) Line 2607  for (;; ptr++)
2607            {            {
2608            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2609            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2610            switch (-c)  
2611              /* Save time by not doing this in the pre-compile phase. */
2612    
2613              if (lengthptr == NULL) switch (-c)
2614              {              {
2615              case ESC_d:              case ESC_d:
2616              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1923  for (;; ptr++) Line 2638  for (;; ptr++)
2638              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2639              continue;              continue;
2640    
2641  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
2642              case ESC_p:              continue;
2643              case ESC_P:  
2644                default:    /* Not recognized; fall through */
2645                break;      /* Need "default" setting to stop compiler warning. */
2646                }
2647    
2648              /* In the pre-compile phase, just do the recognition. */
2649    
2650              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2651                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2652    
2653              /* We need to deal with \H, \h, \V, and \v in both phases because
2654              they use extra memory. */
2655    
2656              if (-c == ESC_h)
2657                {
2658                SETBIT(classbits, 0x09); /* HT */
2659                SETBIT(classbits, 0x20); /* SPACE */
2660                SETBIT(classbits, 0xa0); /* NBSP */
2661    #ifdef SUPPORT_UTF8
2662                if (utf8)
2663                {                {
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
2664                class_utf8 = TRUE;                class_utf8 = TRUE;
2665                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2666                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2667                *class_utf8data++ = property;                *class_utf8data++ = XCL_SINGLE;
2668                class_charcount -= 2;   /* Not a < 256 character */                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2669                  *class_utf8data++ = XCL_RANGE;
2670                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2671                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2672                  *class_utf8data++ = XCL_SINGLE;
2673                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2674                  *class_utf8data++ = XCL_SINGLE;
2675                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2676                  *class_utf8data++ = XCL_SINGLE;
2677                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2678                }                }
             continue;  
2679  #endif  #endif
2680                continue;
2681                }
2682    
2683              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
2684              strict mode. By default, for compatibility with Perl, they are              {
2685              treated as literals. */              for (c = 0; c < 32; c++)
2686                  {
2687                  int x = 0xff;
2688                  switch (c)
2689                    {
2690                    case 0x09/8: x ^= 1 << (0x09%8); break;
2691                    case 0x20/8: x ^= 1 << (0x20%8); break;
2692                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2693                    default: break;
2694                    }
2695                  classbits[c] |= x;
2696                  }
2697    
2698              default:  #ifdef SUPPORT_UTF8
2699              if ((options & PCRE_EXTRA) != 0)              if (utf8)
2700                {                {
2701                *errorcodeptr = ERR7;                class_utf8 = TRUE;
2702                goto FAILED;                *class_utf8data++ = XCL_RANGE;
2703                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2704                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2705                  *class_utf8data++ = XCL_RANGE;
2706                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2707                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2708                  *class_utf8data++ = XCL_RANGE;
2709                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2710                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2711                  *class_utf8data++ = XCL_RANGE;
2712                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2713                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2714                  *class_utf8data++ = XCL_RANGE;
2715                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2716                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2717                  *class_utf8data++ = XCL_RANGE;
2718                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2719                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2720                  *class_utf8data++ = XCL_RANGE;
2721                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2722                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2723                }                }
2724              c = *ptr;              /* The final character */  #endif
2725              class_charcount -= 2;  /* Undo the default count from above */              continue;
2726              }              }
           }  
2727    
2728          /* Fall through if we have a single character (c >= 0). This may be            if (-c == ESC_v)
2729          > 256 in UTF-8 mode. */              {
2730                SETBIT(classbits, 0x0a); /* LF */
2731                SETBIT(classbits, 0x0b); /* VT */
2732                SETBIT(classbits, 0x0c); /* FF */
2733                SETBIT(classbits, 0x0d); /* CR */
2734                SETBIT(classbits, 0x85); /* NEL */
2735    #ifdef SUPPORT_UTF8
2736                if (utf8)
2737                  {
2738                  class_utf8 = TRUE;
2739                  *class_utf8data++ = XCL_RANGE;
2740                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2741                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2742                  }
2743    #endif
2744                continue;
2745                }
2746    
2747              if (-c == ESC_V)
2748                {
2749                for (c = 0; c < 32; c++)
2750                  {
2751                  int x = 0xff;
2752                  switch (c)
2753                    {
2754                    case 0x0a/8: x ^= 1 << (0x0a%8);
2755                                 x ^= 1 << (0x0b%8);
2756                                 x ^= 1 << (0x0c%8);
2757                                 x ^= 1 << (0x0d%8);
2758                                 break;
2759                    case 0x85/8: x ^= 1 << (0x85%8); break;
2760                    default: break;
2761                    }
2762                  classbits[c] |= x;
2763                  }
2764    
2765    #ifdef SUPPORT_UTF8
2766                if (utf8)
2767                  {
2768                  class_utf8 = TRUE;
2769                  *class_utf8data++ = XCL_RANGE;
2770                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2771                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2772                  *class_utf8data++ = XCL_RANGE;
2773                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2774                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2775                  }
2776    #endif
2777                continue;
2778                }
2779    
2780              /* We need to deal with \P and \p in both phases. */
2781    
2782    #ifdef SUPPORT_UCP
2783              if (-c == ESC_p || -c == ESC_P)
2784                {
2785                BOOL negated;
2786                int pdata;
2787                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2788                if (ptype < 0) goto FAILED;
2789                class_utf8 = TRUE;
2790                *class_utf8data++ = ((-c == ESC_p) != negated)?
2791                  XCL_PROP : XCL_NOTPROP;
2792                *class_utf8data++ = ptype;
2793                *class_utf8data++ = pdata;
2794                class_charcount -= 2;   /* Not a < 256 character */
2795                continue;
2796                }
2797    #endif
2798              /* Unrecognized escapes are faulted if PCRE is running in its
2799              strict mode. By default, for compatibility with Perl, they are
2800              treated as literals. */
2801    
2802              if ((options & PCRE_EXTRA) != 0)
2803                {
2804                *errorcodeptr = ERR7;
2805                goto FAILED;
2806                }
2807    
2808              class_charcount -= 2;  /* Undo the default count from above */
2809              c = *ptr;              /* Get the final character and fall through */
2810              }
2811    
2812            /* Fall through if we have a single character (c >= 0). This may be
2813            greater than 256 in UTF-8 mode. */
2814    
2815          }   /* End of backslash handling */          }   /* End of backslash handling */
2816    
2817        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2818        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2819        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2820          entirely. The code for handling \Q and \E is messy. */
2821    
2822          CHECK_RANGE:
2823          while (ptr[1] == '\\' && ptr[2] == 'E')
2824            {
2825            inescq = FALSE;
2826            ptr += 2;
2827            }
2828    
2829        if (ptr[1] == '-' && ptr[2] != ']')        oldptr = ptr;
2830    
2831          if (!inescq && ptr[1] == '-')
2832          {          {
2833          int d;          int d;
2834          ptr += 2;          ptr += 2;
2835            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2836    
2837            /* If we hit \Q (not followed by \E) at this point, go into escaped
2838            mode. */
2839    
2840            while (*ptr == '\\' && ptr[1] == 'Q')
2841              {
2842              ptr += 2;
2843              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2844              inescq = TRUE;
2845              break;
2846              }
2847    
2848            if (*ptr == 0 || (!inescq && *ptr == ']'))
2849              {
2850              ptr = oldptr;
2851              goto LONE_SINGLE_CHARACTER;
2852              }
2853    
2854  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2855          if (utf8)          if (utf8)
# Line 1981  for (;; ptr++) Line 2864  for (;; ptr++)
2864          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2865          in such circumstances. */          in such circumstances. */
2866    
2867          if (d == '\\')          if (!inescq && d == '\\')
2868            {            {
2869            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2870            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2871    
2872            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2873            was literal */            special means the '-' was literal */
2874    
2875            if (d < 0)            if (d < 0)
2876              {              {
2877              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2878              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2879                else if (d == -ESC_R) d = 'R'; else
2880                {                {
2881                ptr = oldptr - 2;                ptr = oldptr;
2882                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2883                }                }
2884              }              }
2885            }            }
2886    
2887          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2888          the pre-pass. Optimize one-character ranges */          one-character ranges */
2889    
2890            if (d < c)
2891              {
2892              *errorcodeptr = ERR8;
2893              goto FAILED;
2894              }
2895    
2896          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2897    
# Line 2022  for (;; ptr++) Line 2912  for (;; ptr++)
2912  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2913            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2914              {              {
2915              int occ, ocd;              unsigned int occ, ocd;
2916              int cc = c;              unsigned int cc = c;
2917              int origd = d;              unsigned int origd = d;
2918              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2919                {                {
2920                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2921                      ocd <= (unsigned int)d)
2922                    continue;                          /* Skip embedded ranges */
2923    
2924                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2925                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2926                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2927                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2928                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2929                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2930                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2931                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2932                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2933                  d = ocd;                  d = ocd;
2934                  continue;                  continue;
# Line 2082  for (;; ptr++) Line 2976  for (;; ptr++)
2976          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2977          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2978    
2979          for (; c <= d; c++)          class_charcount += d - c + 1;
2980            class_lastchar = d;
2981    
2982            /* We can save a bit of time by skipping this in the pre-compile. */
2983    
2984            if (lengthptr == NULL) for (; c <= d; c++)
2985            {            {
2986            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2987            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2090  for (;; ptr++) Line 2989  for (;; ptr++)
2989              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2990              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2991              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2992            }            }
2993    
2994          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2115  for (;; ptr++) Line 3012  for (;; ptr++)
3012  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3013          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3014            {            {
3015            int chartype;            unsigned int othercase;
3016            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3017              {              {
3018              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3019              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2143  for (;; ptr++) Line 3038  for (;; ptr++)
3038          }          }
3039        }        }
3040    
3041      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3042      loop. This "while" is the end of the "do" above. */  
3043        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3044    
3045      while ((c = *(++ptr)) != ']' || inescq);      if (c == 0)                          /* Missing terminating ']' */
3046          {
3047          *errorcodeptr = ERR6;
3048          goto FAILED;
3049          }
3050    
3051      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3052      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2210  for (;; ptr++) Line 3110  for (;; ptr++)
3110    
3111      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3112      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3113      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3114    
3115  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3116      if (class_utf8)      if (class_utf8)
# Line 2220  for (;; ptr++) Line 3120  for (;; ptr++)
3120        code += LINK_SIZE;        code += LINK_SIZE;
3121        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3122    
3123        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3124        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3125    
3126        if (class_charcount > 0)        if (class_charcount > 0)
3127          {          {
3128          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3129            memmove(code + 32, code, class_utf8data - code);
3130          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3131          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3132          }          }
3133          else code = class_utf8data;
3134    
3135        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3136    
# Line 2254  for (;; ptr++) Line 3147  for (;; ptr++)
3147      if (negate_class)      if (negate_class)
3148        {        {
3149        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3150        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3151            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3152        }        }
3153      else      else
3154        {        {
# Line 2264  for (;; ptr++) Line 3158  for (;; ptr++)
3158      code += 32;      code += 32;
3159      break;      break;
3160    
3161    
3162        /* ===================================================================*/
3163      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3164      has been tested above. */      has been tested above. */
3165    
# Line 2331  for (;; ptr++) Line 3227  for (;; ptr++)
3227        }        }
3228      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3229    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3230      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3231      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3232      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2378  for (;; ptr++) Line 3260  for (;; ptr++)
3260          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3261          }          }
3262    
3263          /* If the repetition is unlimited, it pays to see if the next thing on
3264          the line is something that cannot possibly match this character. If so,
3265          automatically possessifying this item gains some performance in the case
3266          where the match fails. */
3267    
3268          if (!possessive_quantifier &&
3269              repeat_max < 0 &&
3270              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3271                options, cd))
3272            {
3273            repeat_type = 0;    /* Force greedy */
3274            possessive_quantifier = TRUE;
3275            }
3276    
3277        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3278        }        }
3279    
3280      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3281      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3282      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3283      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3284        currently used only for single-byte chars. */
3285    
3286      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3287        {        {
3288        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3289        c = previous[1];        c = previous[1];
3290          if (!possessive_quantifier &&
3291              repeat_max < 0 &&
3292              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3293            {
3294            repeat_type = 0;    /* Force greedy */
3295            possessive_quantifier = TRUE;
3296            }
3297        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3298        }        }
3299    
# Line 2403  for (;; ptr++) Line 3307  for (;; ptr++)
3307      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3308        {        {
3309        uschar *oldcode;        uschar *oldcode;
3310        int prop_type;        int prop_type, prop_value;
3311        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3312        c = *previous;        c = *previous;
3313    
3314          if (!possessive_quantifier &&
3315              repeat_max < 0 &&
3316              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3317            {
3318            repeat_type = 0;    /* Force greedy */
3319            possessive_quantifier = TRUE;
3320            }
3321    
3322        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3323        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3324          previous[1] : -1;          {
3325            prop_type = previous[1];
3326            prop_value = previous[2];
3327            }
3328          else prop_type = prop_value = -1;
3329    
3330        oldcode = code;        oldcode = code;
3331        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2443  for (;; ptr++) Line 3359  for (;; ptr++)
3359          }          }
3360    
3361        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3362        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3363        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3364        one less than the maximum. */        one less than the maximum. */
3365    
# Line 2470  for (;; ptr++) Line 3386  for (;; ptr++)
3386    
3387          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3388          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3389          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3390          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3391          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3392    
# Line 2486  for (;; ptr++) Line 3402  for (;; ptr++)
3402  #endif  #endif
3403              {              {
3404              *code++ = c;              *code++ = c;
3405              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3406                  {
3407                  *code++ = prop_type;
3408                  *code++ = prop_value;
3409                  }
3410              }              }
3411            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3412            }            }
3413    
3414          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3415          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3416            UPTO is just for 1 instance, we can use QUERY instead. */
3417    
3418          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3419            {            {
# Line 2505  for (;; ptr++) Line 3426  for (;; ptr++)
3426            else            else
3427  #endif  #endif
3428            *code++ = c;            *code++ = c;
3429            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3430                {
3431                *code++ = prop_type;
3432                *code++ = prop_value;
3433                }
3434            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3435            *code++ = OP_UPTO + repeat_type;  
3436            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3437                {
3438                *code++ = OP_QUERY + repeat_type;
3439                }
3440              else
3441                {
3442                *code++ = OP_UPTO + repeat_type;
3443                PUT2INC(code, 0, repeat_max);
3444                }
3445            }            }
3446          }          }
3447    
# Line 2524  for (;; ptr++) Line 3457  for (;; ptr++)
3457  #endif  #endif
3458        *code++ = c;        *code++ = c;
3459    
3460        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3461        defines the required property. */        define the required property. */
3462    
3463  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3464        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3465            {
3466            *code++ = prop_type;
3467            *code++ = prop_value;
3468            }
3469  #endif  #endif
3470        }        }
3471    
# Line 2571  for (;; ptr++) Line 3508  for (;; ptr++)
3508      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3509      cases. */      cases. */
3510    
3511      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3512               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3513        {        {
3514        register int i;        register int i;
3515        int ketoffset = 0;        int ketoffset = 0;
3516        int len = code - previous;        int len = code - previous;
3517        uschar *bralink = NULL;        uschar *bralink = NULL;
3518    
3519          /* Repeating a DEFINE group is pointless */
3520    
3521          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3522            {
3523            *errorcodeptr = ERR55;
3524            goto FAILED;
3525            }
3526    
3527          /* This is a paranoid check to stop integer overflow later on */
3528    
3529          if (len > MAX_DUPLENGTH)
3530            {
3531            *errorcodeptr = ERR50;
3532            goto FAILED;
3533            }
3534    
3535        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3536        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3537        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2613  for (;; ptr++) Line 3566  for (;; ptr++)
3566          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3567          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3568          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3569          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3570          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3571            doing this. */
3572    
3573          if (repeat_max <= 1)          if (repeat_max <= 1)
3574            {            {
3575            *code = OP_END;            *code = OP_END;
3576            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3577            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3578            code++;            code++;
3579            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2637  for (;; ptr++) Line 3591  for (;; ptr++)
3591            {            {
3592            int offset;            int offset;
3593            *code = OP_END;            *code = OP_END;
3594            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3595            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3596            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3597            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3611  for (;; ptr++)
3611        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3612        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3613        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3614        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3615          forward reference subroutine calls in the group, there will be entries on
3616          the workspace list; replicate these with an appropriate increment. */
3617    
3618        else        else
3619          {          {
3620          if (repeat_min > 1)          if (repeat_min > 1)
3621            {            {
3622            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3623            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3624    
3625              if (lengthptr != NULL)
3626                *lengthptr += (repeat_min - 1)*length_prevgroup;
3627    
3628              /* This is compiling for real */
3629    
3630              else
3631              {              {
3632              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3633              code += len;              for (i = 1; i < repeat_min; i++)
3634                  {
3635                  uschar *hc;
3636                  uschar *this_hwm = cd->hwm;
3637                  memcpy(code, previous, len);
3638                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3639                    {
3640                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3641                    cd->hwm += LINK_SIZE;
3642                    }
3643                  save_hwm = this_hwm;
3644                  code += len;
3645                  }
3646              }              }
3647            }            }
3648    
3649          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3650          }          }
3651    
# Line 2677  for (;; ptr++) Line 3653  for (;; ptr++)
3653        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3654        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3655        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3656        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3657          replicate entries on the forward reference list. */
3658    
3659        if (repeat_max >= 0)        if (repeat_max >= 0)
3660          {          {
3661          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3662            just adjust the length as if we had. For each repetition we must add 1
3663            to the length for BRAZERO and for all but the last repetition we must
3664            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3665    
3666            if (lengthptr != NULL && repeat_max > 0)
3667              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3668                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3669    
3670            /* This is compiling for real */
3671    
3672            else for (i = repeat_max - 1; i >= 0; i--)
3673            {            {
3674              uschar *hc;
3675              uschar *this_hwm = cd->hwm;
3676    
3677            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3678    
3679            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2698  for (;; ptr++) Line 3689  for (;; ptr++)
3689              }              }
3690    
3691            memcpy(code, previous, len);            memcpy(code, previous, len);
3692              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3693                {
3694                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3695                cd->hwm += LINK_SIZE;
3696                }
3697              save_hwm = this_hwm;
3698            code += len;            code += len;
3699            }            }
3700    
# Line 2720  for (;; ptr++) Line 3717  for (;; ptr++)
3717        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3718        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3719        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3720        correct offset was computed above. */        correct offset was computed above.
3721    
3722          Then, when we are doing the actual compile phase, check to see whether
3723          this group is a non-atomic one that could match an empty string. If so,
3724          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3725          that runtime checking can be done. [This check is also applied to
3726          atomic groups at runtime, but in a different way.] */
3727    
3728        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3729            {
3730            uschar *ketcode = code - ketoffset;
3731            uschar *bracode = ketcode - GET(ketcode, 1);
3732            *ketcode = OP_KETRMAX + repeat_type;
3733            if (lengthptr == NULL && *bracode != OP_ONCE)
3734              {
3735              uschar *scode = bracode;
3736              do
3737                {
3738                if (could_be_empty_branch(scode, ketcode, utf8))
3739                  {
3740                  *bracode += OP_SBRA - OP_BRA;
3741                  break;
3742                  }
3743                scode += GET(scode, 1);
3744                }
3745              while (*scode == OP_ALT);
3746              }
3747            }
3748        }        }
3749    
3750      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2733  for (;; ptr++) Line 3755  for (;; ptr++)
3755        goto FAILED;        goto FAILED;
3756        }        }
3757    
3758      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3759      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3760      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is an special alternative opcode for this. For
3761      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3762      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3763        but the special opcodes can optimize it a bit. The repeated item starts at
3764        tempcode, not at previous, which might be the first part of a string whose
3765        (former) last char we repeated.
3766    
3767        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3768        an 'upto' may follow. We skip over an 'exact' item, and then test the
3769        length of what remains before proceeding. */
3770    
3771      if (possessive_quantifier)      if (possessive_quantifier)
3772        {        {
3773        int len = code - tempcode;        int len;
3774        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3775        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3776        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3777        tempcode[0] = OP_ONCE;        len = code - tempcode;
3778        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3779        PUTINC(code, 0, len);          {
3780        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3781            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3782            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3783            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3784    
3785            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3786            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3787            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3788            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3789    
3790            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3791            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3792            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3793            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3794    
3795            default:
3796            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3797            code += 1 + LINK_SIZE;
3798            len += 1 + LINK_SIZE;
3799            tempcode[0] = OP_ONCE;
3800            *code++ = OP_KET;
3801            PUTINC(code, 0, len);
3802            PUT(tempcode, 1, len);
3803            break;
3804            }
3805        }        }
3806    
3807      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2761  for (;; ptr++) Line 3814  for (;; ptr++)
3814      break;      break;
3815    
3816    
3817      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3818      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3819      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3820      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3821      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3822      check for syntax errors here.  */      group. */
3823    
3824      case '(':      case '(':
3825      newoptions = options;      newoptions = options;
3826      skipbytes = 0;      skipbytes = 0;
3827        bravalue = OP_CBRA;
3828        save_hwm = cd->hwm;
3829        reset_bracount = FALSE;
3830    
3831      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3832        {        {
3833        int set, unset;        int i, set, unset, namelen;
3834        int *optset;        int *optset;
3835          const uschar *name;
3836          uschar *slot;
3837    
3838        switch (*(++ptr))        switch (*(++ptr))
3839          {          {
3840          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3841          ptr++;          ptr++;
3842          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3843            if (*ptr == 0)
3844              {
3845              *errorcodeptr = ERR18;
3846              goto FAILED;
3847              }
3848          continue;          continue;
3849    
3850          case ':':                 /* Non-extracting bracket */  
3851            /* ------------------------------------------------------------ */
3852            case '|':                 /* Reset capture count for each branch */
3853            reset_bracount = TRUE;
3854            /* Fall through */
3855    
3856            /* ------------------------------------------------------------ */
3857            case ':':                 /* Non-capturing bracket */
3858          bravalue = OP_BRA;          bravalue = OP_BRA;
3859          ptr++;          ptr++;
3860          break;          break;
3861    
3862    
3863            /* ------------------------------------------------------------ */
3864          case '(':          case '(':
3865          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3866    
3867          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3868            group), a name (referring to a named group), or 'R', referring to
3869            recursion. R<digits> and R&name are also permitted for recursion tests.
3870    
3871            There are several syntaxes for testing a named group: (?(name)) is used
3872            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3873    
3874            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3875            be the recursive thing or the name 'R' (and similarly for 'R' followed
3876            by digits), and (b) a number could be a name that consists of digits.
3877            In both cases, we look for a name first; if not found, we try the other
3878            cases. */
3879    
3880            /* For conditions that are assertions, check the syntax, and then exit
3881            the switch. This will take control down to where bracketed groups,
3882            including assertions, are processed. */
3883    
3884            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3885              break;
3886    
3887            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3888            below), and all need to skip 3 bytes at the start of the group. */
3889    
3890            code[1+LINK_SIZE] = OP_CREF;
3891            skipbytes = 3;
3892            refsign = -1;
3893    
3894            /* Check for a test for recursion in a named group. */
3895    
3896            if (ptr[1] == 'R' && ptr[2] == '&')
3897              {
3898              terminator = -1;
3899              ptr += 2;
3900              code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3901              }
3902    
3903            /* Check for a test for a named group's having been set, using the Perl
3904            syntax (?(<name>) or (?('name') */
3905    
3906            else if (ptr[1] == '<')
3907              {
3908              terminator = '>';
3909              ptr++;
3910              }
3911            else if (ptr[1] == '\'')
3912              {
3913              terminator = '\'';
3914              ptr++;
3915              }
3916            else
3917              {
3918              terminator = 0;
3919              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3920              }
3921    
3922            /* We now expect to read a name; anything else is an error */
3923    
3924            if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3925              {
3926              ptr += 1;  /* To get the right offset */
3927              *errorcodeptr = ERR28;
3928              goto FAILED;
3929              }
3930    
3931            /* Read the name, but also get it as a number if it's all digits */
3932    
3933          if (ptr[1] == 'R')          recno = 0;
3934            name = ++ptr;
3935            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3936              {
3937              if (recno >= 0)
3938                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3939                  recno * 10 + *ptr - '0' : -1;
3940              ptr++;
3941              }
3942            namelen = ptr - name;
3943    
3944            if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3945            {            {
3946            code[1+LINK_SIZE] = OP_CREF;            ptr--;      /* Error offset */
3947            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            *errorcodeptr = ERR26;
3948            skipbytes = 3;            goto FAILED;
           ptr += 3;  
3949            }            }
3950    
3951          /* Condition to test for a numbered subpattern match. We know that          /* Do no further checking in the pre-compile phase. */
         if a digit follows ( then there will just be digits until ) because  
         the syntax was checked in the first pass. */  
3952    
3953          else if ((digitab[ptr[1]] && ctype_digit) != 0)          if (lengthptr != NULL) break;
3954    
3955            /* In the real compile we do the work of looking for the actual
3956            reference. If the string started with "+" or "-" we require the rest to
3957            be digits, in which case recno will be set. */
3958    
3959            if (refsign > 0)
3960            {            {
3961            int condref;                 /* Don't amalgamate; some compilers */            if (recno <= 0)
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
3962              {              {
3963              *errorcodeptr = ERR35;              *errorcodeptr = ERR58;
3964              goto FAILED;              goto FAILED;
3965              }              }
3966            ptr++;            if (refsign == '-')
3967            code[1+LINK_SIZE] = OP_CREF;              {
3968            PUT2(code, 2+LINK_SIZE, condref);              recno = cd->bracount - recno + 1;
3969            skipbytes = 3;              if (recno <= 0)
3970                  {
3971                  *errorcodeptr = ERR15;
3972                  goto FAILED;
3973                  }
3974                }
3975              else recno += cd->bracount;
3976              PUT2(code, 2+LINK_SIZE, recno);
3977              break;
3978              }
3979    
3980            /* Otherwise (did not start with "+" or "-"), start by looking for the
3981            name. */
3982    
3983            slot = cd->name_table;
3984            for (i = 0; i < cd->names_found; i++)
3985              {
3986              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3987              slot += cd->name_entry_size;
3988              }
3989    
3990            /* Found a previous named subpattern */
3991    
3992            if (i < cd->names_found)
3993              {
3994              recno = GET2(slot, 0);
3995              PUT2(code, 2+LINK_SIZE, recno);
3996              }
3997    
3998            /* Search the pattern for a forward reference */
3999    
4000            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4001                            (options & PCRE_EXTENDED) != 0)) > 0)
4002              {
4003              PUT2(code, 2+LINK_SIZE, i);
4004              }
4005    
4006            /* If terminator == 0 it means that the name followed directly after
4007            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4008            some further alternatives to try. For the cases where terminator != 0
4009            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4010            now checked all the possibilities, so give an error. */
4011    
4012            else if (terminator != 0)
4013              {
4014              *errorcodeptr = ERR15;
4015              goto FAILED;
4016              }
4017    
4018            /* Check for (?(R) for recursion. Allow digits after R to specify a
4019            specific group number. */
4020    
4021            else if (*name == 'R')
4022              {
4023              recno = 0;
4024              for (i = 1; i < namelen; i++)
4025                {
4026                if ((digitab[name[i]] & ctype_digit) == 0)
4027                  {
4028                  *errorcodeptr = ERR15;
4029                  goto FAILED;
4030                  }
4031                recno = recno * 10 + name[i] - '0';
4032                }
4033              if (recno == 0) recno = RREF_ANY;
4034              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4035              PUT2(code, 2+LINK_SIZE, recno);
4036              }
4037    
4038            /* Similarly, check for the (?(DEFINE) "condition", which is always
4039            false. */
4040    
4041            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4042              {
4043              code[1+LINK_SIZE] = OP_DEF;
4044              skipbytes = 1;
4045              }
4046    
4047            /* Check for the "name" actually being a subpattern number. */
4048    
4049            else if (recno > 0)
4050              {
4051              PUT2(code, 2+LINK_SIZE, recno);
4052              }
4053    
4054            /* Either an unidentified subpattern, or a reference to (?(0) */
4055    
4056            else
4057              {
4058              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4059              goto FAILED;
4060            }            }
         /* For conditions that are assertions, we just fall through, having  
         set bravalue above. */  
4061          break;          break;
4062    
4063    
4064            /* ------------------------------------------------------------ */
4065          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
4066          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
4067          ptr++;          ptr++;
4068          break;          break;
4069    
4070    
4071            /* ------------------------------------------------------------ */
4072          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
4073          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
4074          ptr++;          ptr++;
4075          break;          break;
4076    
4077          case '<':                 /* Lookbehinds */  
4078          switch (*(++ptr))          /* ------------------------------------------------------------ */
4079            case '<':                 /* Lookbehind or named define */
4080            switch (ptr[1])
4081            {            {
4082            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
4083            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
4084            ptr++;            ptr += 2;
4085            break;            break;
4086    
4087            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
4088            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
4089            ptr++;            ptr += 2;
4090            break;            break;
4091    
4092              default:                /* Could be name define, else bad */
4093              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4094              ptr++;                  /* Correct offset for error */
4095              *errorcodeptr = ERR24;
4096              goto FAILED;
4097            }            }
4098          break;          break;
4099    
4100    
4101            /* ------------------------------------------------------------ */
4102          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
4103          bravalue = OP_ONCE;          bravalue = OP_ONCE;
4104          ptr++;          ptr++;
4105          break;          break;
4106    
4107    
4108            /* ------------------------------------------------------------ */
4109          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
4110          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
4111          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
4112          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
4113            {                       /* closing parenthesis is present. */            {
4114            int n = 0;            int n = 0;
4115            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
4116              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
4117              if (*ptr != ')')
4118                {
4119                *errorcodeptr = ERR39;
4120                goto FAILED;
4121                }
4122            if (n > 255)            if (n > 255)
4123              {              {
4124              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 2876  for (;; ptr++) Line 4132  for (;; ptr++)
4132          previous = NULL;          previous = NULL;
4133          continue;          continue;
4134    
4135          case 'P':                 /* Named subpattern handling */  
4136          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4137            case 'P':                 /* Python-style named subpattern handling */
4138            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4139              {
4140              is_recurse = *ptr == '>';
4141              terminator = ')';
4142              goto NAMED_REF_OR_RECURSE;
4143              }
4144            else if (*ptr != '<')    /* Test for Python-style definition */
4145              {
4146              *errorcodeptr = ERR41;
4147              goto FAILED;
4148              }
4149            /* Fall through to handle (?P< as (?< is handled */
4150    
4151    
4152            /* ------------------------------------------------------------ */
4153            DEFINE_NAME:    /* Come here from (?< handling */
4154            case '\'':
4155            {            {
4156            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
4157            uschar *slot = cd->name_table;            name = ++ptr;
4158            const uschar *name;     /* Don't amalgamate; some compilers */  
4159            name = ++ptr;           /* grumble at autoincrement in declaration */            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4160              namelen = ptr - name;
4161    
4162            while (*ptr++ != '>');            /* In the pre-compile phase, just do a syntax check. */
           namelen = ptr - name - 1;  
4163    
4164            for (i = 0; i < cd->names_found; i++)            if (lengthptr != NULL)
4165              {              {
4166              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
             if (crc == 0)  
4167                {                {
4168                if (slot[2+namelen] == 0)                *errorcodeptr = ERR42;
4169                  goto FAILED;
4170                  }
4171                if (cd->names_found >= MAX_NAME_COUNT)
4172                  {
4173                  *errorcodeptr = ERR49;
4174                  goto FAILED;
4175                  }
4176                if (namelen + 3 > cd->name_entry_size)
4177                  {
4178                  cd->name_entry_size = namelen + 3;
4179                  if (namelen > MAX_NAME_SIZE)
4180                  {                  {
4181                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
4182                  goto FAILED;                  goto FAILED;
4183                  }                  }
               crc = -1;             /* Current name is substring */  
4184                }                }
             if (crc < 0)  
               {  
               memmove(slot + cd->name_entry_size, slot,  
                 (cd->names_found - i) * cd->name_entry_size);  
               break;  
               }  
             slot += cd->name_entry_size;  
4185              }              }
4186    
4187            PUT2(slot, 0, *brackets + 1);            /* In the real compile, create the entry in the table */
4188            memcpy(slot + 2, name, namelen);  
4189            slot[2+namelen] = 0;            else
4190            cd->names_found++;              {
4191            goto NUMBERED_GROUP;              slot = cd->name_table;
4192                for (i = 0; i < cd->names_found; i++)
4193                  {
4194                  int crc = memcmp(name, slot+2, namelen);
4195                  if (crc == 0)
4196                    {
4197                    if (slot[2+namelen] == 0)
4198                      {
4199                      if ((options & PCRE_DUPNAMES) == 0)
4200                        {
4201                        *errorcodeptr = ERR43;
4202                        goto FAILED;
4203                        }
4204                      }
4205                    else crc = -1;      /* Current name is substring */
4206                    }
4207                  if (crc < 0)
4208                    {
4209                    memmove(slot + cd->name_entry_size, slot,
4210                      (cd->names_found - i) * cd->name_entry_size);
4211                    break;
4212                    }
4213                  slot += cd->name_entry_size;
4214                  }
4215    
4216                PUT2(slot, 0, cd->bracount + 1);
4217                memcpy(slot + 2, name, namelen);
4218                slot[2+namelen] = 0;
4219                }
4220            }            }
4221    
4222          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4223    
4224            ptr++;                    /* Move past > or ' */
4225            cd->names_found++;
4226            goto NUMBERED_GROUP;
4227    
4228    
4229            /* ------------------------------------------------------------ */
4230            case '&':                 /* Perl recursion/subroutine syntax */
4231            terminator = ')';
4232            is_recurse = TRUE;
4233            /* Fall through */
4234    
4235            /* We come here from the Python syntax above that handles both
4236            references (?P=name) and recursion (?P>name), as well as falling
4237            through from the Perl recursion syntax (?&name). */
4238    
4239            NAMED_REF_OR_RECURSE:
4240            name = ++ptr;
4241            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4242            namelen = ptr - name;
4243    
4244            /* In the pre-compile phase, do a syntax check and set a dummy
4245            reference number. */
4246    
4247            if (lengthptr != NULL)
4248            {            {
4249            int i, namelen;            if (*ptr != terminator)
4250            int type = *ptr++;              {
4251            const uschar *name = ptr;              *errorcodeptr = ERR42;
4252            uschar *slot = cd->name_table;              goto FAILED;
4253                }
4254              if (namelen > MAX_NAME_SIZE)
4255                {
4256                *errorcodeptr = ERR48;
4257                goto FAILED;
4258                }
4259              recno = 0;
4260              }
4261    
4262            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4263    
4264            else
4265              {
4266              slot = cd->name_table;
4267            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4268              {              {
4269              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4270              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4271              }              }
4272            if (i >= cd->names_found)  
4273              if (i < cd->names_found)         /* Back reference */
4274                {
4275                recno = GET2(slot, 0);
4276                }
4277              else if ((recno =                /* Forward back reference */
4278                        find_parens(ptr, cd->bracount, name, namelen,
4279                          (options & PCRE_EXTENDED) != 0)) <= 0)
4280              {              {
4281              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4282              goto FAILED;              goto FAILED;
4283              }              }
4284              }
4285    
4286            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4287            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4288    
4289            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4290            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4291    
         /* Should never happen */  
         break;  
4292    
4293          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4294            case 'R':                 /* Recursion */
4295          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4296          /* Fall through */          /* Fall through */
4297    
         /* Recursion or "subroutine" call */  
4298    
4299          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4300          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4301            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4302            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4303            {            {
4304            const uschar *called;            const uschar *called;
4305    
4306              if ((refsign = *ptr) == '+') ptr++;
4307              else if (refsign == '-')
4308                {
4309                if ((digitab[ptr[1]] & ctype_digit) == 0)
4310                  goto OTHER_CHAR_AFTER_QUERY;
4311                ptr++;
4312                }
4313    
4314            recno = 0;            recno = 0;
4315            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4316              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4317    
4318              if (*ptr != ')')
4319                {
4320                *errorcodeptr = ERR29;
4321                goto FAILED;
4322                }
4323    
4324              if (refsign == '-')
4325                {
4326                if (recno == 0)
4327                  {
4328                  *errorcodeptr = ERR58;
4329                  goto FAILED;
4330                  }
4331                recno = cd->bracount - recno + 1;
4332                if (recno <= 0)
4333                  {
4334                  *errorcodeptr = ERR15;
4335                  goto FAILED;
4336                  }
4337                }
4338              else if (refsign == '+')
4339                {
4340                if (recno == 0)
4341                  {
4342                  *errorcodeptr = ERR58;
4343                  goto FAILED;
4344                  }
4345                recno += cd->bracount;
4346                }
4347    
4348            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4349    
4350            HANDLE_RECURSION:            HANDLE_RECURSION:
4351    
4352            previous = code;            previous = code;
4353              called = cd->start_code;
4354    
4355            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4356            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4357              this point. If we end up with a forward reference, first check that
4358              the bracket does occur later so we can give the error (and position)
4359              now. Then remember this forward reference in the workspace so it can
4360              be filled in at the end. */
4361    
4362            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)?  
             cd->start_code : find_bracket(cd->start_code, utf8, recno);  
   
           if (called == NULL)  
4363              {              {
4364              *errorcodeptr = ERR15;              *code = OP_END;
4365              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4366    
4367            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4368    
4369            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4370              {                {
4371              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4372              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4373                    {
4374                    *errorcodeptr = ERR15;
4375                    goto FAILED;
4376                    }
4377                  called = cd->start_code + recno;
4378                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4379                  }
4380    
4381                /* If not a forward reference, and the subpattern is still open,
4382                this is a recursive call. We check to see if this is a left
4383                recursion that could loop for ever, and diagnose that case. */
4384    
4385                else if (GET(called, 1) == 0 &&
4386                         could_be_empty(called, code, bcptr, utf8))
4387                  {
4388                  *errorcodeptr = ERR40;
4389                  goto FAILED;
4390                  }
4391              }              }
4392    
4393            /* Insert the recursion/subroutine item */            /* Insert the recursion/subroutine item, automatically wrapped inside
4394              "once" brackets. Set up a "previous group" length so that a
4395              subsequent quantifier will work. */
4396    
4397              *code = OP_ONCE;
4398              PUT(code, 1, 2 + 2*LINK_SIZE);
4399              code += 1 + LINK_SIZE;
4400    
4401            *code = OP_RECURSE;            *code = OP_RECURSE;
4402            PUT(code, 1, called - cd->start_code);            PUT(code, 1, called - cd->start_code);
4403            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4404    
4405              *code = OP_KET;
4406              PUT(code, 1, 2 + 2*LINK_SIZE);
4407              code += 1 + LINK_SIZE;
4408    
4409              length_prevgroup = 3 + 3*LINK_SIZE;
4410            }            }
4411    
4412            /* Can't determine a first byte now */
4413    
4414            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4415          continue;          continue;
4416    
         /* Character after (? not specially recognized */  
4417    
4418          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4419            default:              /* Other characters: check option setting */
4420            OTHER_CHAR_AFTER_QUERY:
4421          set = unset = 0;          set = unset = 0;
4422          optset = &set;          optset = &set;
4423    
# Line 3016  for (;; ptr++) Line 4427  for (;; ptr++)
4427              {              {
4428              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4429    
4430                case 'J':    /* Record that it changed in the external options */
4431                *optset |= PCRE_DUPNAMES;
4432                cd->external_options |= PCRE_JCHANGED;
4433                break;
4434    
4435              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4436              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4437              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4438              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4439              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4440              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4441    
4442                default:  *errorcodeptr = ERR12;
4443                          ptr--;    /* Correct the offset */
4444                          goto FAILED;
4445              }              }
4446            }            }
4447    
# Line 3030  for (;; ptr++) Line 4450  for (;; ptr++)
4450          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4451    
4452          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4453          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4454          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4455          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4456          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4457          a group), a resetting item can be compiled.          caseless checking of required bytes.
4458    
4459          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4460          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4461          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4462            that value after the start, because it gets reset as code is discarded
4463            during the pre-compile. However, this can happen only at top level - if
4464            we are within parentheses, the starting BRA will still be present. At
4465            any parenthesis level, the length value can be used to test if anything
4466            has been compiled at that level. Thus, a test for both these conditions
4467            is necessary to ensure we correctly detect the start of the pattern in
4468            both phases.
4469    
4470            If we are not at the pattern start, compile code to change the ims
4471            options if this setting actually changes any of them. We also pass the
4472            new setting back so that it can be put at the start of any following
4473            branches, and when this group ends (if we are in a group), a resetting
4474            item can be compiled. */
4475    
4476          if (*ptr == ')')          if (*ptr == ')')
4477            {            {
4478            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4479                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4480              {              {
4481              *code++ = OP_OPT;              cd->external_options = newoptions;
4482              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4483              }              }
4484             else
4485                {
4486                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4487                  {
4488                  *code++ = OP_OPT;
4489                  *code++ = newoptions & PCRE_IMS;
4490                  }
4491    
4492            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4493            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4494            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4495    
4496            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4497            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4498            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4499            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4500                }
4501    
4502            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4503            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3068  for (;; ptr++) Line 4510  for (;; ptr++)
4510    
4511          bravalue = OP_BRA;          bravalue = OP_BRA;
4512          ptr++;          ptr++;
4513          }          }     /* End of switch for character following (? */
4514        }        }       /* End of (? handling */
4515    
4516      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4517      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4518        brackets. */
4519    
4520      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4521        {        {
4522        bravalue = OP_BRA;        bravalue = OP_BRA;
4523        }        }
4524    
4525      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4526    
4527      else      else
4528        {        {
4529        NUMBERED_GROUP:        NUMBERED_GROUP:
4530        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4531          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4532          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4533        }        }
4534    
4535      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4536      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4537      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4538      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4539        they have changed. */
4540    
4541      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4542      *code = bravalue;      *code = bravalue;
4543      tempcode = code;      tempcode = code;
4544      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4545        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4546    
4547      if (!compile_regex(      if (!compile_regex(
4548           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4549           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4550           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4551           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4552           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4553           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4554            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4555           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           reset_bracount,               /* True if (?| group */
4556             skipbytes,                    /* Skip over bracket number */
4557           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4558           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4559           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4560           cd))                          /* Tables block */           cd,                           /* Tables block */
4561             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4562               &length_prevgroup           /* Pre-compile phase */
4563             ))
4564        goto FAILED;        goto FAILED;
4565    
4566      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3128  for (;; ptr++) Line 4569  for (;; ptr++)
4569      is on the bracket. */      is on the bracket. */
4570    
4571      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4572      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4573        in the real compile phase, not in the pre-pass, where the whole group may
4574        not be available. */
4575    
4576      else if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4577        {        {
4578        uschar *tc = code;        uschar *tc = code;
4579        condcount = 0;        int condcount = 0;
4580    
4581        do {        do {
4582           condcount++;           condcount++;
# Line 3141  for (;; ptr++) Line 4584  for (;; ptr++)
4584           }           }
4585        while (*tc != OP_KET);        while (*tc != OP_KET);
4586    
4587        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4588          false). It must have only one branch. */
4589    
4590          if (code[LINK_SIZE+1] == OP_DEF)
4591          {          {
4592          *errorcodeptr = ERR27;          if (condcount > 1)
4593          goto FAILED;            {
4594              *errorcodeptr = ERR54;
4595              goto FAILED;
4596              }
4597            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4598            }
4599    
4600          /* A "normal" conditional group. If there is just one branch, we must not
4601          make use of its firstbyte or reqbyte, because this is equivalent to an
4602          empty second branch. */
4603    
4604          else
4605            {
4606            if (condcount > 2)
4607              {
4608              *errorcodeptr = ERR27;
4609              goto FAILED;
4610              }
4611            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4612          }          }
4613          }
4614    
4615        /* Error if hit end of pattern */
4616    
4617        /* If there is just one branch, we must not make use of its firstbyte or      if (*ptr != ')')
4618        reqbyte, because this is equivalent to an empty second branch. */        {
4619          *errorcodeptr = ERR14;
4620          goto FAILED;
4621          }
4622    
4623        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4624        group, less the brackets at either end. Then reduce the compiled code to
4625        just the brackets so that it doesn't use much memory if it is duplicated by
4626        a quantifier. */
4627    
4628        if (lengthptr != NULL)
4629          {
4630          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4631          code++;
4632          PUTINC(code, 0, 1 + LINK_SIZE);
4633          *code++ = OP_KET;
4634          PUTINC(code, 0, 1 + LINK_SIZE);
4635        }        }
4636    
4637      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4638      brackets of all kinds, and conditions with two branches (see code above).  
4639      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4640      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4641      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4642        relevant. */
4643    
4644        if (bravalue == OP_DEF) break;
4645    
4646        /* Handle updating of the required and first characters for other types of
4647        group. Update for normal brackets of all kinds, and conditions with two
4648        branches (see code above). If the bracket is followed by a quantifier with
4649        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4650        zerofirstbyte outside the main loop so that they can be accessed for the
4651        back off. */
4652    
4653      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4654      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4655      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4656    
4657      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4658        {        {
4659        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4660        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3204  for (;; ptr++) Line 4695  for (;; ptr++)
4695      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4696    
4697      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4698        break;     /* End of processing '(' */
4699    
     /* Now update the main code pointer to the end of the group. */