/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 79 by nigel, Sat Feb 24 21:40:52 2007 UTC | revision 335 by ph10, Sat Apr 12 14:36:14 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56    /* When DEBUG is defined, we need the pcre_printint() function, which is also
57    used by pcretest. DEBUG is not defined when building a production library. */
58    
59    #ifdef DEBUG
60    #include "pcre_printint.src"
61    #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 87  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 106  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143  /* Tables of names of POSIX character classes and their lengths. The list is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144  terminated by a zero length entry. The first three must be alpha, upper, lower,  searched linearly. Put all the names into a single string, in order to reduce
145  as this is assumed for handling case independence. */  the number of relocations when a shared library is dynamically linked. */
146    
147  static const char *const posix_names[] = {  typedef struct verbitem {
148    "alpha", "lower", "upper",    int   len;
149    "alnum", "ascii", "blank", "cntrl", "digit", "graph",    int   op;
150    "print", "punct", "space", "word",  "xdigit" };  } verbitem;
151    
152    static const char verbnames[] =
153      "ACCEPT\0"
154      "COMMIT\0"
155      "F\0"
156      "FAIL\0"
157      "PRUNE\0"
158      "SKIP\0"
159      "THEN";
160    
161    static const verbitem verbs[] = {
162      { 6, OP_ACCEPT },
163      { 6, OP_COMMIT },
164      { 1, OP_FAIL },
165      { 4, OP_FAIL },
166      { 5, OP_PRUNE },
167      { 4, OP_SKIP  },
168      { 4, OP_THEN  }
169    };
170    
171    static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174    /* Tables of names of POSIX character classes and their lengths. The names are
175    now all in a single string, to reduce the number of relocations when a shared
176    library is dynamically loaded. The list of lengths is terminated by a zero
177    length entry. The first three must be alpha, lower, upper, as this is assumed
178    for handling case independence. */
179    
180    static const char posix_names[] =
181      "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
182      "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
183      "word\0"   "xdigit";
184    
185  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
186    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
189  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
190  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
191    characters are removed, and for [:alpha:] and [:alnum:] the underscore
192    character is removed. The triples in the table consist of the base map offset,
193    second map offset or -1 if no second map, and a non-negative value for map
194    addition or a negative value for map subtraction (if there are two maps). The
195    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196    remove vertical space characters, 2 => remove underscore. */
197    
198  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
199    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
200    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
201    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
202    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
203    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
204    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
205    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
206    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
207    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
208    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
209    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
210    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
211    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
212    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
213  };  };
214    
215    
216  /* The texts of compile-time error messages. These are "char *" because they  #define STRING(a)  # a
217  are passed to the outside world. */  #define XSTRING(s) STRING(s)
218    
219  static const char *error_texts[] = {  /* The texts of compile-time error messages. These are "char *" because they
220    "no error",  are passed to the outside world. Do not ever re-use any error number, because
221    "\\ at end of pattern",  they are documented. Always add a new error instead. Messages marked DEAD below
222    "\\c at end of pattern",  are no longer used. This used to be a table of strings, but in order to reduce
223    "unrecognized character follows \\",  the number of relocations needed when a shared library is loaded dynamically,
224    "numbers out of order in {} quantifier",  it is now one long string. We cannot use a table of offsets, because the
225    lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226    simply count through to the one we want - this isn't a performance issue
227    because these strings are used only when there is a compilation error. */
228    
229    static const char error_texts[] =
230      "no error\0"
231      "\\ at end of pattern\0"
232      "\\c at end of pattern\0"
233      "unrecognized character follows \\\0"
234      "numbers out of order in {} quantifier\0"
235    /* 5 */    /* 5 */
236    "number too big in {} quantifier",    "number too big in {} quantifier\0"
237    "missing terminating ] for character class",    "missing terminating ] for character class\0"
238    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
239    "range out of order in character class",    "range out of order in character class\0"
240    "nothing to repeat",    "nothing to repeat\0"
241    /* 10 */    /* 10 */
242    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
243    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
244    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
245    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
246    "missing )",    "missing )\0"
247    /* 15 */    /* 15 */
248    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
249    "erroffset passed as NULL",    "erroffset passed as NULL\0"
250    "unknown option bit(s) set",    "unknown option bit(s) set\0"
251    "missing ) after comment",    "missing ) after comment\0"
252    "parentheses nested too deeply",    "parentheses nested too deeply\0"  /** DEAD **/
253    /* 20 */    /* 20 */
254    "regular expression too large",    "regular expression is too large\0"
255    "failed to get memory",    "failed to get memory\0"
256    "unmatched parentheses",    "unmatched parentheses\0"
257    "internal error: code overflow",    "internal error: code overflow\0"
258    "unrecognized character after (?<",    "unrecognized character after (?<\0"
259    /* 25 */    /* 25 */
260    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
261    "malformed number after (?(",    "malformed number or name after (?(\0"
262    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
263    "assertion expected after (?(",    "assertion expected after (?(\0"
264    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
265    /* 30 */    /* 30 */
266    "unknown POSIX class name",    "unknown POSIX class name\0"
267    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
268    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269    "spare error",    "spare error\0"  /** DEAD **/
270    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
271    /* 35 */    /* 35 */
272    "invalid condition (?(0)",    "invalid condition (?(0)\0"
273    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
274    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275    "number after (?C is > 255",    "number after (?C is > 255\0"
276    "closing ) for (?C expected",    "closing ) for (?C expected\0"
277    /* 40 */    /* 40 */
278    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
279    "unrecognized character after (?P",    "unrecognized character after (?P\0"
280    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)\0"
281    "two named groups have the same name",    "two named subpatterns have the same name\0"
282    "invalid UTF-8 string",    "invalid UTF-8 string\0"
283    /* 45 */    /* 45 */
284    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
285    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
286    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p\0"
287  };    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289      /* 50 */
290      "repeated subpattern is too long\0"    /** DEAD **/
291      "octal value is greater than \\377 (not in UTF-8 mode)\0"
292      "internal error: overran compiling workspace\0"
293      "internal error: previously-checked referenced subpattern not found\0"
294      "DEFINE group contains more than one branch\0"
295      /* 55 */
296      "repeating a DEFINE group is not allowed\0"
297      "inconsistent NEWLINE options\0"
298      "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299      "a numbered reference must not be zero\0"
300      "(*VERB) with an argument is not supported\0"
301      /* 60 */
302      "(*VERB) not recognized\0"
303      "number is too big\0"
304      "subpattern name expected\0"
305      "digit expected after (?+";
306    
307    
308  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 220  For convenience, we use the same bit def Line 321  For convenience, we use the same bit def
321    
322  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
323    
324  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
325  static const unsigned char digitab[] =  static const unsigned char digitab[] =
326    {    {
327    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 357  static const unsigned char digitab[] =
357    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
359    
360  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
361  static const unsigned char digitab[] =  static const unsigned char digitab[] =
362    {    {
363    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 371  static const unsigned char digitab[] =
371    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
372    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
373    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
374    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
375    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
376    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
377    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 405  static const unsigned char ebcdic_charta
405    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
406    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
407    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
408    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
409    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
410    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
411    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 432  static const unsigned char ebcdic_charta
432  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
433    
434  static BOOL  static BOOL
435    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
437    
438    
439    
440    /*************************************************
441    *            Find an error text                  *
442    *************************************************/
443    
444    /* The error texts are now all in one long string, to save on relocations. As
445    some of the text is of unknown length, we can't use a table of offsets.
446    Instead, just count through the strings. This is not a performance issue
447    because it happens only when there has been a compilation error.
448    
449    Argument:   the error number
450    Returns:    pointer to the error string
451    */
452    
453    static const char *
454    find_error_text(int n)
455    {
456    const char *s = error_texts;
457    for (; n > 0; n--) while (*s++ != 0);
458    return s;
459    }
460    
461    
462  /*************************************************  /*************************************************
# Line 342  static BOOL Line 465  static BOOL
465    
466  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
467  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
468  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
469  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471    ptr is pointing at the \. On exit, it is on the final character of the escape
472    sequence.
473    
474  Arguments:  Arguments:
475    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 355  Arguments: Line 480  Arguments:
480    
481  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
482                   negative => a special escape sequence                   negative => a special escape sequence
483                   on error, errorptr is set                   on error, errorcodeptr is set
484  */  */
485    
486  static int  static int
487  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488    int options, BOOL isclass)    int options, BOOL isclass)
489  {  {
490  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
491    const uschar *ptr = *ptrptr + 1;
492  int c, i;  int c, i;
493    
494    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
495    ptr--;                            /* Set pointer back to the last byte */
496    
497  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
498    
 c = *(++ptr);  
499  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
500    
501  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
503  Otherwise further processing may be required. */  Otherwise further processing may be required. */
504    
505  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
506  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
507  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
508    
509  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
510  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
511  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
512  #endif  #endif
513    
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 516  else if ((i = escapes[c - 0x48]) != 0)
516  else  else
517    {    {
518    const uschar *oldptr;    const uschar *oldptr;
519      BOOL braced, negated;
520    
521    switch (c)    switch (c)
522      {      {
523      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 531  else
531      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
532      break;      break;
533    
534        /* \g must be followed by one of a number of specific things:
535    
536        (1) A number, either plain or braced. If positive, it is an absolute
537        backreference. If negative, it is a relative backreference. This is a Perl
538        5.10 feature.
539    
540        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
541        is part of Perl's movement towards a unified syntax for back references. As
542        this is synonymous with \k{name}, we fudge it up by pretending it really
543        was \k.
544    
545        (3) For Oniguruma compatibility we also support \g followed by a name or a
546        number either in angle brackets or in single quotes. However, these are
547        (possibly recursive) subroutine calls, _not_ backreferences. Just return
548        the -ESC_g code (cf \k). */
549    
550        case 'g':
551        if (ptr[1] == '<' || ptr[1] == '\'')
552          {
553          c = -ESC_g;
554          break;
555          }
556    
557        /* Handle the Perl-compatible cases */
558    
559        if (ptr[1] == '{')
560          {
561          const uschar *p;
562          for (p = ptr+2; *p != 0 && *p != '}'; p++)
563            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
564          if (*p != 0 && *p != '}')
565            {
566            c = -ESC_k;
567            break;
568            }
569          braced = TRUE;
570          ptr++;
571          }
572        else braced = FALSE;
573    
574        if (ptr[1] == '-')
575          {
576          negated = TRUE;
577          ptr++;
578          }
579        else negated = FALSE;
580    
581        c = 0;
582        while ((digitab[ptr[1]] & ctype_digit) != 0)
583          c = c * 10 + *(++ptr) - '0';
584    
585        if (c < 0)   /* Integer overflow */
586          {
587          *errorcodeptr = ERR61;
588          break;
589          }
590    
591        if (braced && *(++ptr) != '}')
592          {
593          *errorcodeptr = ERR57;
594          break;
595          }
596    
597        if (c == 0)
598          {
599          *errorcodeptr = ERR58;
600          break;
601          }
602    
603        if (negated)
604          {
605          if (c > bracount)
606            {
607            *errorcodeptr = ERR15;
608            break;
609            }
610          c = bracount - (c - 1);
611          }
612    
613        c = -(ESC_REF + c);
614        break;
615    
616      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
617      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
618      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 422  else Line 634  else
634        c -= '0';        c -= '0';
635        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
636          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
637          if (c < 0)    /* Integer overflow */
638            {
639            *errorcodeptr = ERR61;
640            break;
641            }
642        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
643          {          {
644          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 442  else Line 659  else
659        }        }
660    
661      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
662      larger first octal digit. */      larger first octal digit. The original code used just to take the least
663        significant 8 bits of octal numbers (I think this is what early Perls used
664        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
665        than 3 octal digits. */
666    
667      case '0':      case '0':
668      c -= '0';      c -= '0';
669      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
670          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
671      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
672      break;      break;
673    
674      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
675      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
676        treated as a data character. */
677    
678      case 'x':      case 'x':
679  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
680        {        {
681        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
682        register int count = 0;        int count = 0;
683    
684        c = 0;        c = 0;
685        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
686          {          {
687          int cc = *pt++;          register int cc = *pt++;
688            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
689          count++;          count++;
690  #if !EBCDIC    /* ASCII coding */  
691    #ifndef EBCDIC  /* ASCII coding */
692          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
693          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
694  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
695          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
696          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
697  #endif  #endif
698          }          }
699    
700        if (*pt == '}')        if (*pt == '}')
701          {          {
702          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
703          ptr = pt;          ptr = pt;
704          break;          break;
705          }          }
706    
707        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
708        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
709        }        }
 #endif  
710    
711      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
712    
713      c = 0;      c = 0;
714      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
715        {        {
716        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
717        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
718  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
719        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
720        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
721  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
722        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
723        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
724  #endif  #endif
725        }        }
726      break;      break;
727    
728      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
729        This coding is ASCII-specific, but then the whole concept of \cx is
730        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
731    
732      case 'c':      case 'c':
733      c = *(++ptr);      c = *(++ptr);
734      if (c == 0)      if (c == 0)
735        {        {
736        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
737        return 0;        break;
738        }        }
739    
740      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
741      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
742      c ^= 0x40;      c ^= 0x40;
743  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
744      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
745      c ^= 0xC0;      c ^= 0xC0;
746  #endif  #endif
747      break;      break;
748    
749      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
750      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
751      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
752      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
753      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
754    
755      default:      default:
756      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 560  escape sequence. Line 782  escape sequence.
782  Argument:  Argument:
783    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
784    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
785      dptr           points to an int that is set to the detailed property value
786    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
787    
788  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
789  */  */
790    
791  static int  static int
792  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
793  {  {
794  int c, i, bot, top;  int c, i, bot, top;
795  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
796  char name[4];  char name[32];
797    
798  c = *(++ptr);  c = *(++ptr);
799  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
800    
801  *negptr = FALSE;  *negptr = FALSE;
802    
803  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
804  preceded by ^ for negation. */  negation. */
805    
806  if (c == '{')  if (c == '{')
807    {    {
# Line 587  if (c == '{') Line 810  if (c == '{')
810      *negptr = TRUE;      *negptr = TRUE;
811      ptr++;      ptr++;
812      }      }
813    for (i = 0; i <= 2; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
814      {      {
815      c = *(++ptr);      c = *(++ptr);
816      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
817      if (c == '}') break;      if (c == '}') break;
818      name[i] = c;      name[i] = c;
819      }      }
820    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
821    name[i] = 0;    name[i] = 0;
822    }    }
823    
# Line 619  top = _pcre_utt_size; Line 838  top = _pcre_utt_size;
838    
839  while (bot < top)  while (bot < top)
840    {    {
841    i = (bot + top)/2;    i = (bot + top) >> 1;
842    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
843    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
844        {
845        *dptr = _pcre_utt[i].value;
846        return _pcre_utt[i].type;
847        }
848    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
849    }    }
850    
 UNKNOWN_RETURN:  
851  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
852  *ptrptr = ptr;  *ptrptr = ptr;
853  return -1;  return -1;
# Line 698  read_repeat_counts(const uschar *p, int Line 920  read_repeat_counts(const uschar *p, int
920  int min = 0;  int min = 0;
921  int max = -1;  int max = -1;
922    
923    /* Read the minimum value and do a paranoid check: a negative value indicates
924    an integer overflow. */
925    
926  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
927    if (min < 0 || min > 65535)
928      {
929      *errorcodeptr = ERR5;
930      return p;
931      }
932    
933    /* Read the maximum value if there is one, and again do a paranoid check on its size.
934    Also, max must not be less than min. */
935    
936  if (*p == '}') max = min; else  if (*p == '}') max = min; else
937    {    {
# Line 706  if (*p == '}') max = min; else Line 939  if (*p == '}') max = min; else
939      {      {
940      max = 0;      max = 0;
941      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
942        if (max < 0 || max > 65535)
943          {
944          *errorcodeptr = ERR5;
945          return p;
946          }
947      if (max < min)      if (max < min)
948        {        {
949        *errorcodeptr = ERR4;        *errorcodeptr = ERR4;
# Line 714  if (*p == '}') max = min; else Line 952  if (*p == '}') max = min; else
952      }      }
953    }    }
954    
955  /* Do paranoid checks, then fill in the required variables, and pass back the  /* Fill in the required variables, and pass back the pointer to the terminating
956  pointer to the terminating '}'. */  '}'. */
957    
958  if (min > 65535 || max > 65535)  *minp = min;
959    *errorcodeptr = ERR5;  *maxp = max;
960  else  return p;
961    }
962    
963    
964    
965    /*************************************************
966    *       Find forward referenced subpattern       *
967    *************************************************/
968    
969    /* This function scans along a pattern's text looking for capturing
970    subpatterns, and counting them. If it finds a named pattern that matches the
971    name it is given, it returns its number. Alternatively, if the name is NULL, it
972    returns when it reaches a given numbered subpattern. This is used for forward
973    references to subpatterns. We know that if (?P< is encountered, the name will
974    be terminated by '>' because that is checked in the first pass.
975    
976    Arguments:
977      ptr          current position in the pattern
978      count        current count of capturing parens so far encountered
979      name         name to seek, or NULL if seeking a numbered subpattern
980      lorn         name length, or subpattern number if name is NULL
981      xmode        TRUE if we are in /x mode
982    
983    Returns:       the number of the subpattern sought (by name or by number), or -1 if not found
984    */
985    
986    static int
987    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
988      BOOL xmode)
989    {
990    const uschar *thisname;
991    
992    for (; *ptr != 0; ptr++)
993    {    {
994    *minp = min;    int term;
995    *maxp = max;  
996      /* Skip over backslashed characters and also entire \Q...\E */
997    
998      if (*ptr == '\\')
999        {
1000        if (*(++ptr) == 0) return -1;
1001        if (*ptr == 'Q') for (;;)
1002          {
1003          while (*(++ptr) != 0 && *ptr != '\\');
1004          if (*ptr == 0) return -1;
1005          if (*(++ptr) == 'E') break;
1006          }
1007        continue;
1008        }
1009    
1010      /* Skip over character classes */
1011    
1012      if (*ptr == '[')
1013        {
1014        while (*(++ptr) != ']')
1015          {
1016          if (*ptr == 0) return -1;
1017          if (*ptr == '\\')
1018            {
1019            if (*(++ptr) == 0) return -1;
1020            if (*ptr == 'Q') for (;;)
1021              {
1022              while (*(++ptr) != 0 && *ptr != '\\');
1023              if (*ptr == 0) return -1;
1024              if (*(++ptr) == 'E') break;
1025              }
1026            continue;
1027            }
1028          }
1029        continue;
1030        }
1031    
1032      /* Skip comments in /x mode */
1033    
1034      if (xmode && *ptr == '#')
1035        {
1036        while (*(++ptr) != 0 && *ptr != '\n');
1037        if (*ptr == 0) return -1;
1038        continue;
1039        }
1040    
1041      /* An opening parenthesis must now be a real metacharacter */
1042    
1043      if (*ptr != '(') continue;
1044      if (ptr[1] != '?' && ptr[1] != '*')
1045        {
1046        count++;
1047        if (name == NULL && count == lorn) return count;
1048        continue;
1049        }
1050    
1051      ptr += 2;
1052      if (*ptr == 'P') ptr++;                      /* Allow optional P */
1053    
1054      /* We have to disambiguate (?<! and (?<= from (?<name> */
1055    
1056      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1057           *ptr != '\'')
1058        continue;
1059    
1060      count++;
1061    
1062      if (name == NULL && count == lorn) return count;
1063      term = *ptr++;
1064      if (term == '<') term = '>';
1065      thisname = ptr;
1066      while (*ptr != term) ptr++;
1067      if (name != NULL && lorn == ptr - thisname &&
1068          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1069        return count;
1070    }    }
1071  return p;  
1072    return -1;
1073  }  }
1074    
1075    
# Line 778  for (;;) Line 1123  for (;;)
1123    
1124      case OP_CALLOUT:      case OP_CALLOUT:
1125      case OP_CREF:      case OP_CREF:
1126      case OP_BRANUMBER:      case OP_RREF:
1127        case OP_DEF:
1128      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1129      break;      break;
1130    
# Line 823  for (;;) Line 1169  for (;;)
1169    {    {
1170    int d;    int d;
1171    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
   
1172    switch (op)    switch (op)
1173      {      {
1174        case OP_CBRA:
1175      case OP_BRA:      case OP_BRA:
1176      case OP_ONCE:      case OP_ONCE:
1177      case OP_COND:      case OP_COND:
1178      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1179      if (d < 0) return d;      if (d < 0) return d;
1180      branchlength += d;      branchlength += d;
1181      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 865  for (;;) Line 1210  for (;;)
1210      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1211    
1212      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1213      case OP_CREF:      case OP_CREF:
1214        case OP_RREF:
1215        case OP_DEF:
1216      case OP_OPT:      case OP_OPT:
1217      case OP_CALLOUT:      case OP_CALLOUT:
1218      case OP_SOD:      case OP_SOD:
# Line 884  for (;;) Line 1230  for (;;)
1230    
1231      case OP_CHAR:      case OP_CHAR:
1232      case OP_CHARNC:      case OP_CHARNC:
1233        case OP_NOT:
1234      branchlength++;      branchlength++;
1235      cc += 2;      cc += 2;
1236  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 910  for (;;) Line 1257  for (;;)
1257    
1258      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1259      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1260        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1261      cc += 4;      cc += 4;
1262      break;      break;
1263    
# Line 917  for (;;) Line 1265  for (;;)
1265    
1266      case OP_PROP:      case OP_PROP:
1267      case OP_NOTPROP:      case OP_NOTPROP:
1268      cc++;      cc += 2;
1269      /* Fall through */      /* Fall through */
1270    
1271      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 998  Returns:      pointer to the opcode for Line 1346  Returns:      pointer to the opcode for
1346  static const uschar *  static const uschar *
1347  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1348  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1349  for (;;)  for (;;)
1350    {    {
1351    register int c = *code;    register int c = *code;
1352    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1353    else if (c > OP_BRA)  
1354      /* XCLASS is used for classes that cannot be represented just by a bit
1355      map. This includes negated single high-valued characters. The length in
1356      the table is zero; the actual length is stored in the compiled code. */
1357    
1358      if (c == OP_XCLASS) code += GET(code, 1);
1359    
1360      /* Handle capturing bracket */
1361    
1362      else if (c == OP_CBRA)
1363      {      {
1364      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1365      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1366      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1367      }      }
1368    
1369      /* Otherwise, we can get the item's length from the table, except that for
1370      repeated character types, we have to test for \p and \P, which have an extra
1371      two bytes of parameters. */
1372    
1373    else    else
1374      {      {
1375      code += _pcre_OP_lengths[c];      switch(c)
1376          {
1377          case OP_TYPESTAR:
1378          case OP_TYPEMINSTAR:
1379          case OP_TYPEPLUS:
1380          case OP_TYPEMINPLUS:
1381          case OP_TYPEQUERY:
1382          case OP_TYPEMINQUERY:
1383          case OP_TYPEPOSSTAR:
1384          case OP_TYPEPOSPLUS:
1385          case OP_TYPEPOSQUERY:
1386          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1387          break;
1388    
1389  #ifdef SUPPORT_UTF8        case OP_TYPEUPTO:
1390          case OP_TYPEMINUPTO:
1391          case OP_TYPEEXACT:
1392          case OP_TYPEPOSUPTO:
1393          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1394          break;
1395          }
1396    
1397      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* Add in the fixed length from the table */
1398      by a multi-byte character. The length in the table is a minimum, so we have  
1399      to scan along to skip the extra bytes. All opcodes are less than 128, so we      code += _pcre_OP_lengths[c];
     can use relatively efficient code. */  
1400    
1401      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1402      a multi-byte character. The length in the table is a minimum, so we have to
1403      arrange to skip the extra bytes. */
1404    
1405    #ifdef SUPPORT_UTF8
1406      if (utf8) switch(c)      if (utf8) switch(c)
1407        {        {
1408        case OP_CHAR:        case OP_CHAR:
# Line 1031  for (;;) Line 1410  for (;;)
1410        case OP_EXACT:        case OP_EXACT:
1411        case OP_UPTO:        case OP_UPTO:
1412        case OP_MINUPTO:        case OP_MINUPTO:
1413          case OP_POSUPTO:
1414        case OP_STAR:        case OP_STAR:
1415        case OP_MINSTAR:        case OP_MINSTAR:
1416          case OP_POSSTAR:
1417        case OP_PLUS:        case OP_PLUS:
1418        case OP_MINPLUS:        case OP_MINPLUS:
1419          case OP_POSPLUS:
1420        case OP_QUERY:        case OP_QUERY:
1421        case OP_MINQUERY:        case OP_MINQUERY:
1422        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1423        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1424        break;        break;
1425        }        }
1426  #endif  #endif
# Line 1072  Returns:      pointer to the opcode for Line 1447  Returns:      pointer to the opcode for
1447  static const uschar *  static const uschar *
1448  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1449  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1450  for (;;)  for (;;)
1451    {    {
1452    register int c = *code;    register int c = *code;
1453    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1454    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1455    else if (c > OP_BRA)  
1456      {    /* XCLASS is used for classes that cannot be represented just by a bit
1457      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1458      }    the table is zero; the actual length is stored in the compiled code. */
1459    
1460      if (c == OP_XCLASS) code += GET(code, 1);
1461    
1462      /* Otherwise, we can get the item's length from the table, except that for
1463      repeated character types, we have to test for \p and \P, which have an extra
1464      two bytes of parameters. */
1465    
1466    else    else
1467      {      {
1468      code += _pcre_OP_lengths[c];      switch(c)
1469          {
1470          case OP_TYPESTAR:
1471          case OP_TYPEMINSTAR:
1472          case OP_TYPEPLUS:
1473          case OP_TYPEMINPLUS:
1474          case OP_TYPEQUERY:
1475          case OP_TYPEMINQUERY:
1476          case OP_TYPEPOSSTAR:
1477          case OP_TYPEPOSPLUS:
1478          case OP_TYPEPOSQUERY:
1479          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1480          break;
1481    
1482  #ifdef SUPPORT_UTF8        case OP_TYPEPOSUPTO:
1483          case OP_TYPEUPTO:
1484          case OP_TYPEMINUPTO:
1485          case OP_TYPEEXACT:
1486          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1487          break;
1488          }
1489    
1490        /* Add in the fixed length from the table */
1491    
1492        code += _pcre_OP_lengths[c];
1493    
1494      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1495      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
1496      to scan along to skip the extra bytes. All opcodes are less than 128, so we      to arrange to skip the extra bytes. */
     can use relatively efficient code. */  
1497    
1498    #ifdef SUPPORT_UTF8
1499      if (utf8) switch(c)      if (utf8) switch(c)
1500        {        {
1501        case OP_CHAR:        case OP_CHAR:
# Line 1103  for (;;) Line 1503  for (;;)
1503        case OP_EXACT:        case OP_EXACT:
1504        case OP_UPTO:        case OP_UPTO:
1505        case OP_MINUPTO:        case OP_MINUPTO:
1506          case OP_POSUPTO:
1507        case OP_STAR:        case OP_STAR:
1508        case OP_MINSTAR:        case OP_MINSTAR:
1509          case OP_POSSTAR:
1510        case OP_PLUS:        case OP_PLUS:
1511        case OP_MINPLUS:        case OP_MINPLUS:
1512          case OP_POSPLUS:
1513        case OP_QUERY:        case OP_QUERY:
1514        case OP_MINQUERY:        case OP_MINQUERY:
1515        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1516        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1517        break;        break;
1518        }        }
1519  #endif  #endif
# Line 1132  for (;;) Line 1528  for (;;)
1528  *************************************************/  *************************************************/
1529    
1530  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1531  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1532  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1533  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1534  whose current branch will already have been scanned.  backward and negative forward assertions when its final argument is TRUE. If we
1535    hit an unclosed bracket, we return "empty" - this means we've struck an inner
1536    bracket whose current branch will already have been scanned.
1537    
1538  Arguments:  Arguments:
1539    code        points to start of search    code        points to start of search
# Line 1149  static BOOL Line 1547  static BOOL
1547  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1548  {  {
1549  register int c;  register int c;
1550  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1551       code < endcode;       code < endcode;
1552       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1553    {    {
# Line 1157  for (code = first_significant_code(code Line 1555  for (code = first_significant_code(code
1555    
1556    c = *code;    c = *code;
1557    
1558    if (c >= OP_BRA)    /* Skip over forward assertions; the other assertions are skipped by
1559      first_significant_code() with a TRUE final argument. */
1560    
1561      if (c == OP_ASSERT)
1562        {
1563        do code += GET(code, 1); while (*code == OP_ALT);
1564        c = *code;
1565        continue;
1566        }
1567    
1568      /* Groups with zero repeats can of course be empty; skip them. */
1569    
1570      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1571        {
1572        code += _pcre_OP_lengths[c];
1573        do code += GET(code, 1); while (*code == OP_ALT);
1574        c = *code;
1575        continue;
1576        }
1577    
1578      /* For other groups, scan the branches. */
1579    
1580      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1581      {      {
1582      BOOL empty_branch;      BOOL empty_branch;
1583      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1173  for (code = first_significant_code(code Line 1593  for (code = first_significant_code(code
1593        }        }
1594      while (*code == OP_ALT);      while (*code == OP_ALT);
1595      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1596      c = *code;      c = *code;
1597        continue;
1598      }      }
1599    
1600    else switch (c)    /* Handle the other opcodes */
1601    
1602      switch (c)
1603      {      {
1604      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1605        cannot be represented just by a bit map. This includes negated single
1606        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1607        actual length is stored in the compiled code, so we must update "code"
1608        here. */
1609    
1610  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1611      case OP_XCLASS:      case OP_XCLASS:
1612      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1613      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1614  #endif  #endif
1615    
# Line 1233  for (code = first_significant_code(code Line 1659  for (code = first_significant_code(code
1659      case OP_NOT:      case OP_NOT:
1660      case OP_PLUS:      case OP_PLUS:
1661      case OP_MINPLUS:      case OP_MINPLUS:
1662        case OP_POSPLUS:
1663      case OP_EXACT:      case OP_EXACT:
1664      case OP_NOTPLUS:      case OP_NOTPLUS:
1665      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1666        case OP_NOTPOSPLUS:
1667      case OP_NOTEXACT:      case OP_NOTEXACT:
1668      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1669      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1670        case OP_TYPEPOSPLUS:
1671      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1672      return FALSE;      return FALSE;
1673    
1674        /* These are going to continue, as they may be empty, but we have to
1675        fudge the length for the \p and \P cases. */
1676    
1677        case OP_TYPESTAR:
1678        case OP_TYPEMINSTAR:
1679        case OP_TYPEPOSSTAR:
1680        case OP_TYPEQUERY:
1681        case OP_TYPEMINQUERY:
1682        case OP_TYPEPOSQUERY:
1683        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1684        break;
1685    
1686        /* Same for these */
1687    
1688        case OP_TYPEUPTO:
1689        case OP_TYPEMINUPTO:
1690        case OP_TYPEPOSUPTO:
1691        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1692        break;
1693    
1694      /* End of branch */      /* End of branch */
1695    
1696      case OP_KET:      case OP_KET:
# Line 1250  for (code = first_significant_code(code Line 1699  for (code = first_significant_code(code
1699      case OP_ALT:      case OP_ALT:
1700      return TRUE;      return TRUE;
1701    
1702      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1703      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1704    
1705  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1706      case OP_STAR:      case OP_STAR:
1707      case OP_MINSTAR:      case OP_MINSTAR:
1708        case OP_POSSTAR:
1709      case OP_QUERY:      case OP_QUERY:
1710      case OP_MINQUERY:      case OP_MINQUERY:
1711        case OP_POSQUERY:
1712      case OP_UPTO:      case OP_UPTO:
1713      case OP_MINUPTO:      case OP_MINUPTO:
1714        case OP_POSUPTO:
1715      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1716      break;      break;
1717  #endif  #endif
# Line 1308  return TRUE; Line 1760  return TRUE;
1760  *************************************************/  *************************************************/
1761    
1762  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1763  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1764  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1765  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1766    
1767    Originally, this function only recognized a sequence of letters between the
1768    terminators, but it seems that Perl recognizes any sequence of characters,
1769    though of course unknown POSIX names are subsequently rejected. Perl gives an
1770    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1771    didn't consider this to be a POSIX class. Likewise for [:1234:].
1772    
1773    The problem in trying to be exactly like Perl is in the handling of escapes. We
1774    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1775    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1776    below handles the special case of \], but does not try to do any other escape
1777    processing. This makes it different from Perl for cases such as [:l\ower:]
1778    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1779    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1780    I think.
1781    
1782  Argument:  Arguments:
1783    ptr      pointer to the initial [    ptr      pointer to the initial [
1784    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1785    
1786  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1787  */  */
1788    
1789  static BOOL  static BOOL
1790  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1791  {  {
1792  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1793  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1794  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1795    {    {
1796    *endptr = ptr;    if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1797    return TRUE;      {
1798        if (*ptr == ']') return FALSE;
1799        if (*ptr == terminator && ptr[1] == ']')
1800          {
1801          *endptr = ptr;
1802          return TRUE;
1803          }
1804        }
1805    }    }
1806  return FALSE;  return FALSE;
1807  }  }
# Line 1355  Returns:     a value representing the na Line 1826  Returns:     a value representing the na
1826  static int  static int
1827  check_posix_name(const uschar *ptr, int len)  check_posix_name(const uschar *ptr, int len)
1828  {  {
1829    const char *pn = posix_names;
1830  register int yield = 0;  register int yield = 0;
1831  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
1832    {    {
1833    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
1834      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      strncmp((const char *)ptr, pn, len) == 0) return yield;
1835      pn += posix_name_lengths[yield] + 1;
1836    yield++;    yield++;
1837    }    }
1838  return -1;  return -1;
# Line 1374  return -1; Line 1847  return -1;
1847  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
1848  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
1849  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
1850  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1851  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
1852  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
1853  offsets adjusted. That is the job of this function. Before it is called, the  have their offsets adjusted. That is one of the jobs of this function. Before it
1854  partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
1855    OP_END.
1856    
1857    This function has been extended with the possibility of forward references for
1858    recursions and subroutine calls. It must also check the list of such references
1859    for the group we are dealing with. If it finds that one of the recursions in
1860    the current group is on this list, it adjusts the offset in the list, not the
1861    value in the reference (which is a group number).
1862    
1863  Arguments:  Arguments:
1864    group      points to the start of the group    group      points to the start of the group
1865    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1866    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1867    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1868      save_hwm   the hwm forward reference pointer at the start of the group
1869    
1870  Returns:     nothing  Returns:     nothing
1871  */  */
1872    
1873  static void  static void
1874  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1875      uschar *save_hwm)
1876  {  {
1877  uschar *ptr = group;  uschar *ptr = group;
1878    
1879  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1880    {    {
1881    int offset = GET(ptr, 1);    int offset;
1882    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1883    
1884      /* See if this recursion is on the forward reference list. If so, adjust the
1885      reference. */
1886    
1887      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1888        {
1889        offset = GET(hc, 0);
1890        if (cd->start_code + offset == ptr + 1)
1891          {
1892          PUT(hc, 0, offset + adjust);
1893          break;
1894          }
1895        }
1896    
1897      /* Otherwise, adjust the recursion offset if it's after the start of this
1898      group. */
1899    
1900      if (hc >= cd->hwm)
1901        {
1902        offset = GET(ptr, 1);
1903        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1904        }
1905    
1906    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1907    }    }
1908  }  }
# Line 1475  Yield:        TRUE when range returned; Line 1981  Yield:        TRUE when range returned;
1981  */  */
1982    
1983  static BOOL  static BOOL
1984  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1985      unsigned int *odptr)
1986  {  {
1987  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1988    
1989  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1990    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1991    
1992  if (c > d) return FALSE;  if (c > d) return FALSE;
1993    
# Line 1492  next = othercase + 1; Line 1996  next = othercase + 1;
1996    
1997  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1998    {    {
1999    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
2000    next++;    next++;
2001    }    }
2002    
# Line 1506  return TRUE; Line 2008  return TRUE;
2008  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2009    
2010    
2011    
2012  /*************************************************  /*************************************************
2013  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
2014  *************************************************/  *************************************************/
2015    
2016  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
2017  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
2018  bits.  sense to automatically possessify the repeated item.
2019    
2020  Arguments:  Arguments:
2021    optionsptr     pointer to the option bits    op_code       the repeated op code
2022    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
2023    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
2024    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
2025    errorcodeptr   points to error code variable    ptr           next character in pattern
2026    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
2027    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
2028    
2029  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
2030  */  */
2031    
2032  static BOOL  static BOOL
2033  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2034    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
2035  {  {
2036  int repeat_type, op_type;  int next;
2037    
2038    /* Skip whitespace and comments in extended mode */
2039    
2040    if ((options & PCRE_EXTENDED) != 0)
2041      {
2042      for (;;)
2043        {
2044        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2045        if (*ptr == '#')
2046          {
2047          while (*(++ptr) != 0)
2048            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2049          }
2050        else break;
2051        }
2052      }
2053    
2054    /* If the next item is one that we can handle, get its value. A non-negative
2055    value is a character, a negative value is an escape value. */
2056    
2057    if (*ptr == '\\')
2058      {
2059      int temperrorcode = 0;
2060      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2061      if (temperrorcode != 0) return FALSE;
2062      ptr++;    /* Point after the escape sequence */
2063      }
2064    
2065    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2066      {
2067    #ifdef SUPPORT_UTF8
2068      if (utf8) { GETCHARINC(next, ptr); } else
2069    #endif
2070      next = *ptr++;
2071      }
2072    
2073    else return FALSE;
2074    
2075    /* Skip whitespace and comments in extended mode */
2076    
2077    if ((options & PCRE_EXTENDED) != 0)
2078      {
2079      for (;;)
2080        {
2081        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2082        if (*ptr == '#')
2083          {
2084          while (*(++ptr) != 0)
2085            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2086          }
2087        else break;
2088        }
2089      }
2090    
2091    /* If the next thing is itself optional, we have to give up. */
2092    
2093    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2094      return FALSE;
2095    
2096    /* Now compare the next item with the previous opcode. If the previous is a
2097    positive single character match, "item" either contains the character or, if
2098    "item" is greater than 127 in utf8 mode, the character's bytes are in
2099    utf8_char. */
2100    
2101    
2102    /* Handle cases when the next item is a character. */
2103    
2104    if (next >= 0) switch(op_code)
2105      {
2106      case OP_CHAR:
2107    #ifdef SUPPORT_UTF8
2108      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2109    #endif
2110      return item != next;
2111    
2112      /* For CHARNC (caseless character) we must check the other case. If we have
2113      Unicode property support, we can use it to test the other case of
2114      high-valued characters. */
2115    
2116      case OP_CHARNC:
2117    #ifdef SUPPORT_UTF8
2118      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2119    #endif
2120      if (item == next) return FALSE;
2121    #ifdef SUPPORT_UTF8
2122      if (utf8)
2123        {
2124        unsigned int othercase;
2125        if (next < 128) othercase = cd->fcc[next]; else
2126    #ifdef SUPPORT_UCP
2127        othercase = _pcre_ucp_othercase((unsigned int)next);
2128    #else
2129        othercase = NOTACHAR;
2130    #endif
2131        return (unsigned int)item != othercase;
2132        }
2133      else
2134    #endif  /* SUPPORT_UTF8 */
2135      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2136    
2137      /* For OP_NOT, "item" must be a single-byte character. */
2138    
2139      case OP_NOT:
2140      if (item == next) return TRUE;
2141      if ((options & PCRE_CASELESS) == 0) return FALSE;
2142    #ifdef SUPPORT_UTF8
2143      if (utf8)
2144        {
2145        unsigned int othercase;
2146        if (next < 128) othercase = cd->fcc[next]; else
2147    #ifdef SUPPORT_UCP
2148        othercase = _pcre_ucp_othercase(next);
2149    #else
2150        othercase = NOTACHAR;
2151    #endif
2152        return (unsigned int)item == othercase;
2153        }
2154      else
2155    #endif  /* SUPPORT_UTF8 */
2156      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2157    
2158      case OP_DIGIT:
2159      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2160    
2161      case OP_NOT_DIGIT:
2162      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2163    
2164      case OP_WHITESPACE:
2165      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2166    
2167      case OP_NOT_WHITESPACE:
2168      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2169    
2170      case OP_WORDCHAR:
2171      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2172    
2173      case OP_NOT_WORDCHAR:
2174      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2175    
2176      case OP_HSPACE:
2177      case OP_NOT_HSPACE:
2178      switch(next)
2179        {
2180        case 0x09:
2181        case 0x20:
2182        case 0xa0:
2183        case 0x1680:
2184        case 0x180e:
2185        case 0x2000:
2186        case 0x2001:
2187        case 0x2002:
2188        case 0x2003:
2189        case 0x2004:
2190        case 0x2005:
2191        case 0x2006:
2192        case 0x2007:
2193        case 0x2008:
2194        case 0x2009:
2195        case 0x200A:
2196        case 0x202f:
2197        case 0x205f:
2198        case 0x3000:
2199        return op_code != OP_HSPACE;
2200        default:
2201        return op_code == OP_HSPACE;
2202        }
2203    
2204      case OP_VSPACE:
2205      case OP_NOT_VSPACE:
2206      switch(next)
2207        {
2208        case 0x0a:
2209        case 0x0b:
2210        case 0x0c:
2211        case 0x0d:
2212        case 0x85:
2213        case 0x2028:
2214        case 0x2029:
2215        return op_code != OP_VSPACE;
2216        default:
2217        return op_code == OP_VSPACE;
2218        }
2219    
2220      default:
2221      return FALSE;
2222      }
2223    
2224    
2225    /* Handle the case when the next item is \d, \s, etc. */
2226    
2227    switch(op_code)
2228      {
2229      case OP_CHAR:
2230      case OP_CHARNC:
2231    #ifdef SUPPORT_UTF8
2232      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2233    #endif
2234      switch(-next)
2235        {
2236        case ESC_d:
2237        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2238    
2239        case ESC_D:
2240        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2241    
2242        case ESC_s:
2243        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2244    
2245        case ESC_S:
2246        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2247    
2248        case ESC_w:
2249        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2250    
2251        case ESC_W:
2252        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2253    
2254        case ESC_h:
2255        case ESC_H:
2256        switch(item)
2257          {
2258          case 0x09:
2259          case 0x20:
2260          case 0xa0:
2261          case 0x1680:
2262          case 0x180e:
2263          case 0x2000:
2264          case 0x2001:
2265          case 0x2002:
2266          case 0x2003:
2267          case 0x2004:
2268          case 0x2005:
2269          case 0x2006:
2270          case 0x2007:
2271          case 0x2008:
2272          case 0x2009:
2273          case 0x200A:
2274          case 0x202f:
2275          case 0x205f:
2276          case 0x3000:
2277          return -next != ESC_h;
2278          default:
2279          return -next == ESC_h;
2280          }
2281    
2282        case ESC_v:
2283        case ESC_V:
2284        switch(item)
2285          {
2286          case 0x0a:
2287          case 0x0b:
2288          case 0x0c:
2289          case 0x0d:
2290          case 0x85:
2291          case 0x2028:
2292          case 0x2029:
2293          return -next != ESC_v;
2294          default:
2295          return -next == ESC_v;
2296          }
2297    
2298        default:
2299        return FALSE;
2300        }
2301    
2302      case OP_DIGIT:
2303      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2304             next == -ESC_h || next == -ESC_v;
2305    
2306      case OP_NOT_DIGIT:
2307      return next == -ESC_d;
2308    
2309      case OP_WHITESPACE:
2310      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2311    
2312      case OP_NOT_WHITESPACE:
2313      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2314    
2315      case OP_HSPACE:
2316      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2317    
2318      case OP_NOT_HSPACE:
2319      return next == -ESC_h;
2320    
2321      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2322      case OP_VSPACE:
2323      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2324    
2325      case OP_NOT_VSPACE:
2326      return next == -ESC_v;
2327    
2328      case OP_WORDCHAR:
2329      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2330    
2331      case OP_NOT_WORDCHAR:
2332      return next == -ESC_w || next == -ESC_d;
2333    
2334      default:
2335      return FALSE;
2336      }
2337    
2338    /* Control does not reach here */
2339    }
2340    
2341    
2342    
2343    /*************************************************
2344    *           Compile one branch                   *
2345    *************************************************/
2346    
2347    /* Scan the pattern, compiling it into the code vector. If the options are
2348    changed during the branch, the pointer is used to change the external options
2349    bits. This function is used during the pre-compile phase when we are trying
2350    to find out the amount of memory needed, as well as during the real compile
2351    phase. The value of lengthptr distinguishes the two phases.
2352    
2353    Arguments:
2354      optionsptr     pointer to the option bits
2355      codeptr        points to the pointer to the current code point
2356      ptrptr         points to the current pattern pointer
2357      errorcodeptr   points to error code variable
2358      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2359      reqbyteptr     set to the last literal character required, else < 0
2360      bcptr          points to current branch chain
2361      cd             contains pointers to tables etc.
2362      lengthptr      NULL during the real compile phase
2363                     points to length accumulator during pre-compile phase
2364    
2365    Returns:         TRUE on success
2366                     FALSE, with *errorcodeptr set non-zero on error
2367    */
2368    
2369    static BOOL
2370    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2371      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2372      compile_data *cd, int *lengthptr)
2373    {
2374    int repeat_type, op_type;
2375  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2376  int bravalue = 0;  int bravalue = 0;
2377  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
2378  int firstbyte, reqbyte;  int firstbyte, reqbyte;
2379  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
2380  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
 int condcount = 0;  
2381  int options = *optionsptr;  int options = *optionsptr;
2382  int after_manual_callout = 0;  int after_manual_callout = 0;
2383    int length_prevgroup = 0;
2384  register int c;  register int c;
2385  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2386    uschar *last_code = code;
2387    uschar *orig_code = code;
2388  uschar *tempcode;  uschar *tempcode;
2389  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2390  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1553  const uschar *ptr = *ptrptr; Line 2392  const uschar *ptr = *ptrptr;
2392  const uschar *tempptr;  const uschar *tempptr;
2393  uschar *previous = NULL;  uschar *previous = NULL;
2394  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2395    uschar *save_hwm = NULL;
2396  uschar classbits[32];  uschar classbits[32];
2397    
2398  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2399  BOOL class_utf8;  BOOL class_utf8;
2400  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
2401  uschar *class_utf8data;  uschar *class_utf8data;
2402    uschar *class_utf8data_base;
2403  uschar utf8_char[6];  uschar utf8_char[6];
2404  #else  #else
2405  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2406    uschar *utf8_char = NULL;
2407    #endif
2408    
2409    #ifdef DEBUG
2410    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2411  #endif  #endif
2412    
2413  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1593  req_caseopt = ((options & PCRE_CASELESS) Line 2439  req_caseopt = ((options & PCRE_CASELESS)
2439  for (;; ptr++)  for (;; ptr++)
2440    {    {
2441    BOOL negate_class;    BOOL negate_class;
2442      BOOL should_flip_negation;
2443    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2444    BOOL is_quantifier;    BOOL is_quantifier;
2445      BOOL is_recurse;
2446      BOOL reset_bracount;
2447    int class_charcount;    int class_charcount;
2448    int class_lastchar;    int class_lastchar;
2449    int newoptions;    int newoptions;
2450    int recno;    int recno;
2451      int refsign;
2452    int skipbytes;    int skipbytes;
2453    int subreqbyte;    int subreqbyte;
2454    int subfirstbyte;    int subfirstbyte;
2455      int terminator;
2456    int mclength;    int mclength;
2457    uschar mcbuffer[8];    uschar mcbuffer[8];
2458    
2459    /* Next byte in the pattern */    /* Get next byte in the pattern */
2460    
2461    c = *ptr;    c = *ptr;
2462    
2463      /* If we are in the pre-compile phase, accumulate the length used for the
2464      previous cycle of this loop. */
2465    
2466      if (lengthptr != NULL)
2467        {
2468    #ifdef DEBUG
2469        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2470    #endif
2471        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2472          {
2473          *errorcodeptr = ERR52;
2474          goto FAILED;
2475          }
2476    
2477        /* There is at least one situation where code goes backwards: this is the
2478        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2479        the class is simply eliminated. However, it is created first, so we have to
2480        allow memory for it. Therefore, don't ever reduce the length at this point.
2481        */
2482    
2483        if (code < last_code) code = last_code;
2484    
2485        /* Paranoid check for integer overflow */
2486    
2487        if (OFLOW_MAX - *lengthptr < code - last_code)
2488          {
2489          *errorcodeptr = ERR20;
2490          goto FAILED;
2491          }
2492    
2493        *lengthptr += code - last_code;
2494        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2495    
2496        /* If "previous" is set and it is not at the start of the work space, move
2497        it back to there, in order to avoid filling up the work space. Otherwise,
2498        if "previous" is NULL, reset the current code pointer to the start. */
2499    
2500        if (previous != NULL)
2501          {
2502          if (previous > orig_code)
2503            {
2504            memmove(orig_code, previous, code - previous);
2505            code -= previous - orig_code;
2506            previous = orig_code;
2507            }
2508          }
2509        else code = orig_code;
2510    
2511        /* Remember where this code item starts so we can pick up the length
2512        next time round. */
2513    
2514        last_code = code;
2515        }
2516    
2517      /* In the real compile phase, just check the workspace used by the forward
2518      reference list. */
2519    
2520      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2521        {
2522        *errorcodeptr = ERR52;
2523        goto FAILED;
2524        }
2525    
2526    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2527    
# Line 1623  for (;; ptr++) Line 2537  for (;; ptr++)
2537        {        {
2538        if (previous_callout != NULL)        if (previous_callout != NULL)
2539          {          {
2540          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2541              complete_callout(previous_callout, ptr, cd);
2542          previous_callout = NULL;          previous_callout = NULL;
2543          }          }
2544        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1644  for (;; ptr++) Line 2559  for (;; ptr++)
2559    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2560         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2561      {      {
2562      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2563          complete_callout(previous_callout, ptr, cd);
2564      previous_callout = NULL;      previous_callout = NULL;
2565      }      }
2566    
# Line 1655  for (;; ptr++) Line 2571  for (;; ptr++)
2571      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2572      if (c == '#')      if (c == '#')
2573        {        {
2574        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2575        on the Macintosh. */          {
2576        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2577        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2578          if (*ptr != 0) continue;
2579    
2580          /* Else fall through to handle end of string */
2581          c = 0;
2582        }        }
2583      }      }
2584    
# Line 1672  for (;; ptr++) Line 2592  for (;; ptr++)
2592    
2593    switch(c)    switch(c)
2594      {      {
2595      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2596        case 0:                        /* The branch terminates at string end */
2597      case 0:      case '|':                      /* or | or ) */
     case '|':  
2598      case ')':      case ')':
2599      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2600      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2601      *codeptr = code;      *codeptr = code;
2602      *ptrptr = ptr;      *ptrptr = ptr;
2603        if (lengthptr != NULL)
2604          {
2605          if (OFLOW_MAX - *lengthptr < code - last_code)
2606            {
2607            *errorcodeptr = ERR20;
2608            goto FAILED;
2609            }
2610          *lengthptr += code - last_code;   /* To include callout length */
2611          DPRINTF((">> end branch\n"));
2612          }
2613      return TRUE;      return TRUE;
2614    
2615    
2616        /* ===================================================================*/
2617      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2618      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2619    
# Line 1711  for (;; ptr++) Line 2642  for (;; ptr++)
2642      *code++ = OP_ANY;      *code++ = OP_ANY;
2643      break;      break;
2644    
2645      /* Character classes. If the included characters are all < 255 in value, we  
2646      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2647      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2648      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2649      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2650        map as usual, then invert it at the end. However, we use a different opcode
2651        so that data characters > 255 can be handled correctly.
2652    
2653      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2654      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1730  for (;; ptr++) Line 2663  for (;; ptr++)
2663      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2664    
2665      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2666          check_posix_syntax(ptr, &tempptr, cd))          check_posix_syntax(ptr, &tempptr))
2667        {        {
2668        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2669        goto FAILED;        goto FAILED;
2670        }        }
2671    
2672      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2673        if the first few characters (either before or after ^) are \Q\E or \E we
2674        skip them too. This makes for compatibility with Perl. */
2675    
2676      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2677        for (;;)
2678        {        {
       negate_class = TRUE;  
2679        c = *(++ptr);        c = *(++ptr);
2680          if (c == '\\')
2681            {
2682            if (ptr[1] == 'E') ptr++;
2683              else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2684                else break;
2685            }
2686          else if (!negate_class && c == '^')
2687            negate_class = TRUE;
2688          else break;
2689        }        }
2690      else  
2691        {      /* If a class contains a negative special such as \S, we need to flip the
2692        negate_class = FALSE;      negation flag at the end, so that support for characters > 255 works
2693        }      correctly (they are all included in the class). */
2694    
2695        should_flip_negation = FALSE;
2696    
2697      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2698      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2699      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2700    
2701      class_charcount = 0;      class_charcount = 0;
2702      class_lastchar = -1;      class_lastchar = -1;
2703    
2704        /* Initialize the 32-char bit map to all zeros. We build the map in a
2705        temporary bit of memory, in case the class contains only 1 character (less
2706        than 256), because in that case the compiled code doesn't use the bit map.
2707        */
2708    
2709        memset(classbits, 0, 32 * sizeof(uschar));
2710    
2711  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2712      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2713      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2714        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2715  #endif  #endif
2716    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2717      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2718      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2719      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2720    
2721      do      if (c != 0) do
2722        {        {
2723          const uschar *oldptr;
2724    
2725  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2726        if (utf8 && c > 127)        if (utf8 && c > 127)
2727          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2728          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2729          }          }
2730    
2731          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2732          data and reset the pointer. This is so that very large classes that
2733          contain a zillion UTF-8 characters no longer overwrite the work space
2734          (which is on the stack). */
2735    
2736          if (lengthptr != NULL)
2737            {
2738            *lengthptr += class_utf8data - class_utf8data_base;
2739            class_utf8data = class_utf8data_base;
2740            }
2741    
2742  #endif  #endif
2743    
2744        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
2745    
2746        if (inescq)        if (inescq)
2747          {          {
2748          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2749            {            {
2750            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2751            ptr++;            ptr++;                            /* Skip the 'E' */
2752            continue;            continue;                         /* Carry on with next */
2753            }            }
2754          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2755          }          }
2756    
2757        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1803  for (;; ptr++) Line 2762  for (;; ptr++)
2762    
2763        if (c == '[' &&        if (c == '[' &&
2764            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2765            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr))
2766          {          {
2767          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2768          int posix_class, i;          int posix_class, taboffset, tabopt;
2769          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2770            uschar pbits[32];
2771    
2772          if (ptr[1] != ':')          if (ptr[1] != ':')
2773            {            {
# Line 1819  for (;; ptr++) Line 2779  for (;; ptr++)
2779          if (*ptr == '^')          if (*ptr == '^')
2780            {            {
2781            local_negate = TRUE;            local_negate = TRUE;
2782              should_flip_negation = TRUE;  /* Note negative special */
2783            ptr++;            ptr++;
2784            }            }
2785    
# Line 1836  for (;; ptr++) Line 2797  for (;; ptr++)
2797          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2798            posix_class = 0;            posix_class = 0;
2799    
2800          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2801          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2802          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2803          white space chars afterwards. */          result into the bit map that is being built. */
2804    
2805          posix_class *= 3;          posix_class *= 3;
2806          for (i = 0; i < 3; i++)  
2807            /* Copy in the first table (always present) */
2808    
2809            memcpy(pbits, cbits + posix_class_maps[posix_class],
2810              32 * sizeof(uschar));
2811    
2812            /* If there is a second table, add or remove it as required. */
2813    
2814            taboffset = posix_class_maps[posix_class + 1];
2815            tabopt = posix_class_maps[posix_class + 2];
2816    
2817            if (taboffset >= 0)
2818            {            {
2819            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2820            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2821            else            else
2822              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2823            }            }
2824    
2825            /* Now see if we need to remove any special characters. An option
2826            value of 1 removes vertical space and 2 removes underscore. */
2827    
2828            if (tabopt < 0) tabopt = -tabopt;
2829            if (tabopt == 1) pbits[1] &= ~0x3c;
2830              else if (tabopt == 2) pbits[11] &= 0x7f;
2831    
2832            /* Add the POSIX table or its complement into the main table that is
2833            being built and we are done. */
2834    
2835            if (local_negate)
2836              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2837            else
2838              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2839    
2840          ptr = tempptr + 1;          ptr = tempptr + 1;
2841          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2842          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2843          }          }
2844    
2845        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2846        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2847        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2848        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2849        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2850        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2851    
2852        if (c == '\\')        if (c == '\\')
2853          {          {
2854          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2855            if (*errorcodeptr != 0) goto FAILED;
2856    
2857          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2858          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2859            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2860          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2861            {            {
2862            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1890  for (;; ptr++) Line 2866  for (;; ptr++)
2866            else inescq = TRUE;            else inescq = TRUE;
2867            continue;            continue;
2868            }            }
2869            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2870    
2871          if (c < 0)          if (c < 0)
2872            {            {
2873            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2874            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2875            switch (-c)  
2876              /* Save time by not doing this in the pre-compile phase. */
2877    
2878              if (lengthptr == NULL) switch (-c)
2879              {              {
2880              case ESC_d:              case ESC_d:
2881              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2882              continue;              continue;
2883    
2884              case ESC_D:              case ESC_D:
2885                should_flip_negation = TRUE;
2886              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2887              continue;              continue;
2888    
# Line 1910  for (;; ptr++) Line 2891  for (;; ptr++)
2891              continue;              continue;
2892    
2893              case ESC_W:              case ESC_W:
2894                should_flip_negation = TRUE;
2895              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2896              continue;              continue;
2897    
# Line 1919  for (;; ptr++) Line 2901  for (;; ptr++)
2901              continue;              continue;
2902    
2903              case ESC_S:              case ESC_S:
2904                should_flip_negation = TRUE;
2905              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2906              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2907              continue;              continue;
2908    
2909  #ifdef SUPPORT_UCP              default:    /* Not recognized; fall through */
2910              case ESC_p:              break;      /* Need "default" setting to stop compiler warning. */
2911              case ESC_P:              }
2912    
2913              /* In the pre-compile phase, just do the recognition. */
2914    
2915              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2916                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2917    
2918              /* We need to deal with \H, \h, \V, and \v in both phases because
2919              they use extra memory. */
2920    
2921              if (-c == ESC_h)
2922                {
2923                SETBIT(classbits, 0x09); /* HT */
2924                SETBIT(classbits, 0x20); /* SPACE */
2925                SETBIT(classbits, 0xa0); /* NBSP */
2926    #ifdef SUPPORT_UTF8
2927                if (utf8)
2928                {                {
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
2929                class_utf8 = TRUE;                class_utf8 = TRUE;
2930                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2931                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2932                *class_utf8data++ = property;                *class_utf8data++ = XCL_SINGLE;
2933                class_charcount -= 2;   /* Not a < 256 character */                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2934                  *class_utf8data++ = XCL_RANGE;
2935                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2936                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2937                  *class_utf8data++ = XCL_SINGLE;
2938                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2939                  *class_utf8data++ = XCL_SINGLE;
2940                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2941                  *class_utf8data++ = XCL_SINGLE;
2942                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2943                }                }
             continue;  
2944  #endif  #endif
2945                continue;
2946                }
2947    
2948              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
2949              strict mode. By default, for compatibility with Perl, they are              {
2950              treated as literals. */              for (c = 0; c < 32; c++)
2951                  {
2952                  int x = 0xff;
2953                  switch (c)
2954                    {
2955                    case 0x09/8: x ^= 1 << (0x09%8); break;
2956                    case 0x20/8: x ^= 1 << (0x20%8); break;
2957                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2958                    default: break;
2959                    }
2960                  classbits[c] |= x;
2961                  }
2962    
2963              default:  #ifdef SUPPORT_UTF8
2964              if ((options & PCRE_EXTRA) != 0)              if (utf8)
2965                {                {
2966                *errorcodeptr = ERR7;                class_utf8 = TRUE;
2967                goto FAILED;                *class_utf8data++ = XCL_RANGE;
2968                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2969                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2970                  *class_utf8data++ = XCL_RANGE;
2971                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2972                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2973                  *class_utf8data++ = XCL_RANGE;
2974                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2975                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2976                  *class_utf8data++ = XCL_RANGE;
2977                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2978                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2979                  *class_utf8data++ = XCL_RANGE;
2980                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2981                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2982                  *class_utf8data++ = XCL_RANGE;
2983                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2984                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2985                  *class_utf8data++ = XCL_RANGE;
2986                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2987                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2988                }                }
2989              c = *ptr;              /* The final character */  #endif
2990              class_charcount -= 2;  /* Undo the default count from above */              continue;
2991              }              }
           }  
2992    
2993          /* Fall through if we have a single character (c >= 0). This may be            if (-c == ESC_v)
2994          > 256 in UTF-8 mode. */              {
2995                SETBIT(classbits, 0x0a); /* LF */
2996                SETBIT(classbits, 0x0b); /* VT */
2997                SETBIT(classbits, 0x0c); /* FF */
2998                SETBIT(classbits, 0x0d); /* CR */
2999                SETBIT(classbits, 0x85); /* NEL */
3000    #ifdef SUPPORT_UTF8
3001                if (utf8)
3002                  {
3003                  class_utf8 = TRUE;
3004                  *class_utf8data++ = XCL_RANGE;
3005                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3006                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3007                  }
3008    #endif
3009                continue;
3010                }
3011    
3012          }   /* End of backslash handling */            if (-c == ESC_V)
3013                {
3014                for (c = 0; c < 32; c++)
3015                  {
3016                  int x = 0xff;
3017                  switch (c)
3018                    {
3019                    case 0x0a/8: x ^= 1 << (0x0a%8);
3020                                 x ^= 1 << (0x0b%8);
3021                                 x ^= 1 << (0x0c%8);
3022                                 x ^= 1 << (0x0d%8);
3023                                 break;
3024                    case 0x85/8: x ^= 1 << (0x85%8); break;
3025                    default: break;
3026                    }
3027                  classbits[c] |= x;
3028                  }
3029    
3030        /* A single character may be followed by '-' to form a range. However,  #ifdef SUPPORT_UTF8
3031                if (utf8)
3032                  {
3033                  class_utf8 = TRUE;
3034                  *class_utf8data++ = XCL_RANGE;
3035                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3036                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3037                  *class_utf8data++ = XCL_RANGE;
3038                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3039                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3040                  }
3041    #endif
3042                continue;
3043                }
3044    
3045              /* We need to deal with \P and \p in both phases. */
3046    
3047    #ifdef SUPPORT_UCP
3048              if (-c == ESC_p || -c == ESC_P)
3049                {
3050                BOOL negated;
3051                int pdata;
3052                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3053                if (ptype < 0) goto FAILED;
3054                class_utf8 = TRUE;
3055                *class_utf8data++ = ((-c == ESC_p) != negated)?
3056                  XCL_PROP : XCL_NOTPROP;
3057                *class_utf8data++ = ptype;
3058                *class_utf8data++ = pdata;
3059                class_charcount -= 2;   /* Not a < 256 character */
3060                continue;
3061                }
3062    #endif
3063              /* Unrecognized escapes are faulted if PCRE is running in its
3064              strict mode. By default, for compatibility with Perl, they are
3065              treated as literals. */
3066    
3067              if ((options & PCRE_EXTRA) != 0)
3068                {
3069                *errorcodeptr = ERR7;
3070                goto FAILED;
3071                }
3072    
3073              class_charcount -= 2;  /* Undo the default count from above */
3074              c = *ptr;              /* Get the final character and fall through */
3075              }
3076    
3077            /* Fall through if we have a single character (c >= 0). This may be
3078            greater than 256 in UTF-8 mode. */
3079    
3080            }   /* End of backslash handling */
3081    
3082          /* A single character may be followed by '-' to form a range. However,
3083        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
3084        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
3085          entirely. The code for handling \Q and \E is messy. */
3086    
3087          CHECK_RANGE:
3088          while (ptr[1] == '\\' && ptr[2] == 'E')
3089            {
3090            inescq = FALSE;
3091            ptr += 2;
3092            }
3093    
3094          oldptr = ptr;
3095    
3096          /* Remember \r or \n */
3097    
3098        if (ptr[1] == '-' && ptr[2] != ']')        if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3099    
3100          /* Check for range */
3101    
3102          if (!inescq && ptr[1] == '-')
3103          {          {
3104          int d;          int d;
3105          ptr += 2;          ptr += 2;
3106            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3107    
3108            /* If we hit \Q (not followed by \E) at this point, go into escaped
3109            mode. */
3110    
3111            while (*ptr == '\\' && ptr[1] == 'Q')
3112              {
3113              ptr += 2;
3114              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3115              inescq = TRUE;
3116              break;
3117              }
3118    
3119            if (*ptr == 0 || (!inescq && *ptr == ']'))
3120              {
3121              ptr = oldptr;
3122              goto LONE_SINGLE_CHARACTER;
3123              }
3124    
3125  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3126          if (utf8)          if (utf8)
# Line 1981  for (;; ptr++) Line 3135  for (;; ptr++)
3135          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3136          in such circumstances. */          in such circumstances. */
3137    
3138          if (d == '\\')          if (!inescq && d == '\\')
3139            {            {
3140            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3141            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
3142    
3143            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
3144            was literal */            special means the '-' was literal */
3145    
3146            if (d < 0)            if (d < 0)
3147              {              {
3148              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
3149              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
3150                else if (d == -ESC_R) d = 'R'; else
3151                {                {
3152                ptr = oldptr - 2;                ptr = oldptr;
3153                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3154                }                }
3155              }              }
3156            }            }
3157    
3158          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
3159          the pre-pass. Optimize one-character ranges */          one-character ranges */
3160    
3161            if (d < c)
3162              {
3163              *errorcodeptr = ERR8;
3164              goto FAILED;
3165              }
3166    
3167          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3168    
3169            /* Remember \r or \n */
3170    
3171            if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3172    
3173          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3174          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
3175          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 2022  for (;; ptr++) Line 3187  for (;; ptr++)
3187  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3188            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
3189              {              {
3190              int occ, ocd;              unsigned int occ, ocd;
3191              int cc = c;              unsigned int cc = c;
3192              int origd = d;              unsigned int origd = d;
3193              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
3194                {                {
3195                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
3196                      ocd <= (unsigned int)d)
3197                    continue;                          /* Skip embedded ranges */
3198    
3199                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
3200                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3201                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
3202                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
3203                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
3204                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
3205                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3206                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3207                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3208                  d = ocd;                  d = ocd;
3209                  continue;                  continue;
# Line 2082  for (;; ptr++) Line 3251  for (;; ptr++)
3251          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3252          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3253    
3254          for (; c <= d; c++)          class_charcount += d - c + 1;
3255            class_lastchar = d;
3256    
3257            /* We can save a bit of time by skipping this in the pre-compile. */
3258    
3259            if (lengthptr == NULL) for (; c <= d; c++)
3260            {            {
3261            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3262            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2090  for (;; ptr++) Line 3264  for (;; ptr++)
3264              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3265              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3266              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3267            }            }
3268    
3269          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2115  for (;; ptr++) Line 3287  for (;; ptr++)
3287  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3288          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3289            {            {
3290            int chartype;            unsigned int othercase;
3291            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3292              {              {
3293              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3294              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2143  for (;; ptr++) Line 3313  for (;; ptr++)
3313          }          }
3314        }        }
3315    
3316      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3317      loop. This "while" is the end of the "do" above. */  
3318        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3319    
3320        if (c == 0)                          /* Missing terminating ']' */
3321          {
3322          *errorcodeptr = ERR6;
3323          goto FAILED;
3324          }
3325    
3326    
3327    /* This code has been disabled because it would mean that \s counts as
3328    an explicit \r or \n reference, and that's not really what is wanted. Now
3329    we set the flag only if there is a literal "\r" or "\n" in the class. */
3330    
3331    #if 0
3332        /* Remember whether \r or \n are in this class */
3333    
3334        if (negate_class)
3335          {
3336          if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3337          }
3338        else
3339          {
3340          if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3341          }
3342    #endif
3343    
     while ((c = *(++ptr)) != ']' || inescq);  
3344    
3345      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3346      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
3347      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
3348      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
3349      single-bytes only. This is an historical hangover. Maybe one day we can  
3350      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
3351        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3352        operate on single-bytes only. This is an historical hangover. Maybe one day
3353        we can tidy these opcodes to handle multi-byte characters.
3354    
3355      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
3356      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
# Line 2163  for (;; ptr++) Line 3360  for (;; ptr++)
3360      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3361    
3362  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3363      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
3364            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
3365  #else  #else
3366      if (class_charcount == 1)      if (class_charcount == 1)
3367  #endif  #endif
# Line 2209  for (;; ptr++) Line 3404  for (;; ptr++)
3404      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3405    
3406      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3407      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode, unless there was a negated special
3408      we can omit the bitmap. */      such as \S in the class, because in that case all characters > 255 are in
3409        the class, so any that were explicitly given as well can be ignored. If
3410        (when there are explicit characters > 255 that must be listed) there are no
3411        characters < 256, we can omit the bitmap in the actual compiled code. */
3412    
3413  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3414      if (class_utf8)      if (class_utf8 && !should_flip_negation)
3415        {        {
3416        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3417        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
3418        code += LINK_SIZE;        code += LINK_SIZE;
3419        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3420    
3421        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3422        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3423    
3424        if (class_charcount > 0)        if (class_charcount > 0)
3425          {          {
3426          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3427            memmove(code + 32, code, class_utf8data - code);
3428          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3429          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3430          }          }
3431          else code = class_utf8data;
3432    
3433        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3434    
# Line 2246  for (;; ptr++) Line 3437  for (;; ptr++)
3437        }        }
3438  #endif  #endif
3439    
3440      /* If there are no characters > 255, negate the 32-byte map if necessary,      /* If there are no characters > 255, set the opcode to OP_CLASS or
3441      and copy it into the code vector. If this is the first thing in the branch,      OP_NCLASS, depending on whether the whole class was negated and whether
3442      there can be no first char setting, whatever the repeat count. Any reqbyte      there were negative specials such as \S in the class. Then copy the 32-byte
3443      setting must remain unchanged after any kind of repeat. */      map into the code vector, negating it if necessary. */
3444    
3445        *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3446      if (negate_class)      if (negate_class)
3447        {        {
3448        *code++ = OP_NCLASS;        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3449        for (c = 0; c < 32; c++) code[c] = ~classbits[c];          for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3450        }        }
3451      else      else
3452        {        {
       *code++ = OP_CLASS;  
3453        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
3454        }        }
3455      code += 32;      code += 32;
3456      break;      break;
3457    
3458    
3459        /* ===================================================================*/
3460      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3461      has been tested above. */      has been tested above. */
3462    
# Line 2331  for (;; ptr++) Line 3524  for (;; ptr++)
3524        }        }
3525      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3526    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3527      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3528      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3529      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2378  for (;; ptr++) Line 3557  for (;; ptr++)
3557          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3558          }          }
3559    
3560          /* If the repetition is unlimited, it pays to see if the next thing on
3561          the line is something that cannot possibly match this character. If so,
3562          automatically possessifying this item gains some performance in the case
3563          where the match fails. */
3564    
3565          if (!possessive_quantifier &&
3566              repeat_max < 0 &&
3567              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3568                options, cd))
3569            {
3570            repeat_type = 0;    /* Force greedy */
3571            possessive_quantifier = TRUE;
3572            }
3573    
3574        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3575        }        }
3576    
3577      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3578      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3579      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3580      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3581        currently used only for single-byte chars. */
3582    
3583      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3584        {        {
3585        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3586        c = previous[1];        c = previous[1];
3587          if (!possessive_quantifier &&
3588              repeat_max < 0 &&
3589              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3590            {
3591            repeat_type = 0;    /* Force greedy */
3592            possessive_quantifier = TRUE;
3593            }
3594        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3595        }        }
3596    
# Line 2403  for (;; ptr++) Line 3604  for (;; ptr++)
3604      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3605        {        {
3606        uschar *oldcode;        uschar *oldcode;
3607        int prop_type;        int prop_type, prop_value;
3608        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3609        c = *previous;        c = *previous;
3610    
3611          if (!possessive_quantifier &&
3612              repeat_max < 0 &&
3613              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3614            {
3615            repeat_type = 0;    /* Force greedy */
3616            possessive_quantifier = TRUE;
3617            }
3618    
3619        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3620        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3621          previous[1] : -1;          {
3622            prop_type = previous[1];
3623            prop_value = previous[2];
3624            }
3625          else prop_type = prop_value = -1;
3626    
3627        oldcode = code;        oldcode = code;
3628        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2422  for (;; ptr++) Line 3635  for (;; ptr++)
3635        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3636        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3637    
3638        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3639    
3640        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3641    
# Line 2443  for (;; ptr++) Line 3656  for (;; ptr++)
3656          }          }
3657    
3658        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3659        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3660        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3661        one less than the maximum. */        one less than the maximum. */
3662    
# Line 2470  for (;; ptr++) Line 3683  for (;; ptr++)
3683    
3684          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3685          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3686          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3687          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3688          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3689    
# Line 2486  for (;; ptr++) Line 3699  for (;; ptr++)
3699  #endif  #endif
3700              {              {
3701              *code++ = c;              *code++ = c;
3702              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3703                  {
3704                  *code++ = prop_type;
3705                  *code++ = prop_value;
3706                  }
3707              }              }
3708            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3709            }            }
3710    
3711          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3712          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3713            UPTO is just for 1 instance, we can use QUERY instead. */
3714    
3715          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3716            {            {
# Line 2505  for (;; ptr++) Line 3723  for (;; ptr++)
3723            else            else
3724  #endif  #endif
3725            *code++ = c;            *code++ = c;
3726            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3727                {
3728                *code++ = prop_type;
3729                *code++ = prop_value;
3730                }
3731            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3732            *code++ = OP_UPTO + repeat_type;  
3733            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3734                {
3735                *code++ = OP_QUERY + repeat_type;
3736                }
3737              else
3738                {
3739                *code++ = OP_UPTO + repeat_type;
3740                PUT2INC(code, 0, repeat_max);
3741                }
3742            }            }
3743          }          }
3744    
# Line 2524  for (;; ptr++) Line 3754  for (;; ptr++)
3754  #endif  #endif
3755        *code++ = c;        *code++ = c;
3756    
3757        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3758        defines the required property. */        define the required property. */
3759    
3760  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3761        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3762            {
3763            *code++ = prop_type;
3764            *code++ = prop_value;
3765            }
3766  #endif  #endif
3767        }        }
3768    
# Line 2551  for (;; ptr++) Line 3785  for (;; ptr++)
3785        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3786        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3787    
3788        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3789    
3790        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
3791          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 2571  for (;; ptr++) Line 3805  for (;; ptr++)
3805      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3806      cases. */      cases. */
3807    
3808      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3809               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3810        {        {
3811        register int i;        register int i;
3812        int ketoffset = 0;        int ketoffset = 0;
3813        int len = code - previous;        int len = code - previous;
3814        uschar *bralink = NULL;        uschar *bralink = NULL;
3815    
3816          /* Repeating a DEFINE group is pointless */
3817    
3818          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3819            {
3820            *errorcodeptr = ERR55;
3821            goto FAILED;
3822            }
3823    
3824        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3825        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3826        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2601  for (;; ptr++) Line 3843  for (;; ptr++)
3843    
3844        if (repeat_min == 0)        if (repeat_min == 0)
3845          {          {
3846          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
3847          altogether. */          output altogether, like this:
3848    
3849          if (repeat_max == 0)          ** if (repeat_max == 0)
3850            {          **   {
3851            code = previous;          **   code = previous;
3852            goto END_REPEAT;          **   goto END_REPEAT;
3853            }          **   }
3854    
3855            However, that fails when a group is referenced as a subroutine from
3856            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3857            so that it is skipped on execution. As we don't have a list of which
3858            groups are referenced, we cannot do this selectively.
3859    
3860            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3861            and do no more at this point. However, we do need to adjust any
3862            OP_RECURSE calls inside the group that refer to the group itself or any
3863            internal or forward referenced group, because the offset is from the
3864            start of the whole regex. Temporarily terminate the pattern while doing
3865            this. */
3866    
3867          /* If the maximum is 1 or unlimited, we just have to stick in the          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
         BRAZERO and do no more at this point. However, we do need to adjust  
         any OP_RECURSE calls inside the group that refer to the group itself or  
         any internal group, because the offset is from the start of the whole  
         regex. Temporarily terminate the pattern while doing this. */  
   
         if (repeat_max <= 1)  
3868            {            {
3869            *code = OP_END;            *code = OP_END;
3870            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3871            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3872            code++;            code++;
3873              if (repeat_max == 0)
3874                {
3875                *previous++ = OP_SKIPZERO;
3876                goto END_REPEAT;
3877                }
3878            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
3879            }            }
3880    
# Line 2637  for (;; ptr++) Line 3890  for (;; ptr++)
3890            {            {
3891            int offset;            int offset;
3892            *code = OP_END;            *code = OP_END;
3893            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3894            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3895            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3896            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3910  for (;; ptr++)
3910        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3911        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3912        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3913        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3914          forward reference subroutine calls in the group, there will be entries on
3915          the workspace list; replicate these with an appropriate increment. */
3916    
3917        else        else
3918          {          {
3919          if (repeat_min > 1)          if (repeat_min > 1)
3920            {            {
3921            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3922            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3923              potential integer overflow. */
3924    
3925              if (lengthptr != NULL)
3926                {
3927                int delta = (repeat_min - 1)*length_prevgroup;
3928                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3929                                                                (double)INT_MAX ||
3930                    OFLOW_MAX - *lengthptr < delta)
3931                  {
3932                  *errorcodeptr = ERR20;
3933                  goto FAILED;
3934                  }
3935                *lengthptr += delta;
3936                }
3937    
3938              /* This is compiling for real */
3939    
3940              else
3941              {              {
3942              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3943              code += len;              for (i = 1; i < repeat_min; i++)
3944                  {
3945                  uschar *hc;
3946                  uschar *this_hwm = cd->hwm;
3947                  memcpy(code, previous, len);
3948                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3949                    {
3950                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3951                    cd->hwm += LINK_SIZE;
3952                    }
3953                  save_hwm = this_hwm;
3954                  code += len;
3955                  }
3956              }              }
3957            }            }
3958    
3959          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3960          }          }
3961    
# Line 2677  for (;; ptr++) Line 3963  for (;; ptr++)
3963        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3964        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3965        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3966        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3967          replicate entries on the forward reference list. */
3968    
3969        if (repeat_max >= 0)        if (repeat_max >= 0)
3970          {          {
3971          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3972            just adjust the length as if we had. For each repetition we must add 1
3973            to the length for BRAZERO and for all but the last repetition we must
3974            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3975            paranoid checks to avoid integer overflow. */
3976    
3977            if (lengthptr != NULL && repeat_max > 0)
3978              {
3979              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3980                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3981              if ((double)repeat_max *
3982                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3983                      > (double)INT_MAX ||
3984                  OFLOW_MAX - *lengthptr < delta)
3985                {
3986                *errorcodeptr = ERR20;
3987                goto FAILED;
3988                }
3989              *lengthptr += delta;
3990              }
3991    
3992            /* This is compiling for real */
3993    
3994            else for (i = repeat_max - 1; i >= 0; i--)
3995            {            {
3996              uschar *hc;
3997              uschar *this_hwm = cd->hwm;
3998    
3999            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
4000    
4001            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2698  for (;; ptr++) Line 4011  for (;; ptr++)
4011              }              }
4012    
4013            memcpy(code, previous, len);            memcpy(code, previous, len);
4014              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4015                {
4016                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4017                cd->hwm += LINK_SIZE;
4018                }
4019              save_hwm = this_hwm;
4020            code += len;            code += len;
4021            }            }
4022    
# Line 2720  for (;; ptr++) Line 4039  for (;; ptr++)
4039        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
4040        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
4041        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
4042        correct offset was computed above. */        correct offset was computed above.
4043    
4044          Then, when we are doing the actual compile phase, check to see whether
4045          this group is a non-atomic one that could match an empty string. If so,
4046          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4047          that runtime checking can be done. [This check is also applied to
4048          atomic groups at runtime, but in a different way.] */
4049    
4050        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
4051            {
4052            uschar *ketcode = code - ketoffset;
4053            uschar *bracode = ketcode - GET(ketcode, 1);
4054            *ketcode = OP_KETRMAX + repeat_type;
4055            if (lengthptr == NULL && *bracode != OP_ONCE)
4056              {
4057              uschar *scode = bracode;
4058              do
4059                {
4060                if (could_be_empty_branch(scode, ketcode, utf8))
4061                  {
4062                  *bracode += OP_SBRA - OP_BRA;
4063                  break;
4064                  }
4065                scode += GET(scode, 1);
4066                }
4067              while (*scode == OP_ALT);
4068              }
4069            }
4070        }        }
4071    
4072      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2733  for (;; ptr++) Line 4077  for (;; ptr++)
4077        goto FAILED;        goto FAILED;
4078        }        }
4079    
4080      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
4081      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
4082      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
4083      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4084      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
4085        but the special opcodes can optimize it a bit. The repeated item starts at
4086        tempcode, not at previous, which might be the first part of a string whose
4087        (former) last char we repeated.
4088    
4089        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4090        an 'upto' may follow. We skip over an 'exact' item, and then test the
4091        length of what remains before proceeding. */
4092    
4093      if (possessive_quantifier)      if (possessive_quantifier)
4094        {        {
4095        int len = code - tempcode;        int len;
4096        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4097        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
4098        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode] +
4099        tempcode[0] = OP_ONCE;            ((*tempcode == OP_TYPEEXACT &&
4100        *code++ = OP_KET;               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4101        PUTINC(code, 0, len);        len = code - tempcode;
4102        PUT(tempcode, 1, len);        if (len > 0) switch (*tempcode)
4103            {
4104            case OP_STAR:  *tempcode = OP_POSSTAR; break;
4105            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4106            case OP_QUERY: *tempcode = OP_POSQUERY; break;
4107            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4108    
4109            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4110            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4111            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4112            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4113    
4114            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4115            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4116            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4117            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4118    
4119            default:
4120            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4121            code += 1 + LINK_SIZE;
4122            len += 1 + LINK_SIZE;
4123            tempcode[0] = OP_ONCE;
4124            *code++ = OP_KET;
4125            PUTINC(code, 0, len);
4126            PUT(tempcode, 1, len);
4127            break;
4128            }
4129        }        }
4130    
4131      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2761  for (;; ptr++) Line 4138  for (;; ptr++)
4138      break;      break;
4139    
4140    
4141      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
4142      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
4143      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
4144      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
4145    
4146      case '(':      case '(':
4147      newoptions = options;      newoptions = options;
4148      skipbytes = 0;      skipbytes = 0;
4149        bravalue = OP_CBRA;
4150        save_hwm = cd->hwm;
4151        reset_bracount = FALSE;
4152    
4153        /* First deal with various "verbs" that can be introduced by '*'. */
4154    
4155        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4156          {
4157          int i, namelen;
4158          const char *vn = verbnames;
4159          const uschar *name = ++ptr;
4160          previous = NULL;
4161          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4162          if (*ptr == ':')
4163            {
4164            *errorcodeptr = ERR59;   /* Not supported */
4165            goto FAILED;
4166            }
4167          if (*ptr != ')')
4168            {
4169            *errorcodeptr = ERR60;
4170            goto FAILED;
4171            }
4172          namelen = ptr - name;
4173          for (i = 0; i < verbcount; i++)
4174            {
4175            if (namelen == verbs[i].len &&
4176                strncmp((char *)name, vn, namelen) == 0)
4177              {
4178              *code = verbs[i].op;
4179              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4180              break;
4181              }
4182            vn += verbs[i].len + 1;
4183            }
4184          if (i < verbcount) continue;
4185          *errorcodeptr = ERR60;
4186          goto FAILED;
4187          }
4188    
4189      if (*(++ptr) == '?')      /* Deal with the extended parentheses; all are introduced by '?', and the
4190        appearance of any of them means that this is not a capturing group. */
4191    
4192        else if (*ptr == '?')
4193        {        {
4194        int set, unset;        int i, set, unset, namelen;
4195        int *optset;        int *optset;
4196          const uschar *name;
4197          uschar *slot;
4198    
4199        switch (*(++ptr))        switch (*(++ptr))
4200          {          {
4201          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
4202          ptr++;          ptr++;
4203          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
4204            if (*ptr == 0)
4205              {
4206              *errorcodeptr = ERR18;
4207              goto FAILED;
4208              }
4209          continue;          continue;
4210    
4211          case ':':                 /* Non-extracting bracket */  
4212            /* ------------------------------------------------------------ */
4213            case '|':                 /* Reset capture count for each branch */
4214            reset_bracount = TRUE;
4215            /* Fall through */
4216    
4217            /* ------------------------------------------------------------ */
4218            case ':':                 /* Non-capturing bracket */
4219          bravalue = OP_BRA;          bravalue = OP_BRA;
4220          ptr++;          ptr++;
4221          break;          break;
4222    
4223    
4224            /* ------------------------------------------------------------ */
4225          case '(':          case '(':
4226          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4227    
4228          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
4229            group), a name (referring to a named group), or 'R', referring to
4230            recursion. R<digits> and R&name are also permitted for recursion tests.
4231    
4232            There are several syntaxes for testing a named group: (?(name)) is used
4233            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4234    
4235            There are two unfortunate ambiguities, caused by history. (a) 'R' can
4236            be the recursive thing or the name 'R' (and similarly for 'R' followed
4237            by digits), and (b) a number could be a name that consists of digits.
4238            In both cases, we look for a name first; if not found, we try the other
4239            cases. */
4240    
4241            /* For conditions that are assertions, check the syntax, and then exit
4242            the switch. This will take control down to where bracketed groups,
4243            including assertions, are processed. */
4244    
4245            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4246              break;
4247    
4248            /* Most other conditions use OP_CREF (a couple change to OP_RREF
4249            below), and all need to skip 3 bytes at the start of the group. */
4250    
4251            code[1+LINK_SIZE] = OP_CREF;
4252            skipbytes = 3;
4253            refsign = -1;
4254    
4255            /* Check for a test for recursion in a named group. */
4256    
4257          if (ptr[1] == 'R')          if (ptr[1] == 'R' && ptr[2] == '&')
4258            {            {
4259            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4260            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4261            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4262            }            }
4263    
4264          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4265          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
4266    
4267          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
4268            {            {
4269            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4270            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4271            }            }
4272          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
4273          set bravalue above. */            {
4274          break;            terminator = '\'';
4275              ptr++;
4276          case '=':                 /* Positive lookahead */            }
4277          bravalue = OP_ASSERT;          else
4278          ptr++;            {
4279          break;            terminator = 0;
4280              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4281              }
4282    
4283          case '!':                 /* Negative lookahead */          /* We now expect to read a name; any thing else is an error */
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
4284    
4285          case '<':                 /* Lookbehinds */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
         switch (*(++ptr))  
4286            {            {
4287            case '=':               /* Positive lookbehind */            ptr += 1;  /* To get the right offset */
4288            bravalue = OP_ASSERTBACK;            *errorcodeptr = ERR28;
4289            ptr++;            goto FAILED;
4290            break;            }
4291    
4292            case '!':               /* Negative lookbehind */          /* Read the name, but also get it as a number if it's all digits */
4293            bravalue = OP_ASSERTBACK_NOT;  
4294            recno = 0;
4295            name = ++ptr;
4296            while ((cd->ctypes[*ptr] & ctype_word) != 0)
4297              {
4298              if (recno >= 0)
4299                recno = ((digitab[*ptr] & ctype_digit) != 0)?
4300                  recno * 10 + *ptr - '0' : -1;
4301            ptr++;            ptr++;
           break;  
4302            }            }
4303          break;          namelen = ptr - name;
4304    
4305          case '>':                 /* One-time brackets */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4306          bravalue = OP_ONCE;            {
4307          ptr++;            ptr--;      /* Error offset */
4308          break;            *errorcodeptr = ERR26;
4309              goto FAILED;
4310              }
4311    
4312          case 'C':                 /* Callout - may be followed by digits; */          /* Do no further checking in the pre-compile phase. */
4313          previous_callout = code;  /* Save for later completion */  
4314          after_manual_callout = 1; /* Skip one item before completing */          if (lengthptr != NULL) break;
4315          *code++ = OP_CALLOUT;     /* Already checked that the terminating */  
4316            {                       /* closing parenthesis is present. */          /* In the real compile we do the work of looking for the actual
4317            int n = 0;          reference. If the string started with "+" or "-" we require the rest to
4318            while ((digitab[*(++ptr)] & ctype_digit) != 0)          be digits, in which case recno will be set. */
4319              n = n * 10 + *ptr - '0';  
4320            if (n > 255)          if (refsign > 0)
4321              {
4322              if (recno <= 0)
4323              {              {
4324              *errorcodeptr = ERR38;              *errorcodeptr = ERR58;
4325              goto FAILED;              goto FAILED;
4326              }              }
4327            *code++ = n;            recno = (refsign == '-')?
4328            PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */              cd->bracount - recno + 1 : recno +cd->bracount;
4329            PUT(code, LINK_SIZE, 0);                    /* Default length */            if (recno <= 0 || recno > cd->final_bracount)
4330            code += 2 * LINK_SIZE;              {
4331                *errorcodeptr = ERR15;
4332                goto FAILED;
4333                }
4334              PUT2(code, 2+LINK_SIZE, recno);
4335              break;