/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 79 by nigel, Sat Feb 24 21:40:52 2007 UTC revision 359 by ph10, Wed Jul 9 16:20:19 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56    /* When DEBUG is defined, we need the pcre_printint() function, which is also
57    used by pcretest. DEBUG is not defined when building a production library. */
58    
59    #ifdef DEBUG
60    #include "pcre_printint.src"
61    #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. The first argument is
the bitmap (an array of bytes) and the second is the bit number. Both arguments
and the whole expansion are parenthesized so that expression arguments, for
example SETBIT(map, c + 1), index and shift as intended. Note that the bit
number is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 87  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 106  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143  /* Tables of names of POSIX character classes and their lengths. The list is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144  terminated by a zero length entry. The first three must be alpha, upper, lower,  searched linearly. Put all the names into a single string, in order to reduce
145  as this is assumed for handling case independence. */  the number of relocations when a shared library is dynamically linked. */
146    
147  static const char *const posix_names[] = {  typedef struct verbitem {
148    "alpha", "lower", "upper",    int   len;
149    "alnum", "ascii", "blank", "cntrl", "digit", "graph",    int   op;
150    "print", "punct", "space", "word",  "xdigit" };  } verbitem;
151    
152    static const char verbnames[] =
153      "ACCEPT\0"
154      "COMMIT\0"
155      "F\0"
156      "FAIL\0"
157      "PRUNE\0"
158      "SKIP\0"
159      "THEN";
160    
161    static const verbitem verbs[] = {
162      { 6, OP_ACCEPT },
163      { 6, OP_COMMIT },
164      { 1, OP_FAIL },
165      { 4, OP_FAIL },
166      { 5, OP_PRUNE },
167      { 4, OP_SKIP  },
168      { 4, OP_THEN  }
169    };
170    
171    static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174    /* Tables of names of POSIX character classes and their lengths. The names are
175    now all in a single string, to reduce the number of relocations when a shared
176    library is dynamically loaded. The list of lengths is terminated by a zero
177    length entry. The first three must be alpha, lower, upper, as this is assumed
178    for handling case independence. */
179    
180    static const char posix_names[] =
181      "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
182      "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
183      "word\0"   "xdigit";
184    
185  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
186    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
189  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
190  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
191    characters are removed, and for [:alpha:] and [:alnum:] the underscore
192    character is removed. The triples in the table consist of the base map offset,
193    second map offset or -1 if no second map, and a non-negative value for map
194    addition or a negative value for map subtraction (if there are two maps). The
195    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196    remove vertical space characters, 2 => remove underscore. */
197    
198  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
199    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
200    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
201    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
202    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
203    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
204    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
205    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
206    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
207    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
208    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
209    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
210    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
211    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
212    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
213  };  };
214    
215    
216  /* The texts of compile-time error messages. These are "char *" because they  #define STRING(a)  # a
217  are passed to the outside world. */  #define XSTRING(s) STRING(s)
218    
219  static const char *error_texts[] = {  /* The texts of compile-time error messages. These are "char *" because they
220    "no error",  are passed to the outside world. Do not ever re-use any error number, because
221    "\\ at end of pattern",  they are documented. Always add a new error instead. Messages marked DEAD below
222    "\\c at end of pattern",  are no longer used. This used to be a table of strings, but in order to reduce
223    "unrecognized character follows \\",  the number of relocations needed when a shared library is loaded dynamically,
224    "numbers out of order in {} quantifier",  it is now one long string. We cannot use a table of offsets, because the
225    lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226    simply count through to the one we want - this isn't a performance issue
227    because these strings are used only when there is a compilation error. */
228    
229    static const char error_texts[] =
230      "no error\0"
231      "\\ at end of pattern\0"
232      "\\c at end of pattern\0"
233      "unrecognized character follows \\\0"
234      "numbers out of order in {} quantifier\0"
235    /* 5 */    /* 5 */
236    "number too big in {} quantifier",    "number too big in {} quantifier\0"
237    "missing terminating ] for character class",    "missing terminating ] for character class\0"
238    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
239    "range out of order in character class",    "range out of order in character class\0"
240    "nothing to repeat",    "nothing to repeat\0"
241    /* 10 */    /* 10 */
242    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
243    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
244    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
245    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
246    "missing )",    "missing )\0"
247    /* 15 */    /* 15 */
248    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
249    "erroffset passed as NULL",    "erroffset passed as NULL\0"
250    "unknown option bit(s) set",    "unknown option bit(s) set\0"
251    "missing ) after comment",    "missing ) after comment\0"
252    "parentheses nested too deeply",    "parentheses nested too deeply\0"  /** DEAD **/
253    /* 20 */    /* 20 */
254    "regular expression too large",    "regular expression is too large\0"
255    "failed to get memory",    "failed to get memory\0"
256    "unmatched parentheses",    "unmatched parentheses\0"
257    "internal error: code overflow",    "internal error: code overflow\0"
258    "unrecognized character after (?<",    "unrecognized character after (?<\0"
259    /* 25 */    /* 25 */
260    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
261    "malformed number after (?(",    "malformed number or name after (?(\0"
262    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
263    "assertion expected after (?(",    "assertion expected after (?(\0"
264    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
265    /* 30 */    /* 30 */
266    "unknown POSIX class name",    "unknown POSIX class name\0"
267    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
268    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269    "spare error",    "spare error\0"  /** DEAD **/
270    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
271    /* 35 */    /* 35 */
272    "invalid condition (?(0)",    "invalid condition (?(0)\0"
273    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
274    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275    "number after (?C is > 255",    "number after (?C is > 255\0"
276    "closing ) for (?C expected",    "closing ) for (?C expected\0"
277    /* 40 */    /* 40 */
278    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
279    "unrecognized character after (?P",    "unrecognized character after (?P\0"
280    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)\0"
281    "two named groups have the same name",    "two named subpatterns have the same name\0"
282    "invalid UTF-8 string",    "invalid UTF-8 string\0"
283    /* 45 */    /* 45 */
284    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
285    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
286    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p\0"
287  };    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289      /* 50 */
290      "repeated subpattern is too long\0"    /** DEAD **/
291      "octal value is greater than \\377 (not in UTF-8 mode)\0"
292      "internal error: overran compiling workspace\0"
293      "internal error: previously-checked referenced subpattern not found\0"
294      "DEFINE group contains more than one branch\0"
295      /* 55 */
296      "repeating a DEFINE group is not allowed\0"
297      "inconsistent NEWLINE options\0"
298      "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299      "a numbered reference must not be zero\0"
300      "(*VERB) with an argument is not supported\0"
301      /* 60 */
302      "(*VERB) not recognized\0"
303      "number is too big\0"
304      "subpattern name expected\0"
305      "digit expected after (?+\0"
306      "] is an invalid data character in JavaScript compatibility mode";
307    
308    
309  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 220  For convenience, we use the same bit def Line 322  For convenience, we use the same bit def
322    
323  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
324    
325  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
326  static const unsigned char digitab[] =  static const unsigned char digitab[] =
327    {    {
328    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 358  static const unsigned char digitab[] =
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
359    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360    
361  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
362  static const unsigned char digitab[] =  static const unsigned char digitab[] =
363    {    {
364    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 372  static const unsigned char digitab[] =
372    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
373    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
374    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
375    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
376    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
377    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
378    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 406  static const unsigned char ebcdic_charta
406    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
407    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
408    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
409    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
410    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
411    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
412    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 433  static const unsigned char ebcdic_charta
433  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
434    
435  static BOOL  static BOOL
436    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
437      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
438    
439    
440    
441    /*************************************************
442    *            Find an error text                  *
443    *************************************************/
444    
445    /* The error texts are now all in one long string, to save on relocations. As
446    some of the text is of unknown length, we can't use a table of offsets.
447    Instead, just count through the strings. This is not a performance issue
448    because it happens only when there has been a compilation error.
449    
450    Argument:   the error number
451    Returns:    pointer to the error string
452    */
453    
454    static const char *
455    find_error_text(int n)
456    {
457    const char *s = error_texts;
458    for (; n > 0; n--) while (*s++ != 0);
459    return s;
460    }
461    
462    
463  /*************************************************  /*************************************************
# Line 342  static BOOL Line 466  static BOOL
466    
467  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
468  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
469  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
470  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472    ptr is pointing at the \. On exit, it is on the final character of the escape
473    sequence.
474    
475  Arguments:  Arguments:
476    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 355  Arguments: Line 481  Arguments:
481    
482  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
483                   negative => a special escape sequence                   negative => a special escape sequence
484                   on error, errorptr is set                   on error, errorcodeptr is set
485  */  */
486    
487  static int  static int
488  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
489    int options, BOOL isclass)    int options, BOOL isclass)
490  {  {
491  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
492    const uschar *ptr = *ptrptr + 1;
493  int c, i;  int c, i;
494    
495    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
496    ptr--;                            /* Set pointer back to the last byte */
497    
498  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
499    
 c = *(++ptr);  
500  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
501    
502  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
504  Otherwise further processing may be required. */  Otherwise further processing may be required. */
505    
506  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
507  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
508  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
509    
510  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
511  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
512  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
513  #endif  #endif
514    
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 517  else if ((i = escapes[c - 0x48]) != 0)
517  else  else
518    {    {
519    const uschar *oldptr;    const uschar *oldptr;
520      BOOL braced, negated;
521    
522    switch (c)    switch (c)
523      {      {
524      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 532  else
532      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
533      break;      break;
534    
535        /* \g must be followed by one of a number of specific things:
536    
537        (1) A number, either plain or braced. If positive, it is an absolute
538        backreference. If negative, it is a relative backreference. This is a Perl
539        5.10 feature.
540    
541        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542        is part of Perl's movement towards a unified syntax for back references. As
543        this is synonymous with \k{name}, we fudge it up by pretending it really
544        was \k.
545    
546        (3) For Oniguruma compatibility we also support \g followed by a name or a
547        number either in angle brackets or in single quotes. However, these are
548        (possibly recursive) subroutine calls, _not_ backreferences. Just return
549        the -ESC_g code (cf \k). */
550    
551        case 'g':
552        if (ptr[1] == '<' || ptr[1] == '\'')
553          {
554          c = -ESC_g;
555          break;
556          }
557    
558        /* Handle the Perl-compatible cases */
559    
560        if (ptr[1] == '{')
561          {
562          const uschar *p;
563          for (p = ptr+2; *p != 0 && *p != '}'; p++)
564            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565          if (*p != 0 && *p != '}')
566            {
567            c = -ESC_k;
568            break;
569            }
570          braced = TRUE;
571          ptr++;
572          }
573        else braced = FALSE;
574    
575        if (ptr[1] == '-')
576          {
577          negated = TRUE;
578          ptr++;
579          }
580        else negated = FALSE;
581    
582        c = 0;
583        while ((digitab[ptr[1]] & ctype_digit) != 0)
584          c = c * 10 + *(++ptr) - '0';
585    
586        if (c < 0)   /* Integer overflow */
587          {
588          *errorcodeptr = ERR61;
589          break;
590          }
591    
592        if (braced && *(++ptr) != '}')
593          {
594          *errorcodeptr = ERR57;
595          break;
596          }
597    
598        if (c == 0)
599          {
600          *errorcodeptr = ERR58;
601          break;
602          }
603    
604        if (negated)
605          {
606          if (c > bracount)
607            {
608            *errorcodeptr = ERR15;
609            break;
610            }
611          c = bracount - (c - 1);
612          }
613    
614        c = -(ESC_REF + c);
615        break;
616    
617      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
618      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
619      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 422  else Line 635  else
635        c -= '0';        c -= '0';
636        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
637          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
638          if (c < 0)    /* Integer overflow */
639            {
640            *errorcodeptr = ERR61;
641            break;
642            }
643        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
644          {          {
645          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 442  else Line 660  else
660        }        }
661    
662      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
663      larger first octal digit. */      larger first octal digit. The original code used just to take the least
664        significant 8 bits of octal numbers (I think this is what early Perls used
665        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
666        than 3 octal digits. */
667    
668      case '0':      case '0':
669      c -= '0';      c -= '0';
670      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
671          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
672      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
673      break;      break;
674    
675      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
676      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
677        treated as a data character. */
678    
679      case 'x':      case 'x':
680  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
681        {        {
682        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
683        register int count = 0;        int count = 0;
684    
685        c = 0;        c = 0;
686        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
687          {          {
688          int cc = *pt++;          register int cc = *pt++;
689            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
690          count++;          count++;
691  #if !EBCDIC    /* ASCII coding */  
692    #ifndef EBCDIC  /* ASCII coding */
693          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
694          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
696          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
697          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698  #endif  #endif
699          }          }
700    
701        if (*pt == '}')        if (*pt == '}')
702          {          {
703          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
704          ptr = pt;          ptr = pt;
705          break;          break;
706          }          }
707    
708        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
709        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
710        }        }
 #endif  
711    
712      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
713    
714      c = 0;      c = 0;
715      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
716        {        {
717        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
718        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
719  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
720        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
721        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
723        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
724        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725  #endif  #endif
726        }        }
727      break;      break;
728    
729      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730        This coding is ASCII-specific, but then the whole concept of \cx is
731        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732    
733      case 'c':      case 'c':
734      c = *(++ptr);      c = *(++ptr);
735      if (c == 0)      if (c == 0)
736        {        {
737        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
738        return 0;        break;
739        }        }
740    
741      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
742      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
743      c ^= 0x40;      c ^= 0x40;
744  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
745      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
746      c ^= 0xC0;      c ^= 0xC0;
747  #endif  #endif
748      break;      break;
749    
750      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
752      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
753      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
754      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
755    
756      default:      default:
757      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 560  escape sequence. Line 783  escape sequence.
783  Argument:  Argument:
784    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
785    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
786      dptr           points to an int that is set to the detailed property value
787    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
788    
789  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
790  */  */
791    
792  static int  static int
793  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
794  {  {
795  int c, i, bot, top;  int c, i, bot, top;
796  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
797  char name[4];  char name[32];
798    
799  c = *(++ptr);  c = *(++ptr);
800  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
801    
802  *negptr = FALSE;  *negptr = FALSE;
803    
804  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
805  preceded by ^ for negation. */  negation. */
806    
807  if (c == '{')  if (c == '{')
808    {    {
# Line 587  if (c == '{') Line 811  if (c == '{')
811      *negptr = TRUE;      *negptr = TRUE;
812      ptr++;      ptr++;
813      }      }
814    for (i = 0; i <= 2; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
815      {      {
816      c = *(++ptr);      c = *(++ptr);
817      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
818      if (c == '}') break;      if (c == '}') break;
819      name[i] = c;      name[i] = c;
820      }      }
821    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
822    name[i] = 0;    name[i] = 0;
823    }    }
824    
# Line 619  top = _pcre_utt_size; Line 839  top = _pcre_utt_size;
839    
840  while (bot < top)  while (bot < top)
841    {    {
842    i = (bot + top)/2;    i = (bot + top) >> 1;
843    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
844    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
845        {
846        *dptr = _pcre_utt[i].value;
847        return _pcre_utt[i].type;
848        }
849    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
850    }    }
851    
 UNKNOWN_RETURN:  
852  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
853  *ptrptr = ptr;  *ptrptr = ptr;
854  return -1;  return -1;
# Line 698  read_repeat_counts(const uschar *p, int Line 921  read_repeat_counts(const uschar *p, int
921  int min = 0;  int min = 0;
922  int max = -1;  int max = -1;
923    
924    /* Read the minimum value and do a paranoid check: a negative value indicates
925    an integer overflow. */
926    
927  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
928    if (min < 0 || min > 65535)
929      {
930      *errorcodeptr = ERR5;
931      return p;
932      }
933    
934    /* Read the maximum value if there is one, and again do a paranoid check on its size.
935    Also, max must not be less than min. */
936    
937  if (*p == '}') max = min; else  if (*p == '}') max = min; else
938    {    {
# Line 706  if (*p == '}') max = min; else Line 940  if (*p == '}') max = min; else
940      {      {
941      max = 0;      max = 0;
942      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
943        if (max < 0 || max > 65535)
944          {
945          *errorcodeptr = ERR5;
946          return p;
947          }
948      if (max < min)      if (max < min)
949        {        {
950        *errorcodeptr = ERR4;        *errorcodeptr = ERR4;
# Line 714  if (*p == '}') max = min; else Line 953  if (*p == '}') max = min; else
953      }      }
954    }    }
955    
956  /* Do paranoid checks, then fill in the required variables, and pass back the  /* Fill in the required variables, and pass back the pointer to the terminating
957  pointer to the terminating '}'. */  '}'. */
958    
959  if (min > 65535 || max > 65535)  *minp = min;
960    *errorcodeptr = ERR5;  *maxp = max;
961  else  return p;
962    }
963    
964    
965    
966    /*************************************************
967    *       Find forward referenced subpattern       *
968    *************************************************/
969    
970    /* This function scans along a pattern's text looking for capturing
971    subpatterns, and counting them. If it finds a named pattern that matches the
972    name it is given, it returns its number. Alternatively, if the name is NULL, it
973    returns when it reaches a given numbered subpattern. This is used for forward
974    references to subpatterns. We know that if (?P< is encountered, the name will
975    be terminated by '>' because that is checked in the first pass.
976    
977    Arguments:
978      ptr          current position in the pattern
979      cd           compile background data
980      name         name to seek, or NULL if seeking a numbered subpattern
981      lorn         name length, or subpattern number if name is NULL
982      xmode        TRUE if we are in /x mode
983    
984    Returns:       the number of the matching subpattern, or -1 if not found
985    */
986    
987    static int
988    find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
989      BOOL xmode)
990    {
991    const uschar *thisname;
992    int count = cd->bracount;
993    
994    for (; *ptr != 0; ptr++)
995    {    {
996    *minp = min;    int term;
997    *maxp = max;  
998      /* Skip over backslashed characters and also entire \Q...\E */
999    
1000      if (*ptr == '\\')
1001        {
1002        if (*(++ptr) == 0) return -1;
1003        if (*ptr == 'Q') for (;;)
1004          {
1005          while (*(++ptr) != 0 && *ptr != '\\');
1006          if (*ptr == 0) return -1;
1007          if (*(++ptr) == 'E') break;
1008          }
1009        continue;
1010        }
1011    
1012      /* Skip over character classes; this logic must be similar to the way they
1013      are handled for real. If the first character is '^', skip it. Also, if the
1014      first few characters (either before or after ^) are \Q\E or \E we skip them
1015      too. This makes for compatibility with Perl. */
1016    
1017      if (*ptr == '[')
1018        {
1019        BOOL negate_class = FALSE;
1020        for (;;)
1021          {
1022          int c = *(++ptr);
1023          if (c == '\\')
1024            {
1025            if (ptr[1] == 'E') ptr++;
1026              else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
1027                else break;
1028            }
1029          else if (!negate_class && c == '^')
1030            negate_class = TRUE;
1031          else break;
1032          }
1033    
1034        /* If the next character is ']', it is a data character that must be
1035        skipped, except in JavaScript compatibility mode. */
1036    
1037        if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1038          ptr++;
1039    
1040        while (*(++ptr) != ']')
1041          {
1042          if (*ptr == 0) return -1;
1043          if (*ptr == '\\')
1044            {
1045            if (*(++ptr) == 0) return -1;
1046            if (*ptr == 'Q') for (;;)
1047              {
1048              while (*(++ptr) != 0 && *ptr != '\\');
1049              if (*ptr == 0) return -1;
1050              if (*(++ptr) == 'E') break;
1051              }
1052            continue;
1053            }
1054          }
1055        continue;
1056        }
1057    
1058      /* Skip comments in /x mode */
1059    
1060      if (xmode && *ptr == '#')
1061        {
1062        while (*(++ptr) != 0 && *ptr != '\n');
1063        if (*ptr == 0) return -1;
1064        continue;
1065        }
1066    
1067      /* An opening parenthesis must now be a real metacharacter */
1068    
1069      if (*ptr != '(') continue;
1070      if (ptr[1] != '?' && ptr[1] != '*')
1071        {
1072        count++;
1073        if (name == NULL && count == lorn) return count;
1074        continue;
1075        }
1076    
1077      ptr += 2;
1078      if (*ptr == 'P') ptr++;                      /* Allow optional P */
1079    
1080      /* We have to disambiguate (?<! and (?<= from (?<name> */
1081    
1082      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1083           *ptr != '\'')
1084        continue;
1085    
1086      count++;
1087    
1088      if (name == NULL && count == lorn) return count;
1089      term = *ptr++;
1090      if (term == '<') term = '>';
1091      thisname = ptr;
1092      while (*ptr != term) ptr++;
1093      if (name != NULL && lorn == ptr - thisname &&
1094          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1095        return count;
1096    }    }
1097  return p;  
1098    return -1;
1099  }  }
1100    
1101    
# Line 778  for (;;) Line 1149  for (;;)
1149    
1150      case OP_CALLOUT:      case OP_CALLOUT:
1151      case OP_CREF:      case OP_CREF:
1152      case OP_BRANUMBER:      case OP_RREF:
1153        case OP_DEF:
1154      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1155      break;      break;
1156    
# Line 823  for (;;) Line 1195  for (;;)
1195    {    {
1196    int d;    int d;
1197    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
   
1198    switch (op)    switch (op)
1199      {      {
1200        case OP_CBRA:
1201      case OP_BRA:      case OP_BRA:
1202      case OP_ONCE:      case OP_ONCE:
1203      case OP_COND:      case OP_COND:
1204      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1205      if (d < 0) return d;      if (d < 0) return d;
1206      branchlength += d;      branchlength += d;
1207      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 865  for (;;) Line 1236  for (;;)
1236      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1237    
1238      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1239      case OP_CREF:      case OP_CREF:
1240        case OP_RREF:
1241        case OP_DEF:
1242      case OP_OPT:      case OP_OPT:
1243      case OP_CALLOUT:      case OP_CALLOUT:
1244      case OP_SOD:      case OP_SOD:
# Line 884  for (;;) Line 1256  for (;;)
1256    
1257      case OP_CHAR:      case OP_CHAR:
1258      case OP_CHARNC:      case OP_CHARNC:
1259        case OP_NOT:
1260      branchlength++;      branchlength++;
1261      cc += 2;      cc += 2;
1262  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 910  for (;;) Line 1283  for (;;)
1283    
1284      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1285      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1286        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1287      cc += 4;      cc += 4;
1288      break;      break;
1289    
# Line 917  for (;;) Line 1291  for (;;)
1291    
1292      case OP_PROP:      case OP_PROP:
1293      case OP_NOTPROP:      case OP_NOTPROP:
1294      cc++;      cc += 2;
1295      /* Fall through */      /* Fall through */
1296    
1297      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 927  for (;;) Line 1301  for (;;)
1301      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1302      case OP_WORDCHAR:      case OP_WORDCHAR:
1303      case OP_ANY:      case OP_ANY:
1304        case OP_ALLANY:
1305      branchlength++;      branchlength++;
1306      cc++;      cc++;
1307      break;      break;
# Line 998  Returns:      pointer to the opcode for Line 1373  Returns:      pointer to the opcode for
1373  static const uschar *  static const uschar *
1374  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1375  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1376  for (;;)  for (;;)
1377    {    {
1378    register int c = *code;    register int c = *code;
1379    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1380    else if (c > OP_BRA)  
1381      /* XCLASS is used for classes that cannot be represented just by a bit
1382      map. This includes negated single high-valued characters. The length in
1383      the table is zero; the actual length is stored in the compiled code. */
1384    
1385      if (c == OP_XCLASS) code += GET(code, 1);
1386    
1387      /* Handle capturing bracket */
1388    
1389      else if (c == OP_CBRA)
1390      {      {
1391      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1392      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1393      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1394      }      }
1395    
1396      /* Otherwise, we can get the item's length from the table, except that for
1397      repeated character types, we have to test for \p and \P, which have an extra
1398      two bytes of parameters. */
1399    
1400    else    else
1401      {      {
1402      code += _pcre_OP_lengths[c];      switch(c)
1403          {
1404          case OP_TYPESTAR:
1405          case OP_TYPEMINSTAR:
1406          case OP_TYPEPLUS:
1407          case OP_TYPEMINPLUS:
1408          case OP_TYPEQUERY:
1409          case OP_TYPEMINQUERY:
1410          case OP_TYPEPOSSTAR:
1411          case OP_TYPEPOSPLUS:
1412          case OP_TYPEPOSQUERY:
1413          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1414          break;
1415    
1416  #ifdef SUPPORT_UTF8        case OP_TYPEUPTO:
1417          case OP_TYPEMINUPTO:
1418          case OP_TYPEEXACT:
1419          case OP_TYPEPOSUPTO:
1420          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1421          break;
1422          }
1423    
1424      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* Add in the fixed length from the table */
1425      by a multi-byte character. The length in the table is a minimum, so we have  
1426      to scan along to skip the extra bytes. All opcodes are less than 128, so we      code += _pcre_OP_lengths[c];
     can use relatively efficient code. */  
1427    
1428      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1429      a multi-byte character. The length in the table is a minimum, so we have to
1430      arrange to skip the extra bytes. */
1431    
1432    #ifdef SUPPORT_UTF8
1433      if (utf8) switch(c)      if (utf8) switch(c)
1434        {        {
1435        case OP_CHAR:        case OP_CHAR:
# Line 1031  for (;;) Line 1437  for (;;)
1437        case OP_EXACT:        case OP_EXACT:
1438        case OP_UPTO:        case OP_UPTO:
1439        case OP_MINUPTO:        case OP_MINUPTO:
1440          case OP_POSUPTO:
1441        case OP_STAR:        case OP_STAR:
1442        case OP_MINSTAR:        case OP_MINSTAR:
1443          case OP_POSSTAR:
1444        case OP_PLUS:        case OP_PLUS:
1445        case OP_MINPLUS:        case OP_MINPLUS:
1446          case OP_POSPLUS:
1447        case OP_QUERY:        case OP_QUERY:
1448        case OP_MINQUERY:        case OP_MINQUERY:
1449        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1450        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1451        break;        break;
1452        }        }
1453  #endif  #endif
# Line 1072  Returns:      pointer to the opcode for Line 1474  Returns:      pointer to the opcode for
1474  static const uschar *  static const uschar *
1475  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1476  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1477  for (;;)  for (;;)
1478    {    {
1479    register int c = *code;    register int c = *code;
1480    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1481    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1482    else if (c > OP_BRA)  
1483      {    /* XCLASS is used for classes that cannot be represented just by a bit
1484      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1485      }    the table is zero; the actual length is stored in the compiled code. */
1486    
1487      if (c == OP_XCLASS) code += GET(code, 1);
1488    
1489      /* Otherwise, we can get the item's length from the table, except that for
1490      repeated character types, we have to test for \p and \P, which have an extra
1491      two bytes of parameters. */
1492    
1493    else    else
1494      {      {
1495      code += _pcre_OP_lengths[c];      switch(c)
1496          {
1497          case OP_TYPESTAR:
1498          case OP_TYPEMINSTAR:
1499          case OP_TYPEPLUS:
1500          case OP_TYPEMINPLUS:
1501          case OP_TYPEQUERY:
1502          case OP_TYPEMINQUERY:
1503          case OP_TYPEPOSSTAR:
1504          case OP_TYPEPOSPLUS:
1505          case OP_TYPEPOSQUERY:
1506          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1507          break;
1508    
1509  #ifdef SUPPORT_UTF8        case OP_TYPEPOSUPTO:
1510          case OP_TYPEUPTO:
1511          case OP_TYPEMINUPTO:
1512          case OP_TYPEEXACT:
1513          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1514          break;
1515          }
1516    
1517        /* Add in the fixed length from the table */
1518    
1519        code += _pcre_OP_lengths[c];
1520    
1521      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1522      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
1523      to scan along to skip the extra bytes. All opcodes are less than 128, so we      to arrange to skip the extra bytes. */
     can use relatively efficient code. */  
1524    
1525    #ifdef SUPPORT_UTF8
1526      if (utf8) switch(c)      if (utf8) switch(c)
1527        {        {
1528        case OP_CHAR:        case OP_CHAR:
# Line 1103  for (;;) Line 1530  for (;;)
1530        case OP_EXACT:        case OP_EXACT:
1531        case OP_UPTO:        case OP_UPTO:
1532        case OP_MINUPTO:        case OP_MINUPTO:
1533          case OP_POSUPTO:
1534        case OP_STAR:        case OP_STAR:
1535        case OP_MINSTAR:        case OP_MINSTAR:
1536          case OP_POSSTAR:
1537        case OP_PLUS:        case OP_PLUS:
1538        case OP_MINPLUS:        case OP_MINPLUS:
1539          case OP_POSPLUS:
1540        case OP_QUERY:        case OP_QUERY:
1541        case OP_MINQUERY:        case OP_MINQUERY:
1542        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1543        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1544        break;        break;
1545        }        }
1546  #endif  #endif
# Line 1132  for (;;) Line 1555  for (;;)
1555  *************************************************/  *************************************************/
1556    
1557  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1558  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1559  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1560  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1561  whose current branch will already have been scanned.  backward and negative forward assertions when its final argument is TRUE. If we
1562    hit an unclosed bracket, we return "empty" - this means we've struck an inner
1563    bracket whose current branch will already have been scanned.
1564    
1565  Arguments:  Arguments:
1566    code        points to start of search    code        points to start of search
# Line 1149  static BOOL Line 1574  static BOOL
1574  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1575  {  {
1576  register int c;  register int c;
1577  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1578       code < endcode;       code < endcode;
1579       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1580    {    {
# Line 1157  for (code = first_significant_code(code Line 1582  for (code = first_significant_code(code
1582    
1583    c = *code;    c = *code;
1584    
1585    if (c >= OP_BRA)    /* Skip over forward assertions; the other assertions are skipped by
1586      first_significant_code() with a TRUE final argument. */
1587    
1588      if (c == OP_ASSERT)
1589        {
1590        do code += GET(code, 1); while (*code == OP_ALT);
1591        c = *code;
1592        continue;
1593        }
1594    
1595      /* Groups with zero repeats can of course be empty; skip them. */
1596    
1597      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1598        {
1599        code += _pcre_OP_lengths[c];
1600        do code += GET(code, 1); while (*code == OP_ALT);
1601        c = *code;
1602        continue;
1603        }
1604    
1605      /* For other groups, scan the branches. */
1606    
1607      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1608      {      {
1609      BOOL empty_branch;      BOOL empty_branch;
1610      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1173  for (code = first_significant_code(code Line 1620  for (code = first_significant_code(code
1620        }        }
1621      while (*code == OP_ALT);      while (*code == OP_ALT);
1622      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1623      c = *code;      c = *code;
1624        continue;
1625      }      }
1626    
1627    else switch (c)    /* Handle the other opcodes */
1628    
1629      switch (c)
1630      {      {
1631      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1632        cannot be represented just by a bit map. This includes negated single
1633        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1634        actual length is stored in the compiled code, so we must update "code"
1635        here. */
1636    
1637  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1638      case OP_XCLASS:      case OP_XCLASS:
1639      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1640      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1641  #endif  #endif
1642    
# Line 1227  for (code = first_significant_code(code Line 1680  for (code = first_significant_code(code
1680      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1681      case OP_WORDCHAR:      case OP_WORDCHAR:
1682      case OP_ANY:      case OP_ANY:
1683        case OP_ALLANY:
1684      case OP_ANYBYTE:      case OP_ANYBYTE:
1685      case OP_CHAR:      case OP_CHAR:
1686      case OP_CHARNC:      case OP_CHARNC:
1687      case OP_NOT:      case OP_NOT:
1688      case OP_PLUS:      case OP_PLUS:
1689      case OP_MINPLUS:      case OP_MINPLUS:
1690        case OP_POSPLUS:
1691      case OP_EXACT:      case OP_EXACT:
1692      case OP_NOTPLUS:      case OP_NOTPLUS:
1693      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1694        case OP_NOTPOSPLUS:
1695      case OP_NOTEXACT:      case OP_NOTEXACT:
1696      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1697      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1698        case OP_TYPEPOSPLUS:
1699      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1700      return FALSE;      return FALSE;
1701    
1702        /* These are going to continue, as they may be empty, but we have to
1703        fudge the length for the \p and \P cases. */
1704    
1705        case OP_TYPESTAR:
1706        case OP_TYPEMINSTAR:
1707        case OP_TYPEPOSSTAR:
1708        case OP_TYPEQUERY:
1709        case OP_TYPEMINQUERY:
1710        case OP_TYPEPOSQUERY:
1711        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1712        break;
1713    
1714        /* Same for these */
1715    
1716        case OP_TYPEUPTO:
1717        case OP_TYPEMINUPTO:
1718        case OP_TYPEPOSUPTO:
1719        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1720        break;
1721    
1722      /* End of branch */      /* End of branch */
1723    
1724      case OP_KET:      case OP_KET:
# Line 1250  for (code = first_significant_code(code Line 1727  for (code = first_significant_code(code
1727      case OP_ALT:      case OP_ALT:
1728      return TRUE;      return TRUE;
1729    
1730      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1731      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1732    
1733  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1734      case OP_STAR:      case OP_STAR:
1735      case OP_MINSTAR:      case OP_MINSTAR:
1736        case OP_POSSTAR:
1737      case OP_QUERY:      case OP_QUERY:
1738      case OP_MINQUERY:      case OP_MINQUERY:
1739        case OP_POSQUERY:
1740      case OP_UPTO:      case OP_UPTO:
1741      case OP_MINUPTO:      case OP_MINUPTO:
1742        case OP_POSUPTO:
1743      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1744      break;      break;
1745  #endif  #endif
# Line 1308  return TRUE; Line 1788  return TRUE;
1788  *************************************************/  *************************************************/
1789    
1790  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1791  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1792  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1793  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1794    
1795    Originally, this function only recognized a sequence of letters between the
1796    terminators, but it seems that Perl recognizes any sequence of characters,
1797    though of course unknown POSIX names are subsequently rejected. Perl gives an
1798    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1799    didn't consider this to be a POSIX class. Likewise for [:1234:].
1800    
1801    The problem in trying to be exactly like Perl is in the handling of escapes. We
1802    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1803    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1804    below handles the special case of \], but does not try to do any other escape
1805    processing. This makes it different from Perl for cases such as [:l\ower:]
1806    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1807    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1808    I think.
1809    
1810  Argument:  Arguments:
1811    ptr      pointer to the initial [    ptr      pointer to the initial [
1812    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1813    
1814  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1815  */  */
1816    
1817  static BOOL  static BOOL
1818  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1819  {  {
1820  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1821  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1822  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1823    {    {
1824    *endptr = ptr;    if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1825    return TRUE;      {
1826        if (*ptr == ']') return FALSE;
1827        if (*ptr == terminator && ptr[1] == ']')
1828          {
1829          *endptr = ptr;
1830          return TRUE;
1831          }
1832        }
1833    }    }
1834  return FALSE;  return FALSE;
1835  }  }
# Line 1355  Returns:     a value representing the na Line 1854  Returns:     a value representing the na
1854  static int  static int
1855  check_posix_name(const uschar *ptr, int len)  check_posix_name(const uschar *ptr, int len)
1856  {  {
1857    const char *pn = posix_names;
1858  register int yield = 0;  register int yield = 0;
1859  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
1860    {    {
1861    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
1862      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      strncmp((const char *)ptr, pn, len) == 0) return yield;
1863      pn += posix_name_lengths[yield] + 1;
1864    yield++;    yield++;
1865    }    }
1866  return -1;  return -1;
# Line 1374  return -1; Line 1875  return -1;
1875  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
1876  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
1877  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
1878  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1879  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
1880  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
1881  offsets adjusted. That is the job of this function. Before it is called, the  have their offsets adjusted. That is one of the jobs of this function. Before it
1882  partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
1883    OP_END.
1884    
1885    This function has been extended with the possibility of forward references for
1886    recursions and subroutine calls. It must also check the list of such references
1887    for the group we are dealing with. If it finds that one of the recursions in
1888    the current group is on this list, it adjusts the offset in the list, not the
1889    value in the reference (which is a group number).
1890    
1891  Arguments:  Arguments:
1892    group      points to the start of the group    group      points to the start of the group
1893    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1894    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1895    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1896      save_hwm   the hwm forward reference pointer at the start of the group
1897    
1898  Returns:     nothing  Returns:     nothing
1899  */  */
1900    
1901  static void  static void
1902  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1903      uschar *save_hwm)
1904  {  {
1905  uschar *ptr = group;  uschar *ptr = group;
1906    
1907  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1908    {    {
1909    int offset = GET(ptr, 1);    int offset;
1910    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
   ptr += 1 + LINK_SIZE;  
   }  
 }  
1911    
1912      /* See if this recursion is on the forward reference list. If so, adjust the
1913      reference. */
1914    
1915      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1916        {
1917        offset = GET(hc, 0);
1918        if (cd->start_code + offset == ptr + 1)
1919          {
1920          PUT(hc, 0, offset + adjust);
1921          break;
1922          }
1923        }
1924    
1925  /*************************************************    /* Otherwise, adjust the recursion offset if it's after the start of this
1926      group. */
1927    
1928      if (hc >= cd->hwm)
1929        {
1930        offset = GET(ptr, 1);
1931        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1932        }
1933    
1934      ptr += 1 + LINK_SIZE;
1935      }
1936    }
1937    
1938    
1939    
1940    /*************************************************
1941  *        Insert an automatic callout point       *  *        Insert an automatic callout point       *
1942  *************************************************/  *************************************************/
1943    
# Line 1475  Yield:        TRUE when range returned; Line 2009  Yield:        TRUE when range returned;
2009  */  */
2010    
2011  static BOOL  static BOOL
2012  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2013      unsigned int *odptr)
2014  {  {
2015  int c, chartype, othercase, next;  unsigned int c, othercase, next;
2016    
2017  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2018    {    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
2019    
2020  if (c > d) return FALSE;  if (c > d) return FALSE;
2021    
# Line 1492  next = othercase + 1; Line 2024  next = othercase + 1;
2024    
2025  for (++c; c <= d; c++)  for (++c; c <= d; c++)
2026    {    {
2027    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (UCD_OTHERCASE(c) != next) break;
         othercase != next)  
     break;  
2028    next++;    next++;
2029    }    }
2030    
# Line 1506  return TRUE; Line 2036  return TRUE;
2036  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2037    
2038    
2039    
2040    /*************************************************
2041    *     Check if auto-possessifying is possible    *
2042    *************************************************/
2043    
2044    /* This function is called for unlimited repeats of certain items, to see
2045    whether the next thing could possibly match the repeated item. If not, it makes
2046    sense to automatically possessify the repeated item.
2047    
2048    Arguments:
2049      op_code       the repeated op code
2050      item          data for this item, depends on the opcode
2051      utf8          TRUE in UTF-8 mode
2052      utf8_char     used for utf8 character bytes, NULL if not relevant
2053      ptr           next character in pattern
2054      options       options bits
2055      cd            contains pointers to tables etc.
2056    
2057    Returns:        TRUE if possessifying is wanted
2058    */
2059    
2060    static BOOL
2061    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2062      const uschar *ptr, int options, compile_data *cd)
2063    {
2064    int next;
2065    
2066    /* Skip whitespace and comments in extended mode */
2067    
2068    if ((options & PCRE_EXTENDED) != 0)
2069      {
2070      for (;;)
2071        {
2072        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2073        if (*ptr == '#')
2074          {
2075          while (*(++ptr) != 0)
2076            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2077          }
2078        else break;
2079        }
2080      }
2081    
2082    /* If the next item is one that we can handle, get its value. A non-negative
2083    value is a character, a negative value is an escape value. */
2084    
2085    if (*ptr == '\\')
2086      {
2087      int temperrorcode = 0;
2088      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2089      if (temperrorcode != 0) return FALSE;
2090      ptr++;    /* Point after the escape sequence */
2091      }
2092    
2093    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2094      {
2095    #ifdef SUPPORT_UTF8
2096      if (utf8) { GETCHARINC(next, ptr); } else
2097    #endif
2098      next = *ptr++;
2099      }
2100    
2101    else return FALSE;
2102    
2103    /* Skip whitespace and comments in extended mode */
2104    
2105    if ((options & PCRE_EXTENDED) != 0)
2106      {
2107      for (;;)
2108        {
2109        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2110        if (*ptr == '#')
2111          {
2112          while (*(++ptr) != 0)
2113            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2114          }
2115        else break;
2116        }
2117      }
2118    
2119    /* If the next thing is itself optional, we have to give up. */
2120    
2121    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2122      return FALSE;
2123    
2124    /* Now compare the next item with the previous opcode. If the previous is a
2125    positive single character match, "item" either contains the character or, if
2126    "item" is greater than 127 in utf8 mode, the character's bytes are in
2127    utf8_char. */
2128    
2129    
2130    /* Handle cases when the next item is a character. */
2131    
2132    if (next >= 0) switch(op_code)
2133      {
2134      case OP_CHAR:
2135    #ifdef SUPPORT_UTF8
2136      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2137    #endif
2138      return item != next;
2139    
2140      /* For CHARNC (caseless character) we must check the other case. If we have
2141      Unicode property support, we can use it to test the other case of
2142      high-valued characters. */
2143    
2144      case OP_CHARNC:
2145    #ifdef SUPPORT_UTF8
2146      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2147    #endif
2148      if (item == next) return FALSE;
2149    #ifdef SUPPORT_UTF8
2150      if (utf8)
2151        {
2152        unsigned int othercase;
2153        if (next < 128) othercase = cd->fcc[next]; else
2154    #ifdef SUPPORT_UCP
2155        othercase = UCD_OTHERCASE((unsigned int)next);
2156    #else
2157        othercase = NOTACHAR;
2158    #endif
2159        return (unsigned int)item != othercase;
2160        }
2161      else
2162    #endif  /* SUPPORT_UTF8 */
2163      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2164    
2165      /* For OP_NOT, "item" must be a single-byte character. */
2166    
2167      case OP_NOT:
2168      if (item == next) return TRUE;
2169      if ((options & PCRE_CASELESS) == 0) return FALSE;
2170    #ifdef SUPPORT_UTF8
2171      if (utf8)
2172        {
2173        unsigned int othercase;
2174        if (next < 128) othercase = cd->fcc[next]; else
2175    #ifdef SUPPORT_UCP
2176        othercase = UCD_OTHERCASE(next);
2177    #else
2178        othercase = NOTACHAR;
2179    #endif
2180        return (unsigned int)item == othercase;
2181        }
2182      else
2183    #endif  /* SUPPORT_UTF8 */
2184      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2185    
2186      case OP_DIGIT:
2187      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2188    
2189      case OP_NOT_DIGIT:
2190      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2191    
2192      case OP_WHITESPACE:
2193      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2194    
2195      case OP_NOT_WHITESPACE:
2196      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2197    
2198      case OP_WORDCHAR:
2199      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2200    
2201      case OP_NOT_WORDCHAR:
2202      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2203    
2204      case OP_HSPACE:
2205      case OP_NOT_HSPACE:
2206      switch(next)
2207        {
2208        case 0x09:
2209        case 0x20:
2210        case 0xa0:
2211        case 0x1680:
2212        case 0x180e:
2213        case 0x2000:
2214        case 0x2001:
2215        case 0x2002:
2216        case 0x2003:
2217        case 0x2004:
2218        case 0x2005:
2219        case 0x2006:
2220        case 0x2007:
2221        case 0x2008:
2222        case 0x2009:
2223        case 0x200A:
2224        case 0x202f:
2225        case 0x205f:
2226        case 0x3000:
2227        return op_code != OP_HSPACE;
2228        default:
2229        return op_code == OP_HSPACE;
2230        }
2231    
2232      case OP_VSPACE:
2233      case OP_NOT_VSPACE:
2234      switch(next)
2235        {
2236        case 0x0a:
2237        case 0x0b:
2238        case 0x0c:
2239        case 0x0d:
2240        case 0x85:
2241        case 0x2028:
2242        case 0x2029:
2243        return op_code != OP_VSPACE;
2244        default:
2245        return op_code == OP_VSPACE;
2246        }
2247    
2248      default:
2249      return FALSE;
2250      }
2251    
2252    
2253    /* Handle the case when the next item is \d, \s, etc. */
2254    
2255    switch(op_code)
2256      {
2257      case OP_CHAR:
2258      case OP_CHARNC:
2259    #ifdef SUPPORT_UTF8
2260      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2261    #endif
2262      switch(-next)
2263        {
2264        case ESC_d:
2265        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2266    
2267        case ESC_D:
2268        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2269    
2270        case ESC_s:
2271        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2272    
2273        case ESC_S:
2274        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2275    
2276        case ESC_w:
2277        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2278    
2279        case ESC_W:
2280        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2281    
2282        case ESC_h:
2283        case ESC_H:
2284        switch(item)
2285          {
2286          case 0x09:
2287          case 0x20:
2288          case 0xa0:
2289          case 0x1680:
2290          case 0x180e:
2291          case 0x2000:
2292          case 0x2001:
2293          case 0x2002:
2294          case 0x2003:
2295          case 0x2004:
2296          case 0x2005:
2297          case 0x2006:
2298          case 0x2007:
2299          case 0x2008:
2300          case 0x2009:
2301          case 0x200A:
2302          case 0x202f:
2303          case 0x205f:
2304          case 0x3000:
2305          return -next != ESC_h;
2306          default:
2307          return -next == ESC_h;
2308          }
2309    
2310        case ESC_v:
2311        case ESC_V:
2312        switch(item)
2313          {
2314          case 0x0a:
2315          case 0x0b:
2316          case 0x0c:
2317          case 0x0d:
2318          case 0x85:
2319          case 0x2028:
2320          case 0x2029:
2321          return -next != ESC_v;
2322          default:
2323          return -next == ESC_v;
2324          }
2325    
2326        default:
2327        return FALSE;
2328        }
2329    
2330      case OP_DIGIT:
2331      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2332             next == -ESC_h || next == -ESC_v;
2333    
2334      case OP_NOT_DIGIT:
2335      return next == -ESC_d;
2336    
2337      case OP_WHITESPACE:
2338      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2339    
2340      case OP_NOT_WHITESPACE:
2341      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2342    
2343      case OP_HSPACE:
2344      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2345    
2346      case OP_NOT_HSPACE:
2347      return next == -ESC_h;
2348    
2349      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2350      case OP_VSPACE:
2351      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2352    
2353      case OP_NOT_VSPACE:
2354      return next == -ESC_v;
2355    
2356      case OP_WORDCHAR:
2357      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2358    
2359      case OP_NOT_WORDCHAR:
2360      return next == -ESC_w || next == -ESC_d;
2361    
2362      default:
2363      return FALSE;
2364      }
2365    
2366    /* Control does not reach here */
2367    }
2368    
2369    
2370    
2371  /*************************************************  /*************************************************
2372  *           Compile one branch                   *  *           Compile one branch                   *
2373  *************************************************/  *************************************************/
2374    
2375  /* Scan the pattern, compiling it into the code vector. If the options are  /* Scan the pattern, compiling it into a vector. If the options are
2376  changed during the branch, the pointer is used to change the external options  changed during the branch, the pointer is used to change the external options
2377  bits.  bits. This function is used during the pre-compile phase when we are trying
2378    to find out the amount of memory needed, as well as during the real compile
2379    phase. The value of lengthptr distinguishes the two phases.
2380    
2381  Arguments:  Arguments:
2382    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
   brackets       points to number of extracting brackets used  
2383    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2384    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2385    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
# Line 1524  Arguments: Line 2387  Arguments:
2387    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2388    bcptr          points to current branch chain    bcptr          points to current branch chain
2389    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2390      lengthptr      NULL during the real compile phase
2391                     points to length accumulator during pre-compile phase
2392    
2393  Returns:         TRUE on success  Returns:         TRUE on success
2394                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2395  */  */
2396    
2397  static BOOL  static BOOL
2398  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2399    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2400    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2401  {  {
2402  int repeat_type, op_type;  int repeat_type, op_type;
2403  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1541  int greedy_default, greedy_non_default; Line 2406  int greedy_default, greedy_non_default;
2406  int firstbyte, reqbyte;  int firstbyte, reqbyte;
2407  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
2408  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
 int condcount = 0;  
2409  int options = *optionsptr;  int options = *optionsptr;
2410  int after_manual_callout = 0;  int after_manual_callout = 0;
2411    int length_prevgroup = 0;
2412  register int c;  register int c;
2413  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2414    uschar *last_code = code;
2415    uschar *orig_code = code;
2416  uschar *tempcode;  uschar *tempcode;
2417  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2418  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1553  const uschar *ptr = *ptrptr; Line 2420  const uschar *ptr = *ptrptr;
2420  const uschar *tempptr;  const uschar *tempptr;
2421  uschar *previous = NULL;  uschar *previous = NULL;
2422  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2423    uschar *save_hwm = NULL;
2424  uschar classbits[32];  uschar classbits[32];
2425    
2426  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2427  BOOL class_utf8;  BOOL class_utf8;
2428  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
2429  uschar *class_utf8data;  uschar *class_utf8data;
2430    uschar *class_utf8data_base;
2431  uschar utf8_char[6];  uschar utf8_char[6];
2432  #else  #else
2433  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2434    uschar *utf8_char = NULL;
2435    #endif
2436    
2437    #ifdef DEBUG
2438    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2439  #endif  #endif
2440    
2441  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1593  req_caseopt = ((options & PCRE_CASELESS) Line 2467  req_caseopt = ((options & PCRE_CASELESS)
2467  for (;; ptr++)  for (;; ptr++)
2468    {    {
2469    BOOL negate_class;    BOOL negate_class;
2470      BOOL should_flip_negation;
2471    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2472    BOOL is_quantifier;    BOOL is_quantifier;
2473      BOOL is_recurse;
2474      BOOL reset_bracount;
2475    int class_charcount;    int class_charcount;
2476    int class_lastchar;    int class_lastchar;
2477    int newoptions;    int newoptions;
2478    int recno;    int recno;
2479      int refsign;
2480    int skipbytes;    int skipbytes;
2481    int subreqbyte;    int subreqbyte;
2482    int subfirstbyte;    int subfirstbyte;
2483      int terminator;
2484    int mclength;    int mclength;
2485    uschar mcbuffer[8];    uschar mcbuffer[8];
2486    
2487    /* Next byte in the pattern */    /* Get next byte in the pattern */
2488    
2489    c = *ptr;    c = *ptr;
2490    
2491      /* If we are in the pre-compile phase, accumulate the length used for the
2492      previous cycle of this loop. */
2493    
2494      if (lengthptr != NULL)
2495        {
2496    #ifdef DEBUG
2497        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2498    #endif
2499        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2500          {
2501          *errorcodeptr = ERR52;
2502          goto FAILED;
2503          }
2504    
2505        /* There is at least one situation where code goes backwards: this is the
2506        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2507        the class is simply eliminated. However, it is created first, so we have to
2508        allow memory for it. Therefore, don't ever reduce the length at this point.
2509        */
2510    
2511        if (code < last_code) code = last_code;
2512    
2513        /* Paranoid check for integer overflow */
2514    
2515        if (OFLOW_MAX - *lengthptr < code - last_code)
2516          {
2517          *errorcodeptr = ERR20;
2518          goto FAILED;
2519          }
2520    
2521        *lengthptr += code - last_code;
2522        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2523    
2524        /* If "previous" is set and it is not at the start of the work space, move
2525        it back to there, in order to avoid filling up the work space. Otherwise,
2526        if "previous" is NULL, reset the current code pointer to the start. */
2527    
2528        if (previous != NULL)
2529          {
2530          if (previous > orig_code)
2531            {
2532            memmove(orig_code, previous, code - previous);
2533            code -= previous - orig_code;
2534            previous = orig_code;
2535            }
2536          }
2537        else code = orig_code;
2538    
2539        /* Remember where this code item starts so we can pick up the length
2540        next time round. */
2541    
2542        last_code = code;
2543        }
2544    
2545      /* In the real compile phase, just check the workspace used by the forward
2546      reference list. */
2547    
2548      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2549        {
2550        *errorcodeptr = ERR52;
2551        goto FAILED;
2552        }
2553    
2554    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2555    
2556    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1623  for (;; ptr++) Line 2565  for (;; ptr++)
2565        {        {
2566        if (previous_callout != NULL)        if (previous_callout != NULL)
2567          {          {
2568          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2569              complete_callout(previous_callout, ptr, cd);
2570          previous_callout = NULL;          previous_callout = NULL;
2571          }          }
2572        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1644  for (;; ptr++) Line 2587  for (;; ptr++)
2587    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2588         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2589      {      {
2590      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2591          complete_callout(previous_callout, ptr, cd);
2592      previous_callout = NULL;      previous_callout = NULL;
2593      }      }
2594    
# Line 1655  for (;; ptr++) Line 2599  for (;; ptr++)
2599      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2600      if (c == '#')      if (c == '#')
2601        {        {
2602        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2603        on the Macintosh. */          {
2604        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2605        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2606          if (*ptr != 0) continue;
2607    
2608          /* Else fall through to handle end of string */
2609          c = 0;
2610        }        }
2611      }      }
2612    
# Line 1672  for (;; ptr++) Line 2620  for (;; ptr++)
2620    
2621    switch(c)    switch(c)
2622      {      {
2623      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2624        case 0:                        /* The branch terminates at string end */
2625      case 0:      case '|':                      /* or | or ) */
     case '|':  
2626      case ')':      case ')':
2627      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2628      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2629      *codeptr = code;      *codeptr = code;
2630      *ptrptr = ptr;      *ptrptr = ptr;
2631        if (lengthptr != NULL)
2632          {
2633          if (OFLOW_MAX - *lengthptr < code - last_code)
2634            {
2635            *errorcodeptr = ERR20;
2636            goto FAILED;
2637            }
2638          *lengthptr += code - last_code;   /* To include callout length */
2639          DPRINTF((">> end branch\n"));
2640          }
2641      return TRUE;      return TRUE;
2642    
2643    
2644        /* ===================================================================*/
2645      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2646      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2647    
# Line 1708  for (;; ptr++) Line 2667  for (;; ptr++)
2667      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
2668      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
2669      previous = code;      previous = code;
2670      *code++ = OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2671      break;      break;
2672    
2673      /* Character classes. If the included characters are all < 255 in value, we  
2674      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2675      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2676      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2677      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2678        map as usual, then invert it at the end. However, we use a different opcode
2679        so that data characters > 255 can be handled correctly.
2680    
2681      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2682      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2683      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2684      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
2685      */  
2686        In JavaScript compatibility mode, an isolated ']' causes an error. In
2687        default (Perl) mode, it is treated as a data character. */
2688    
2689        case ']':
2690        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2691          {
2692          *errorcodeptr = ERR64;
2693          goto FAILED;
2694          }
2695        goto NORMAL_CHAR;
2696    
2697      case '[':      case '[':
2698      previous = code;      previous = code;
# Line 1730  for (;; ptr++) Line 2701  for (;; ptr++)
2701      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2702    
2703      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2704          check_posix_syntax(ptr, &tempptr, cd))          check_posix_syntax(ptr, &tempptr))
2705        {        {
2706        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2707        goto FAILED;        goto FAILED;
2708        }        }
2709    
2710      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2711        if the first few characters (either before or after ^) are \Q\E or \E we
2712        skip them too. This makes for compatibility with Perl. */
2713    
2714      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2715        for (;;)
2716        {        {
       negate_class = TRUE;  
2717        c = *(++ptr);        c = *(++ptr);
2718          if (c == '\\')
2719            {
2720            if (ptr[1] == 'E') ptr++;
2721              else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2722                else break;
2723            }
2724          else if (!negate_class && c == '^')
2725            negate_class = TRUE;
2726          else break;
2727        }        }
2728      else  
2729        /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2730        an initial ']' is taken as a data character -- the code below handles
2731        that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2732        [^] must match any character, so generate OP_ALLANY. */
2733    
2734        if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2735        {        {
2736        negate_class = FALSE;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
2737          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2738          zerofirstbyte = firstbyte;
2739          break;
2740        }        }
2741    
2742        /* If a class contains a negative special such as \S, we need to flip the
2743        negation flag at the end, so that support for characters > 255 works
2744        correctly (they are all included in the class). */
2745    
2746        should_flip_negation = FALSE;
2747    
2748      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2749      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2750      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2751    
2752      class_charcount = 0;      class_charcount = 0;
2753      class_lastchar = -1;      class_lastchar = -1;
2754    
2755        /* Initialize the 32-char bit map to all zeros. We build the map in a
2756        temporary bit of memory, in case the class contains only 1 character (less
2757        than 256), because in that case the compiled code doesn't use the bit map.
2758        */
2759    
2760        memset(classbits, 0, 32 * sizeof(uschar));
2761    
2762  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2763      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2764      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2765        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2766  #endif  #endif
2767    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2768      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2769      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2770      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2771    
2772      do      if (c != 0) do
2773        {        {
2774          const uschar *oldptr;
2775    
2776  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2777        if (utf8 && c > 127)        if (utf8 && c > 127)
2778          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2779          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2780          }          }
2781    
2782          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2783          data and reset the pointer. This is so that very large classes that
2784          contain a zillion UTF-8 characters no longer overwrite the work space
2785          (which is on the stack). */
2786    
2787          if (lengthptr != NULL)
2788            {
2789            *lengthptr += class_utf8data - class_utf8data_base;
2790            class_utf8data = class_utf8data_base;
2791            }
2792    
2793  #endif  #endif
2794    
2795        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
2796    
2797        if (inescq)        if (inescq)
2798          {          {
2799          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2800            {            {
2801            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2802            ptr++;            ptr++;                            /* Skip the 'E' */
2803            continue;            continue;                         /* Carry on with next */
2804            }            }
2805          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2806          }          }
2807    
2808        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1803  for (;; ptr++) Line 2813  for (;; ptr++)
2813    
2814        if (c == '[' &&        if (c == '[' &&
2815            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2816            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr))
2817          {          {
2818          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2819          int posix_class, i;          int posix_class, taboffset, tabopt;
2820          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2821            uschar pbits[32];
2822    
2823          if (ptr[1] != ':')          if (ptr[1] != ':')
2824            {            {
# Line 1819  for (;; ptr++) Line 2830  for (;; ptr++)
2830          if (*ptr == '^')          if (*ptr == '^')
2831            {            {
2832            local_negate = TRUE;            local_negate = TRUE;
2833              should_flip_negation = TRUE;  /* Note negative special */
2834            ptr++;            ptr++;
2835            }            }
2836    
# Line 1836  for (;; ptr++) Line 2848  for (;; ptr++)
2848          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2849            posix_class = 0;            posix_class = 0;
2850    
2851          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2852          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2853          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2854          white space chars afterwards. */          result into the bit map that is being built. */
2855    
2856          posix_class *= 3;          posix_class *= 3;
2857          for (i = 0; i < 3; i++)  
2858            /* Copy in the first table (always present) */
2859    
2860            memcpy(pbits, cbits + posix_class_maps[posix_class],
2861              32 * sizeof(uschar));
2862    
2863            /* If there is a second table, add or remove it as required. */
2864    
2865            taboffset = posix_class_maps[posix_class + 1];
2866            tabopt = posix_class_maps[posix_class + 2];
2867    
2868            if (taboffset >= 0)
2869            {            {
2870            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2871            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2872            else            else
2873              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2874            }            }
2875    
2876            /* Now see if we need to remove any special characters. An option
2877            value of 1 removes vertical space and 2 removes underscore. */
2878    
2879            if (tabopt < 0) tabopt = -tabopt;
2880            if (tabopt == 1) pbits[1] &= ~0x3c;
2881              else if (tabopt == 2) pbits[11] &= 0x7f;
2882    
2883            /* Add the POSIX table or its complement into the main table that is
2884            being built and we are done. */
2885    
2886            if (local_negate)
2887              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2888            else
2889              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2890    
2891          ptr = tempptr + 1;          ptr = tempptr + 1;
2892          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2893          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2894          }          }
2895    
2896        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2897        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2898        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2899        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2900        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2901        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2902    
2903        if (c == '\\')        if (c == '\\')
2904          {          {
2905          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2906            if (*errorcodeptr != 0) goto FAILED;
2907    
2908          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2909          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2910            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2911          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2912            {            {
2913            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1890  for (;; ptr++) Line 2917  for (;; ptr++)
2917            else inescq = TRUE;            else inescq = TRUE;
2918            continue;            continue;
2919            }            }
2920            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2921    
2922          if (c < 0)          if (c < 0)
2923            {            {
2924            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2925            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2926            switch (-c)  
2927              /* Save time by not doing this in the pre-compile phase. */
2928    
2929              if (lengthptr == NULL) switch (-c)
2930              {              {
2931              case ESC_d:              case ESC_d:
2932              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2933              continue;              continue;
2934    
2935              case ESC_D:              case ESC_D:
2936                should_flip_negation = TRUE;
2937              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2938              continue;              continue;
2939    
# Line 1910  for (;; ptr++) Line 2942  for (;; ptr++)
2942              continue;              continue;
2943    
2944              case ESC_W:              case ESC_W:
2945                should_flip_negation = TRUE;
2946              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2947              continue;              continue;
2948    
# Line 1919  for (;; ptr++) Line 2952  for (;; ptr++)
2952              continue;              continue;
2953    
2954              case ESC_S:              case ESC_S:
2955                should_flip_negation = TRUE;
2956              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2957              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2958              continue;              continue;
2959    
2960  #ifdef SUPPORT_UCP              default:    /* Not recognized; fall through */
2961              case ESC_p:              break;      /* Need "default" setting to stop compiler warning. */
2962              case ESC_P:              }
2963    
2964              /* In the pre-compile phase, just do the recognition. */
2965    
2966              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2967                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2968    
2969              /* We need to deal with \H, \h, \V, and \v in both phases because
2970              they use extra memory. */
2971    
2972              if (-c == ESC_h)
2973                {
2974                SETBIT(classbits, 0x09); /* HT */
2975                SETBIT(classbits, 0x20); /* SPACE */
2976                SETBIT(classbits, 0xa0); /* NBSP */
2977    #ifdef SUPPORT_UTF8
2978                if (utf8)
2979                {                {
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
2980                class_utf8 = TRUE;                class_utf8 = TRUE;
2981                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2982                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2983                *class_utf8data++ = property;                *class_utf8data++ = XCL_SINGLE;
2984                class_charcount -= 2;   /* Not a < 256 character */                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2985                  *class_utf8data++ = XCL_RANGE;
2986                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2987                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2988                  *class_utf8data++ = XCL_SINGLE;
2989                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2990                  *class_utf8data++ = XCL_SINGLE;
2991                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2992                  *class_utf8data++ = XCL_SINGLE;
2993                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2994                }                }
             continue;  
2995  #endif  #endif
2996                continue;
2997                }
2998    
2999              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
3000              strict mode. By default, for compatibility with Perl, they are              {
3001              treated as literals. */              for (c = 0; c < 32; c++)
3002                  {
3003                  int x = 0xff;
3004                  switch (c)
3005                    {
3006                    case 0x09/8: x ^= 1 << (0x09%8); break;
3007                    case 0x20/8: x ^= 1 << (0x20%8); break;
3008                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
3009                    default: break;
3010                    }
3011                  classbits[c] |= x;
3012                  }
3013    
3014              default:  #ifdef SUPPORT_UTF8
3015              if ((options & PCRE_EXTRA) != 0)              if (utf8)
3016                {                {
3017                *errorcodeptr = ERR7;                class_utf8 = TRUE;
3018                goto FAILED;                *class_utf8data++ = XCL_RANGE;
3019                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3020                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3021                  *class_utf8data++ = XCL_RANGE;
3022                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3023                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3024                  *class_utf8data++ = XCL_RANGE;
3025                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3026                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3027                  *class_utf8data++ = XCL_RANGE;
3028                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3029                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3030                  *class_utf8data++ = XCL_RANGE;
3031                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3032                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3033                  *class_utf8data++ = XCL_RANGE;
3034                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3035                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3036                  *class_utf8data++ = XCL_RANGE;
3037                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3038                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3039                }                }
3040              c = *ptr;              /* The final character */  #endif
3041              class_charcount -= 2;  /* Undo the default count from above */              continue;
3042              }              }
           }  
   
         /* Fall through if we have a single character (c >= 0). This may be  
         > 256 in UTF-8 mode. */  
3043    
3044          }   /* End of backslash handling */            if (-c == ESC_v)
3045                {
3046                SETBIT(classbits, 0x0a); /* LF */
3047                SETBIT(classbits, 0x0b); /* VT */
3048                SETBIT(classbits, 0x0c); /* FF */
3049                SETBIT(classbits, 0x0d); /* CR */
3050                SETBIT(classbits, 0x85); /* NEL */
3051    #ifdef SUPPORT_UTF8
3052                if (utf8)
3053                  {
3054                  class_utf8 = TRUE;
3055                  *class_utf8data++ = XCL_RANGE;
3056                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3057                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3058                  }
3059    #endif
3060                continue;
3061                }
3062    
3063              if (-c == ESC_V)
3064                {
3065                for (c = 0; c < 32; c++)
3066                  {
3067                  int x = 0xff;
3068                  switch (c)
3069                    {
3070                    case 0x0a/8: x ^= 1 << (0x0a%8);
3071                                 x ^= 1 << (0x0b%8);
3072                                 x ^= 1 << (0x0c%8);
3073                                 x ^= 1 << (0x0d%8);
3074                                 break;
3075                    case 0x85/8: x ^= 1 << (0x85%8); break;
3076                    default: break;
3077                    }
3078                  classbits[c] |= x;
3079                  }
3080    
3081    #ifdef SUPPORT_UTF8
3082                if (utf8)
3083                  {
3084                  class_utf8 = TRUE;
3085                  *class_utf8data++ = XCL_RANGE;
3086                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3087                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3088                  *class_utf8data++ = XCL_RANGE;
3089                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3090                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3091                  }
3092    #endif
3093                continue;
3094                }
3095    
3096              /* We need to deal with \P and \p in both phases. */
3097    
3098    #ifdef SUPPORT_UCP
3099              if (-c == ESC_p || -c == ESC_P)
3100                {
3101                BOOL negated;
3102                int pdata;
3103                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3104                if (ptype < 0) goto FAILED;
3105                class_utf8 = TRUE;
3106                *class_utf8data++ = ((-c == ESC_p) != negated)?
3107                  XCL_PROP : XCL_NOTPROP;
3108                *class_utf8data++ = ptype;
3109                *class_utf8data++ = pdata;
3110                class_charcount -= 2;   /* Not a < 256 character */
3111                continue;
3112                }
3113    #endif
3114              /* Unrecognized escapes are faulted if PCRE is running in its
3115              strict mode. By default, for compatibility with Perl, they are
3116              treated as literals. */
3117    
3118              if ((options & PCRE_EXTRA) != 0)
3119                {
3120                *errorcodeptr = ERR7;
3121                goto FAILED;
3122                }
3123    
3124              class_charcount -= 2;  /* Undo the default count from above */
3125              c = *ptr;              /* Get the final character and fall through */
3126              }
3127    
3128            /* Fall through if we have a single character (c >= 0). This may be
3129            greater than 256 in UTF-8 mode. */
3130    
3131            }   /* End of backslash handling */
3132    
3133        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
3134        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
3135        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
3136          entirely. The code for handling \Q and \E is messy. */
3137    
3138          CHECK_RANGE:
3139          while (ptr[1] == '\\' && ptr[2] == 'E')
3140            {
3141            inescq = FALSE;
3142            ptr += 2;
3143            }
3144    
3145          oldptr = ptr;
3146    
3147          /* Remember \r or \n */
3148    
3149          if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3150    
3151          /* Check for range */
3152    
3153        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
3154          {          {
3155          int d;          int d;
3156          ptr += 2;          ptr += 2;
3157            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3158    
3159            /* If we hit \Q (not followed by \E) at this point, go into escaped
3160            mode. */
3161    
3162            while (*ptr == '\\' && ptr[1] == 'Q')
3163              {
3164              ptr += 2;
3165              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3166              inescq = TRUE;
3167              break;
3168              }
3169    
3170            if (*ptr == 0 || (!inescq && *ptr == ']'))
3171              {
3172              ptr = oldptr;
3173              goto LONE_SINGLE_CHARACTER;
3174              }
3175    
3176  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3177          if (utf8)          if (utf8)
# Line 1981  for (;; ptr++) Line 3186  for (;; ptr++)
3186          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3187          in such circumstances. */          in such circumstances. */
3188    
3189          if (d == '\\')          if (!inescq && d == '\\')
3190            {            {
3191            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3192            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
3193    
3194            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
3195            was literal */            special means the '-' was literal */
3196    
3197            if (d < 0)            if (d < 0)
3198              {              {
3199              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
3200              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
3201                else if (d == -ESC_R) d = 'R'; else
3202                {                {
3203                ptr = oldptr - 2;                ptr = oldptr;
3204                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3205                }                }
3206              }              }
3207            }            }
3208    
3209          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
3210          the pre-pass. Optimize one-character ranges */          one-character ranges */
3211    
3212            if (d < c)
3213              {
3214              *errorcodeptr = ERR8;
3215              goto FAILED;
3216              }
3217    
3218          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3219    
3220            /* Remember \r or \n */
3221    
3222            if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3223    
3224          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3225          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
3226          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 2022  for (;; ptr++) Line 3238  for (;; ptr++)
3238  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3239            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
3240              {              {
3241              int occ, ocd;              unsigned int occ, ocd;
3242              int cc = c;              unsigned int cc = c;
3243              int origd = d;              unsigned int origd = d;
3244              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
3245                {                {
3246                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
3247                      ocd <= (unsigned int)d)
3248                    continue;                          /* Skip embedded ranges */
3249    
3250                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
3251                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3252                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
3253                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
3254                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
3255                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
3256                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3257                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3258                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3259                  d = ocd;                  d = ocd;
3260                  continue;                  continue;
# Line 2082  for (;; ptr++) Line 3302  for (;; ptr++)
3302          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3303          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3304    
3305          for (; c <= d; c++)          class_charcount += d - c + 1;
3306            class_lastchar = d;
3307    
3308            /* We can save a bit of time by skipping this in the pre-compile. */
3309    
3310            if (lengthptr == NULL) for (; c <= d; c++)
3311            {            {
3312            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3313            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2090  for (;; ptr++) Line 3315  for (;; ptr++)
3315              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3316              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3317              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3318            }            }
3319    
3320          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2115  for (;; ptr++) Line 3338  for (;; ptr++)
3338  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3339          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3340            {            {
3341            int chartype;            unsigned int othercase;
3342            int othercase;            if ((othercase = UCD_OTHERCASE(c)) != c)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3343              {              {
3344              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3345              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2143  for (;; ptr++) Line 3364  for (;; ptr++)
3364          }          }
3365        }        }
3366    
3367      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3368      loop. This "while" is the end of the "do" above. */  
3369        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3370    
3371        if (c == 0)                          /* Missing terminating ']' */
3372          {
3373          *errorcodeptr = ERR6;
3374          goto FAILED;
3375          }
3376    
3377    
3378    /* This code has been disabled because it would mean that \s counts as
3379    an explicit \r or \n reference, and that's not really what is wanted. Now
3380    we set the flag only if there is a literal "\r" or "\n" in the class. */
3381    
3382    #if 0
3383        /* Remember whether \r or \n are in this class */
3384    
3385        if (negate_class)
3386          {
3387          if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3388          }
3389        else
3390          {
3391          if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3392          }
3393    #endif
3394    
     while ((c = *(++ptr)) != ']' || inescq);  
3395    
3396      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3397      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
3398      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
3399      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
3400      single-bytes only. This is an historical hangover. Maybe one day we can  
3401      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
3402        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3403        operate on single-bytes only. This is an historical hangover. Maybe one day
3404        we can tidy these opcodes to handle multi-byte characters.
3405    
3406      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
3407      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
# Line 2163  for (;; ptr++) Line 3411  for (;; ptr++)
3411      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3412    
3413  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3414      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
3415            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
3416  #else  #else
3417      if (class_charcount == 1)      if (class_charcount == 1)
3418  #endif  #endif
# Line 2209  for (;; ptr++) Line 3455  for (;; ptr++)
3455      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3456    
3457      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3458      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode, unless there was a negated special
3459      we can omit the bitmap. */      such as \S in the class, because in that case all characters > 255 are in
3460        the class, so any that were explicitly given as well can be ignored. If
3461        (when there are explicit characters > 255 that must be listed) there are no
3462        characters < 256, we can omit the bitmap in the actual compiled code. */
3463    
3464  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3465      if (class_utf8)      if (class_utf8 && !should_flip_negation)
3466        {        {
3467        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3468        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
3469        code += LINK_SIZE;        code += LINK_SIZE;
3470        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3471    
3472        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3473        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3474    
3475        if (class_charcount > 0)        if (class_charcount > 0)
3476          {          {
3477          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3478            memmove(code + 32, code, class_utf8data - code);
3479          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3480          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3481          }          }
3482          else code = class_utf8data;
3483    
3484        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3485    
# Line 2246  for (;; ptr++) Line 3488  for (;; ptr++)
3488        }        }
3489  #endif  #endif
3490    
3491      /* If there are no characters > 255, negate the 32-byte map if necessary,      /* If there are no characters > 255, set the opcode to OP_CLASS or
3492      and copy it into the code vector. If this is the first thing in the branch,      OP_NCLASS, depending on whether the whole class was negated and whether
3493      there can be no first char setting, whatever the repeat count. Any reqbyte      there were negative specials such as \S in the class. Then copy the 32-byte
3494      setting must remain unchanged after any kind of repeat. */      map into the code vector, negating it if necessary. */
3495    
3496        *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3497      if (negate_class)      if (negate_class)
3498        {        {
3499        *code++ = OP_NCLASS;        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3500        for (c = 0; c < 32; c++) code[c] = ~classbits[c];          for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3501        }        }
3502      else      else
3503        {        {
       *code++ = OP_CLASS;  
3504        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
3505        }        }
3506      code += 32;      code += 32;
3507      break;      break;
3508    
3509    
3510        /* ===================================================================*/
3511      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3512      has been tested above. */      has been tested above. */
3513    
# Line 2331  for (;; ptr++) Line 3575  for (;; ptr++)
3575        }        }
3576      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3577    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3578      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3579      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3580      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2378  for (;; ptr++) Line 3608  for (;; ptr++)
3608          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3609          }          }
3610    
3611          /* If the repetition is unlimited, it pays to see if the next thing on
3612          the line is something that cannot possibly match this character. If so,
3613          automatically possessifying this item gains some performance in the case
3614          where the match fails. */
3615    
3616          if (!possessive_quantifier &&
3617              repeat_max < 0 &&
3618              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3619                options, cd))
3620            {
3621            repeat_type = 0;    /* Force greedy */
3622            possessive_quantifier = TRUE;
3623            }
3624    
3625        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3626        }        }
3627    
3628      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3629      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3630      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3631      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3632        currently used only for single-byte chars. */
3633    
3634      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3635        {        {
3636        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3637        c = previous[1];        c = previous[1];
3638          if (!possessive_quantifier &&
3639              repeat_max < 0 &&
3640              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3641            {
3642            repeat_type = 0;    /* Force greedy */
3643            possessive_quantifier = TRUE;
3644            }
3645        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3646        }        }
3647    
# Line 2403  for (;; ptr++) Line 3655  for (;; ptr++)
3655      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3656        {        {
3657        uschar *oldcode;        uschar *oldcode;
3658        int prop_type;        int prop_type, prop_value;
3659        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3660        c = *previous;        c = *previous;
3661    
3662          if (!possessive_quantifier &&
3663              repeat_max < 0 &&
3664              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3665            {
3666            repeat_type = 0;    /* Force greedy */
3667            possessive_quantifier = TRUE;
3668            }
3669    
3670        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3671        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3672          previous[1] : -1;          {
3673            prop_type = previous[1];
3674            prop_value = previous[2];
3675            }
3676          else prop_type = prop_value = -1;
3677    
3678        oldcode = code;        oldcode = code;
3679        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2422  for (;; ptr++) Line 3686  for (;; ptr++)
3686        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3687        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3688    
3689        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3690    
3691        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3692    
# Line 2443  for (;; ptr++) Line 3707  for (;; ptr++)
3707          }          }
3708    
3709        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3710        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3711        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3712        one less than the maximum. */        one less than the maximum. */
3713    
# Line 2470  for (;; ptr++) Line 3734  for (;; ptr++)
3734    
3735          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3736          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3737          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3738          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3739          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3740    
# Line 2486  for (;; ptr++) Line 3750  for (;; ptr++)
3750  #endif  #endif
3751              {              {
3752              *code++ = c;              *code++ = c;
3753              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3754                  {
3755                  *code++ = prop_type;
3756                  *code++ = prop_value;
3757                  }
3758              }              }
3759            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3760            }            }
3761    
3762          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3763          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3764            UPTO is just for 1 instance, we can use QUERY instead. */
3765    
3766          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3767            {            {
# Line 2505  for (;; ptr++) Line 3774  for (;; ptr++)
3774            else            else
3775  #endif  #endif
3776            *code++ = c;            *code++ = c;
3777            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3778                {
3779                *code++ = prop_type;
3780                *code++ = prop_value;
3781                }
3782            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3783            *code++ = OP_UPTO + repeat_type;  
3784            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3785                {
3786                *code++ = OP_QUERY + repeat_type;
3787                }
3788              else
3789                {
3790                *code++ = OP_UPTO + repeat_type;
3791                PUT2INC(code, 0, repeat_max);
3792                }
3793            }            }
3794          }          }
3795    
# Line 2524  for (;; ptr++) Line 3805  for (;; ptr++)
3805  #endif  #endif
3806        *code++ = c;        *code++ = c;
3807    
3808        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3809        defines the required property. */        define the required property. */
3810    
3811  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3812        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3813            {
3814            *code++ = prop_type;
3815            *code++ = prop_value;
3816            }
3817  #endif  #endif
3818        }        }
3819    
# Line 2551  for (;; ptr++) Line 3836  for (;; ptr++)
3836        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3837        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3838    
3839        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3840    
3841        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
3842          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 2571  for (;; ptr++) Line 3856  for (;; ptr++)
3856      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3857      cases. */      cases. */
3858    
3859      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3860               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3861        {        {
3862        register int i;        register int i;
3863        int ketoffset = 0;        int ketoffset = 0;
3864        int len = code - previous;        int len = code - previous;
3865        uschar *bralink = NULL;        uschar *bralink = NULL;
3866    
3867          /* Repeating a DEFINE group is pointless */
3868    
3869          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3870            {
3871            *errorcodeptr = ERR55;
3872            goto FAILED;
3873            }
3874    
3875        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3876        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3877        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2601  for (;; ptr++) Line 3894  for (;; ptr++)
3894    
3895        if (repeat_min == 0)        if (repeat_min == 0)
3896          {          {
3897          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
3898          altogether. */          output altogether, like this:
   
         if (repeat_max == 0)  
           {  
           code = previous;  
           goto END_REPEAT;  
           }  
3899    
3900          /* If the maximum is 1 or unlimited, we just have to stick in the          ** if (repeat_max == 0)
3901          BRAZERO and do no more at this point. However, we do need to adjust          **   {
3902          any OP_RECURSE calls inside the group that refer to the group itself or          **   code = previous;
3903          any internal group, because the offset is from the start of the whole          **   goto END_REPEAT;
3904          regex. Temporarily terminate the pattern while doing this. */          **   }
3905    
3906            However, that fails when a group is referenced as a subroutine from
3907            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3908            so that it is skipped on execution. As we don't have a list of which
3909            groups are referenced, we cannot do this selectively.
3910    
3911            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3912            and do no more at this point. However, we do need to adjust any
3913            OP_RECURSE calls inside the group that refer to the group itself or any
3914            internal or forward referenced group, because the offset is from the
3915            start of the whole regex. Temporarily terminate the pattern while doing
3916            this. */
3917    
3918          if (repeat_max <= 1)          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
3919            {            {
3920            *code = OP_END;            *code = OP_END;
3921            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3922            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3923            code++;            code++;
3924              if (repeat_max == 0)
3925                {
3926                *previous++ = OP_SKIPZERO;
3927                goto END_REPEAT;
3928                }
3929            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
3930            }            }
3931    
# Line 2637  for (;; ptr++) Line 3941  for (;; ptr++)
3941            {            {
3942            int offset;            int offset;
3943            *code = OP_END;            *code = OP_END;
3944            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3945            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3946            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3947            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3961  for (;; ptr++)
3961        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3962        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3963        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3964        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3965          forward reference subroutine calls in the group, there will be entries on
3966          the workspace list; replicate these with an appropriate increment. */
3967    
3968        else        else
3969          {          {
3970          if (repeat_min > 1)          if (repeat_min > 1)
3971            {            {
3972            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3973            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3974              potential integer overflow. */
3975    
3976              if (lengthptr != NULL)
3977                {
3978                int delta = (repeat_min - 1)*length_prevgroup;
3979                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3980                                                                (double)INT_MAX ||
3981                    OFLOW_MAX - *lengthptr < delta)
3982                  {
3983                  *errorcodeptr = ERR20;
3984                  goto FAILED;
3985                  }
3986                *lengthptr += delta;
3987                }
3988    
3989              /* This is compiling for real */
3990    
3991              else
3992              {              {
3993              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3994              code += len;              for (i = 1; i < repeat_min; i++)
3995                  {
3996                  uschar *hc;
3997                  uschar *this_hwm = cd->hwm;
3998                  memcpy(code, previous, len);
3999                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4000                    {
4001                    PUT(cd->hwm, 0, GET(hc, 0) + len);
4002                    cd->hwm += LINK_SIZE;
4003                    }
4004                  save_hwm = this_hwm;
4005                  code += len;
4006                  }
4007              }              }
4008            }            }
4009    
4010          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
4011          }          }
4012    
# Line 2677  for (;; ptr++) Line 4014  for (;; ptr++)
4014        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
4015        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
4016        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
4017        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
4018          replicate entries on the forward reference list. */
4019    
4020        if (repeat_max >= 0)        if (repeat_max >= 0)
4021          {          {
4022          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
4023            just adjust the length as if we had. For each repetition we must add 1
4024            to the length for BRAZERO and for all but the last repetition we must
4025            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4026            paranoid checks to avoid integer overflow. */
4027    
4028            if (lengthptr != NULL && repeat_max > 0)
4029              {
4030              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4031                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4032              if ((double)repeat_max *
4033                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4034                      > (double)INT_MAX ||
4035                  OFLOW_MAX - *lengthptr < delta)
4036                {
4037                *errorcodeptr = ERR20;
4038                goto FAILED;
4039                }
4040              *lengthptr += delta;
4041              }
4042    
4043            /* This is compiling for real */
4044    
4045            else for (i = repeat_max - 1; i >= 0; i--)
4046            {            {
4047              uschar *hc;
4048              uschar *this_hwm = cd->hwm;
4049    
4050            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
4051    
4052            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2698  for (;; ptr++) Line 4062  for (;; ptr++)
4062              }              }
4063    
4064            memcpy(code, previous, len);            memcpy(code, previous, len);
4065              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4066                {
4067                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4068                cd->hwm += LINK_SIZE;
4069                }
4070              save_hwm = this_hwm;
4071            code += len;            code += len;
4072            }            }
4073    
# Line 2720  for (;; ptr++) Line 4090  for (;; ptr++)
4090        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
4091        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
4092        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
4093        correct offset was computed above. */        correct offset was computed above.
4094    
4095        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
4096          this group is a non-atomic one that could match an empty string. If so,
4097          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4098          that runtime checking can be done. [This check is also applied to
4099          atomic groups at runtime, but in a different way.] */
4100    
4101          else
4102            {
4103            uschar *ketcode = code - ketoffset;
4104            uschar *bracode = ketcode - GET(ketcode, 1);
4105            *ketcode = OP_KETRMAX + repeat_type;
4106            if (lengthptr == NULL && *bracode != OP_ONCE)
4107              {
4108              uschar *scode = bracode;
4109              do
4110                {
4111                if (could_be_empty_branch(scode, ketcode, utf8))
4112                  {
4113                  *bracode += OP_SBRA - OP_BRA;
4114                  break;
4115                  }
4116                scode += GET(scode, 1);
4117                }
4118              while (*scode == OP_ALT);
4119              }
4120            }
4121        }        }
4122    
4123        /* If previous is OP_FAIL, it was generated by an empty class [] in
4124        JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4125        by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4126        error above. We can just ignore the repeat in JS case. */
4127    
4128        else if (*previous == OP_FAIL) goto END_REPEAT;
4129    
4130      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
4131    
4132      else      else
# Line 2733  for (;; ptr++) Line 4135  for (;; ptr++)
4135        goto FAILED;        goto FAILED;
4136        }        }
4137    
4138      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
4139      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
4140      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
4141      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4142      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
4143        but the special opcodes can optimize it a bit. The repeated item starts at
4144        tempcode, not at previous, which might be the first part of a string whose
4145        (former) last char we repeated.
4146    
4147        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4148        an 'upto' may follow. We skip over an 'exact' item, and then test the
4149        length of what remains before proceeding. */
4150    
4151      if (possessive_quantifier)      if (possessive_quantifier)
4152        {        {
4153        int len = code - tempcode;        int len;
4154        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4155        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
4156        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode] +
4157        tempcode[0] = OP_ONCE;            ((*tempcode == OP_TYPEEXACT &&
4158        *code++ = OP_KET;               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4159        PUTINC(code, 0, len);        len = code - tempcode;
4160        PUT(tempcode, 1, len);        if (len > 0) switch (*tempcode)
4161            {
4162            case OP_STAR:  *tempcode = OP_POSSTAR; break;
4163            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4164            case OP_QUERY: *tempcode = OP_POSQUERY; break;
4165            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4166    
4167            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4168            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4169            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4170            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4171    
4172            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4173            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4174            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4175            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4176    
4177            default:
4178            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4179            code += 1 + LINK_SIZE;
4180            len += 1 + LINK_SIZE;
4181            tempcode[0] = OP_ONCE;
4182            *code++ = OP_KET;
4183            PUTINC(code, 0, len);
4184            PUT(tempcode, 1, len);
4185            break;
4186            }
4187        }        }
4188    
4189      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2761  for (;; ptr++) Line 4196  for (;; ptr++)
4196      break;      break;
4197    
4198    
4199      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
4200      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
4201      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
4202      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
4203    
4204      case '(':      case '(':
4205      newoptions = options;      newoptions = options;
4206      skipbytes = 0;      skipbytes = 0;
4207        bravalue = OP_CBRA;
4208        save_hwm = cd->hwm;
4209        reset_bracount = FALSE;
4210    
4211        /* First deal with various "verbs" that can be introduced by '*'. */
4212    
4213        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4214          {
4215          int i, namelen;
4216          const char *vn = verbnames;
4217          const uschar *name = ++ptr;
4218          previous = NULL;
4219          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4220          if (*ptr == ':')
4221            {
4222            *errorcodeptr = ERR59;   /* Not supported */
4223            goto FAILED;
4224            }
4225          if (*ptr != ')')
4226            {
4227            *errorcodeptr = ERR60;
4228            goto FAILED;
4229            }
4230          namelen = ptr - name;
4231          for (i = 0; i < verbcount; i++)
4232            {
4233            if (namelen == verbs[i].len &&
4234                strncmp((char *)name, vn, namelen) == 0)
4235              {
4236              *code = verbs[i].op;
4237              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4238              break;
4239              }
4240            vn += verbs[i].len + 1;
4241            }
4242          if (i < verbcount) continue;
4243          *errorcodeptr = ERR60;
4244          goto FAILED;
4245          }
4246    
4247        /* Deal with the extended parentheses; all are introduced by '?', and the
4248        appearance of any of them means that this is not a capturing group. */
4249    
4250      if (*(++ptr) == '?')      else if (*ptr == '?')
4251        {        {
4252        int set, unset;        int i, set, unset, namelen;
4253        int *optset;        int *optset;
4254          const uschar *name;
4255          uschar *slot;
4256    
4257        switch (*(++ptr))        switch (*(++ptr))
4258          {          {
4259          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
4260          ptr++;          ptr++;
4261          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
4262            if (*ptr == 0)
4263              {
4264              *errorcodeptr = ERR18;
4265              goto FAILED;
4266              }
4267          continue;          continue;
4268    
4269          case ':':                 /* Non-extracting bracket */  
4270            /* ------------------------------------------------------------ */
4271            case '|':                 /* Reset capture count for each branch */
4272            reset_bracount = TRUE;
4273            /* Fall through */
4274    
4275            /* ------------------------------------------------------------ */
4276            case ':':                 /* Non-capturing bracket */
4277          bravalue = OP_BRA;          bravalue = OP_BRA;
4278          ptr++;          ptr++;
4279          break;          break;
4280    
4281    
4282            /* ------------------------------------------------------------ */
4283          case '(':          case '(':
4284          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4285    
4286          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
4287            group), a name (referring to a named group), or 'R', referring to
4288            recursion. R<digits> and R&name are also permitted for recursion tests.
4289    
4290            There are several syntaxes for testing a named group: (?(name)) is used
4291            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4292    
4293            There are two unfortunate ambiguities, caused by history. (a) 'R' can
4294            be the recursive thing or the name 'R' (and similarly for 'R' followed
4295            by digits), and (b) a number could be a name that consists of digits.
4296            In both cases, we look for a name first; if not found, we try the other
4297            cases. */
4298    
4299            /* For conditions that are assertions, check the syntax, and then exit
4300            the switch. This will take control down to where bracketed groups,
4301            including assertions, are processed. */
4302    
4303            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4304              break;
4305    
4306            /* Most other conditions use OP_CREF (a couple change to OP_RREF
4307            below), and all need to skip 3 bytes at the start of the group. */
4308    
4309            code[1+LINK_SIZE] = OP_CREF;
4310            skipbytes = 3;
4311            refsign = -1;
4312    
4313            /* Check for a test for recursion in a named group. */
4314    
4315          if (ptr[1] == 'R')          if (ptr[1] == 'R' && ptr[2] == '&')
4316            {            {
4317            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4318            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4319            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4320            }            }
4321    
4322          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4323          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
4324    
4325          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
4326            {            {
4327            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4328            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4329            }            }
4330          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
         set bravalue above. */  
         break;  
   
         case '=':                 /* Positive lookahead */  
         bravalue = OP_ASSERT;  
         ptr++;  
         break;  
   
         case '!':                 /* Negative lookahead */  
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;