/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log | View Patch

code/trunk/pcre_compile.c revision 180 by ph10, Wed Jun 13 10:59:18 2007 UTC | code/branches/pcre16/pcre_compile.c revision 770 by zherczeg, Mon Nov 28 20:39:30 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2011 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49  #define NLBLOCK cd             /* Block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
50  #define PSSTART start_pattern  /* Field containing processed string start */  #define PSSTART start_pattern  /* Field containing processed string start */
51  #define PSEND   end_pattern    /* Field containing processed string end */  #define PSEND   end_pattern    /* Field containing processed string end */
52    
   
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When DEBUG is defined, we need the pcre_printint() function, which is also  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57  used by pcretest. DEBUG is not defined when building a production library. */  also used by pcretest. PCRE_DEBUG is not defined when building a production
58    library. */
59    
60  #ifdef DEBUG  #ifdef PCRE_DEBUG
61  #include "pcre_printint.src"  #include "pcre_printint.src"
62  #endif  #endif
63    
# Line 62  used by pcretest. DEBUG is not defined w Line 66  used by pcretest. DEBUG is not defined w
66    
67  #define SETBIT(a,b) a[b/8] |= (1 << (b%8))  #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
68    
69    /* Maximum length value to check against when making sure that the integer that
70    holds the compiled pattern length does not overflow. We make it a bit less than
71    INT_MAX to allow for adding in group terminating bytes, so that we don't have
72    to check them every time. */
73    
74    #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77  /*************************************************  /*************************************************
78  *      Code parameters and static tables         *  *      Code parameters and static tables         *
# Line 81  is 4 there is plenty of room. */ Line 92  is 4 there is plenty of room. */
92    
93  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (4096)
94    
95    /* The overrun tests check for a slightly smaller size so that they detect the
96    overrun before it actually does run off the end of the data block. */
97    
98    #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
103  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
104  is invalid. */  is invalid. */
105    
106  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC
107    
108    /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109    in UTF-8 mode. */
110    
111  static const short int escapes[] = {  static const short int escapes[] = {
112       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,                       0,
113       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,                       0,
114     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */       0,                       0,
115  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */       0,                       0,
116  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */       0,                       0,
117  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */       CHAR_COLON,              CHAR_SEMICOLON,
118     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
119  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
120  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */       CHAR_COMMERCIAL_AT,      -ESC_A,
121       0,      0, -ESC_z                                            /* x - z */       -ESC_B,                  -ESC_C,
122         -ESC_D,                  -ESC_E,
123         0,                       -ESC_G,
124         -ESC_H,                  0,
125         0,                       -ESC_K,
126         0,                       0,
127         -ESC_N,                  0,
128         -ESC_P,                  -ESC_Q,
129         -ESC_R,                  -ESC_S,
130         0,                       0,
131         -ESC_V,                  -ESC_W,
132         -ESC_X,                  0,
133         -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
134         CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
135         CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
136         CHAR_GRAVE_ACCENT,       7,
137         -ESC_b,                  0,
138         -ESC_d,                  ESC_e,
139         ESC_f,                   0,
140         -ESC_h,                  0,
141         0,                       -ESC_k,
142         0,                       0,
143         ESC_n,                   0,
144         -ESC_p,                  0,
145         ESC_r,                   -ESC_s,
146         ESC_tee,                 0,
147         -ESC_v,                  -ESC_w,
148         0,                       0,
149         -ESC_z
150  };  };
151    
152  #else           /* This is the "abnormal" table for EBCDIC systems */  #else
153    
154    /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156  static const short int escapes[] = {  static const short int escapes[] = {
157  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
158  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 120  static const short int escapes[] = { Line 171  static const short int escapes[] = {
171  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
172  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
173  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
174  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
175  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
176  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
177  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
# Line 130  static const short int escapes[] = { Line 181  static const short int escapes[] = {
181  #endif  #endif
182    
183    
184  /* Tables of names of POSIX character classes and their lengths. The list is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185  terminated by a zero length entry. The first three must be alpha, lower, upper,  searched linearly. Put all the names into a single string, in order to reduce
186  as this is assumed for handling case independence. */  the number of relocations when a shared library is dynamically linked. The
187    string is built from string macros so that it works in UTF-8 mode on EBCDIC
188  static const char *const posix_names[] = {  platforms. */
189    "alpha", "lower", "upper",  
190    "alnum", "ascii", "blank", "cntrl", "digit", "graph",  typedef struct verbitem {
191    "print", "punct", "space", "word",  "xdigit" };    int   len;                 /* Length of verb name */
192      int   op;                  /* Op when no arg, or -1 if arg mandatory */
193      int   op_arg;              /* Op when arg present, or -1 if not allowed */
194    } verbitem;
195    
196    static const char verbnames[] =
197      "\0"                       /* Empty name is a shorthand for MARK */
198      STRING_MARK0
199      STRING_ACCEPT0
200      STRING_COMMIT0
201      STRING_F0
202      STRING_FAIL0
203      STRING_PRUNE0
204      STRING_SKIP0
205      STRING_THEN;
206    
207    static const verbitem verbs[] = {
208      { 0, -1,        OP_MARK },
209      { 4, -1,        OP_MARK },
210      { 6, OP_ACCEPT, -1 },
211      { 6, OP_COMMIT, -1 },
212      { 1, OP_FAIL,   -1 },
213      { 4, OP_FAIL,   -1 },
214      { 5, OP_PRUNE,  OP_PRUNE_ARG },
215      { 4, OP_SKIP,   OP_SKIP_ARG  },
216      { 4, OP_THEN,   OP_THEN_ARG  }
217    };
218    
219    static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220    
221  static const uschar posix_name_lengths[] = {  
222    /* Tables of names of POSIX character classes and their lengths. The names are
223    now all in a single string, to reduce the number of relocations when a shared
224    library is dynamically loaded. The list of lengths is terminated by a zero
225    length entry. The first three must be alpha, lower, upper, as this is assumed
226    for handling case independence. */
227    
228    static const char posix_names[] =
229      STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230      STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231      STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232      STRING_word0  STRING_xdigit;
233    
234    static const pcre_uint8 posix_name_lengths[] = {
235    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 169  static const int posix_class_maps[] = { Line 261  static const int posix_class_maps[] = {
261    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
262  };  };
263    
264    /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265    substitutes must be in the order of the names, defined above, and there are
266    both positive and negative cases. NULL means no substitute. */
267    
268    #ifdef SUPPORT_UCP
269    static const pcre_uchar string_PNd[]  = {
270      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
271      CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
272    static const pcre_uchar string_pNd[]  = {
273      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
274      CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
275    static const pcre_uchar string_PXsp[] = {
276      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
277      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
278    static const pcre_uchar string_pXsp[] = {
279      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
280      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
281    static const pcre_uchar string_PXwd[] = {
282      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
283      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
284    static const pcre_uchar string_pXwd[] = {
285      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
286      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
287    
288    static const pcre_uchar *substitutes[] = {
289      string_PNd,           /* \D */
290      string_pNd,           /* \d */
291      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
292      string_pXsp,          /* \s */
293      string_PXwd,          /* \W */
294      string_pXwd           /* \w */
295    };
296    
297    static const pcre_uchar string_pL[] =   {
298      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300    static const pcre_uchar string_pLl[] =  {
301      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
302      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303    static const pcre_uchar string_pLu[] =  {
304      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    static const pcre_uchar string_pXan[] = {
307      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
308      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
309    static const pcre_uchar string_h[] =    {
310      CHAR_BACKSLASH, CHAR_h, '\0' };
311    static const pcre_uchar string_pXps[] = {
312      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
313      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
314    static const pcre_uchar string_PL[] =   {
315      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
316      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
317    static const pcre_uchar string_PLl[] =  {
318      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
319      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
320    static const pcre_uchar string_PLu[] =  {
321      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
322      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
323    static const pcre_uchar string_PXan[] = {
324      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
325      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
326    static const pcre_uchar string_H[] =    {
327      CHAR_BACKSLASH, CHAR_H, '\0' };
328    static const pcre_uchar string_PXps[] = {
329      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
330      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
331    
332    static const pcre_uchar *posix_substitutes[] = {
333      string_pL,            /* alpha */
334      string_pLl,           /* lower */
335      string_pLu,           /* upper */
336      string_pXan,          /* alnum */
337      NULL,                 /* ascii */
338      string_h,             /* blank */
339      NULL,                 /* cntrl */
340      string_pNd,           /* digit */
341      NULL,                 /* graph */
342      NULL,                 /* print */
343      NULL,                 /* punct */
344      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
345      string_pXwd,          /* word */
346      NULL,                 /* xdigit */
347      /* Negated cases */
348      string_PL,            /* ^alpha */
349      string_PLl,           /* ^lower */
350      string_PLu,           /* ^upper */
351      string_PXan,          /* ^alnum */
352      NULL,                 /* ^ascii */
353      string_H,             /* ^blank */
354      NULL,                 /* ^cntrl */
355      string_PNd,           /* ^digit */
356      NULL,                 /* ^graph */
357      NULL,                 /* ^print */
358      NULL,                 /* ^punct */
359      string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
360      string_PXwd,          /* ^word */
361      NULL                  /* ^xdigit */
362    };
363    #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
364    #endif
365    
366  #define STRING(a)  # a  #define STRING(a)  # a
367  #define XSTRING(s) STRING(s)  #define XSTRING(s) STRING(s)
# Line 176  static const int posix_class_maps[] = { Line 369  static const int posix_class_maps[] = {
369  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
370  are passed to the outside world. Do not ever re-use any error number, because  are passed to the outside world. Do not ever re-use any error number, because
371  they are documented. Always add a new error instead. Messages marked DEAD below  they are documented. Always add a new error instead. Messages marked DEAD below
372  are no longer used. */  are no longer used. This used to be a table of strings, but in order to reduce
373    the number of relocations needed when a shared library is loaded dynamically,
374  static const char *error_texts[] = {  it is now one long string. We cannot use a table of offsets, because the
375    "no error",  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
376    "\\ at end of pattern",  simply count through to the one we want - this isn't a performance issue
377    "\\c at end of pattern",  because these strings are used only when there is a compilation error.
378    "unrecognized character follows \\",  
379    "numbers out of order in {} quantifier",  Each substring ends with \0 to insert a null character. This includes the final
380    substring, so that the whole string ends with \0\0, which can be detected when
381    counting through. */
382    
383    static const char error_texts[] =
384      "no error\0"
385      "\\ at end of pattern\0"
386      "\\c at end of pattern\0"
387      "unrecognized character follows \\\0"
388      "numbers out of order in {} quantifier\0"
389    /* 5 */    /* 5 */
390    "number too big in {} quantifier",    "number too big in {} quantifier\0"
391    "missing terminating ] for character class",    "missing terminating ] for character class\0"
392    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
393    "range out of order in character class",    "range out of order in character class\0"
394    "nothing to repeat",    "nothing to repeat\0"
395    /* 10 */    /* 10 */
396    "operand of unlimited repeat could match the empty string",  /** DEAD **/    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
397    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
398    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
399    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
400    "missing )",    "missing )\0"
401    /* 15 */    /* 15 */
402    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
403    "erroffset passed as NULL",    "erroffset passed as NULL\0"
404    "unknown option bit(s) set",    "unknown option bit(s) set\0"
405    "missing ) after comment",    "missing ) after comment\0"
406    "parentheses nested too deeply",  /** DEAD **/    "parentheses nested too deeply\0"  /** DEAD **/
407    /* 20 */    /* 20 */
408    "regular expression too large",    "regular expression is too large\0"
409    "failed to get memory",    "failed to get memory\0"
410    "unmatched parentheses",    "unmatched parentheses\0"
411    "internal error: code overflow",    "internal error: code overflow\0"
412    "unrecognized character after (?<",    "unrecognized character after (?<\0"
413    /* 25 */    /* 25 */
414    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
415    "malformed number or name after (?(",    "malformed number or name after (?(\0"
416    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
417    "assertion expected after (?(",    "assertion expected after (?(\0"
418    "(?R or (?[+-]digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
419    /* 30 */    /* 30 */
420    "unknown POSIX class name",    "unknown POSIX class name\0"
421    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
422    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
423    "spare error",  /** DEAD **/    "spare error\0"  /** DEAD **/
424    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
425    /* 35 */    /* 35 */
426    "invalid condition (?(0)",    "invalid condition (?(0)\0"
427    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
428    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
429    "number after (?C is > 255",    "number after (?C is > 255\0"
430    "closing ) for (?C expected",    "closing ) for (?C expected\0"
431    /* 40 */    /* 40 */
432    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
433    "unrecognized character after (?P",    "unrecognized character after (?P\0"
434    "syntax error in subpattern name (missing terminator)",    "syntax error in subpattern name (missing terminator)\0"
435    "two named subpatterns have the same name",    "two named subpatterns have the same name\0"
436    "invalid UTF-8 string",    "invalid UTF-8 string\0"
437    /* 45 */    /* 45 */
438    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
439    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
440    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p\0"
441    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
442    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
443    /* 50 */    /* 50 */
444    "repeated subpattern is too long",    "repeated subpattern is too long\0"    /** DEAD **/
445    "octal value is greater than \\377 (not in UTF-8 mode)",    "octal value is greater than \\377 (not in UTF-8 mode)\0"
446    "internal error: overran compiling workspace",    "internal error: overran compiling workspace\0"
447    "internal error: previously-checked referenced subpattern not found",    "internal error: previously-checked referenced subpattern not found\0"
448    "DEFINE group contains more than one branch",    "DEFINE group contains more than one branch\0"
449    /* 55 */    /* 55 */
450    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
451    "inconsistent NEWLINE options",    "inconsistent NEWLINE options\0"
452    "\\g is not followed by a braced name or an optionally braced non-zero number",    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
453    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"    "a numbered reference must not be zero\0"
454  };    "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
455      /* 60 */
456      "(*VERB) not recognized\0"
457      "number is too big\0"
458      "subpattern name expected\0"
459      "digit expected after (?+\0"
460      "] is an invalid data character in JavaScript compatibility mode\0"
461      /* 65 */
462      "different names for subpatterns of the same number are not allowed\0"
463      "(*MARK) must have an argument\0"
464      "this version of PCRE is not compiled with PCRE_UCP support\0"
465      "\\c must be followed by an ASCII character\0"
466      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
467      /* 70 */
468      "internal error: unknown opcode in find_fixedlength()\0"
469      ;
470    
471  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
472  patterns. Note that the tables in chartables are dependent on the locale, and  patterns. Note that the tables in chartables are dependent on the locale, and
# Line 268  For convenience, we use the same bit def Line 484  For convenience, we use the same bit def
484    
485  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
486    
487  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC
488    
489    /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
490    UTF-8 mode. */
491    
492  static const unsigned char digitab[] =  static const unsigned char digitab[] =
493    {    {
494    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 304  static const unsigned char digitab[] = Line 524  static const unsigned char digitab[] =
524    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
525    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
526    
527  #else           /* This is the "abnormal" case, for EBCDIC systems */  #else
528    
529    /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
530    
531  static const unsigned char digitab[] =  static const unsigned char digitab[] =
532    {    {
533    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 379  static const unsigned char ebcdic_charta Line 602  static const unsigned char ebcdic_charta
602  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
603    
604  static BOOL  static BOOL
605    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
606      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
607    
608    
609    
610  /*************************************************  /*************************************************
611    *            Find an error text                  *
612    *************************************************/
613    
614    /* The error texts are now all in one long string, to save on relocations. As
615    some of the text is of unknown length, we can't use a table of offsets.
616    Instead, just count through the strings. This is not a performance issue
617    because it happens only when there has been a compilation error.
618    
619    Argument:   the error number
620    Returns:    pointer to the error string
621    */
622    
623    static const char *
624    find_error_text(int n)
625    {
626    const char *s = error_texts;
627    for (; n > 0; n--)
628      {
629      while (*s++ != 0) {};
630      if (*s == 0) return "Error text not found (please report)";
631      }
632    return s;
633    }
634    
635    
636    /*************************************************
637    *            Check for counted repeat            *
638    *************************************************/
639    
640    /* This function is called when a '{' is encountered in a place where it might
641    start a quantifier. It looks ahead to see if it really is a quantifier or not.
642    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
643    where the ddds are digits.
644    
645    Arguments:
646      p         pointer to the first char after '{'
647    
648    Returns:    TRUE or FALSE
649    */
650    
651    static BOOL
652    is_counted_repeat(const pcre_uchar *p)
653    {
654    if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
655    while ((digitab[*p] & ctype_digit) != 0) p++;
656    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
657    
658    if (*p++ != CHAR_COMMA) return FALSE;
659    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
660    
661    if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
662    while ((digitab[*p] & ctype_digit) != 0) p++;
663    
664    return (*p == CHAR_RIGHT_CURLY_BRACKET);
665    }
666    
667    
668    
669    /*************************************************
670  *            Handle escapes                      *  *            Handle escapes                      *
671  *************************************************/  *************************************************/
672    
# Line 405  Arguments: Line 687  Arguments:
687    
688  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
689                   negative => a special escape sequence                   negative => a special escape sequence
690                   on error, errorptr is set                   on error, errorcodeptr is set
691  */  */
692    
693  static int  static int
694  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
695    int options, BOOL isclass)    int options, BOOL isclass)
696  {  {
697  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
698  const uschar *ptr = *ptrptr + 1;  const pcre_uchar *ptr = *ptrptr + 1;
699  int c, i;  int c, i;
700    
701  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
# Line 423  ptr--;                            /* Set Line 705  ptr--;                            /* Set
705    
706  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
707    
708  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
709  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
710  Otherwise further processing may be required. */  Otherwise further processing may be required. */
711    
712  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
713  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
714  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
715    
716  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
717  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
718  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
719  #endif  #endif
720    
# Line 440  else if ((i = escapes[c - 0x48]) != 0) Line 722  else if ((i = escapes[c - 0x48]) != 0)
722    
723  else  else
724    {    {
725    const uschar *oldptr;    const pcre_uchar *oldptr;
726    BOOL braced, negated;    BOOL braced, negated;
727    
728    switch (c)    switch (c)
# Line 448  else Line 730  else
730      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
731      error. */      error. */
732    
733      case 'l':      case CHAR_l:
734      case 'L':      case CHAR_L:
     case 'N':  
     case 'u':  
     case 'U':  
735      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
736      break;      break;
737    
738      /* \g must be followed by a number, either plain or braced. If positive, it      case CHAR_u:
739      is an absolute backreference. If negative, it is a relative backreference.      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
740      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a        {
741      reference to a named group. This is part of Perl's movement towards a        /* In JavaScript, \u must be followed by four hexadecimal digits.
742      unified syntax for back references. As this is synonymous with \k{name}, we        Otherwise it is a lowercase u letter. */
743      fudge it up by pretending it really was \k. */        if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
744               && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
745      case 'g':          {
746      if (ptr[1] == '{')          c = 0;
747        {          for (i = 0; i < 4; ++i)
748        const uschar *p;            {
749        for (p = ptr+2; *p != 0 && *p != '}'; p++)            register int cc = *(++ptr);
750          if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
751        if (*p != 0 && *p != '}')            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
752              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
753    #else           /* EBCDIC coding */
754              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
755              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
756    #endif
757              }
758            }
759          }
760        else
761          *errorcodeptr = ERR37;
762        break;
763    
764        case CHAR_U:
765        /* In JavaScript, \U is an uppercase U letter. */
766        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
767        break;
768    
769        /* In a character class, \g is just a literal "g". Outside a character
770        class, \g must be followed by one of a number of specific things:
771    
772        (1) A number, either plain or braced. If positive, it is an absolute
773        backreference. If negative, it is a relative backreference. This is a Perl
774        5.10 feature.
775    
776        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
777        is part of Perl's movement towards a unified syntax for back references. As
778        this is synonymous with \k{name}, we fudge it up by pretending it really
779        was \k.
780    
781        (3) For Oniguruma compatibility we also support \g followed by a name or a
782        number either in angle brackets or in single quotes. However, these are
783        (possibly recursive) subroutine calls, _not_ backreferences. Just return
784        the -ESC_g code (cf \k). */
785    
786        case CHAR_g:
787        if (isclass) break;
788        if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
789          {
790          c = -ESC_g;
791          break;
792          }
793    
794        /* Handle the Perl-compatible cases */
795    
796        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
797          {
798          const pcre_uchar *p;
799          for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
800            if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
801          if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
802          {          {
803          c = -ESC_k;          c = -ESC_k;
804          break;          break;
# Line 479  else Line 808  else
808        }        }
809      else braced = FALSE;      else braced = FALSE;
810    
811      if (ptr[1] == '-')      if (ptr[1] == CHAR_MINUS)
812        {        {
813        negated = TRUE;        negated = TRUE;
814        ptr++;        ptr++;
# Line 488  else Line 817  else
817    
818      c = 0;      c = 0;
819      while ((digitab[ptr[1]] & ctype_digit) != 0)      while ((digitab[ptr[1]] & ctype_digit) != 0)
820        c = c * 10 + *(++ptr) - '0';        c = c * 10 + *(++ptr) - CHAR_0;
821    
822      if (c == 0 || (braced && *(++ptr) != '}'))      if (c < 0)   /* Integer overflow */
823          {
824          *errorcodeptr = ERR61;
825          break;
826          }
827    
828        if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
829        {        {
830        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
831        return 0;        break;
832          }
833    
834        if (c == 0)
835          {
836          *errorcodeptr = ERR58;
837          break;
838        }        }
839    
840      if (negated)      if (negated)
# Line 501  else Line 842  else
842        if (c > bracount)        if (c > bracount)
843          {          {
844          *errorcodeptr = ERR15;          *errorcodeptr = ERR15;
845          return 0;          break;
846          }          }
847        c = bracount - (c - 1);        c = bracount - (c - 1);
848        }        }
# Line 521  else Line 862  else
862      value is greater than 377, the least significant 8 bits are taken. Inside a      value is greater than 377, the least significant 8 bits are taken. Inside a
863      character class, \ followed by a digit is always an octal number. */      character class, \ followed by a digit is always an octal number. */
864    
865      case '1': case '2': case '3': case '4': case '5':      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
866      case '6': case '7': case '8': case '9':      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
867    
868      if (!isclass)      if (!isclass)
869        {        {
870        oldptr = ptr;        oldptr = ptr;
871        c -= '0';        c -= CHAR_0;
872        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
873          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - CHAR_0;
874          if (c < 0)    /* Integer overflow */
875            {
876            *errorcodeptr = ERR61;
877            break;
878            }
879        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
880          {          {
881          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 542  else Line 888  else
888      generates a binary zero byte and treats the digit as a following literal.      generates a binary zero byte and treats the digit as a following literal.
889      Thus we have to pull back the pointer by one. */      Thus we have to pull back the pointer by one. */
890    
891      if ((c = *ptr) >= '8')      if ((c = *ptr) >= CHAR_8)
892        {        {
893        ptr--;        ptr--;
894        c = 0;        c = 0;
# Line 555  else Line 901  else
901      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
902      than 3 octal digits. */      than 3 octal digits. */
903    
904      case '0':      case CHAR_0:
905      c -= '0';      c -= CHAR_0;
906      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
907          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - CHAR_0;
908      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf8 && c > 255) *errorcodeptr = ERR51;
909      break;      break;
910    
# Line 566  else Line 912  else
912      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
913      treated as a data character. */      treated as a data character. */
914    
915      case 'x':      case CHAR_x:
916      if (ptr[1] == '{')      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
917          {
918          /* In JavaScript, \x must be followed by two hexadecimal digits.
919          Otherwise it is a lowercase x letter. */
920          if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
921            {
922            c = 0;
923            for (i = 0; i < 2; ++i)
924              {
925              register int cc = *(++ptr);
926    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
927              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
928              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
929    #else           /* EBCDIC coding */
930              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
931              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
932    #endif
933              }
934            }
935          break;
936          }
937    
938        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
939        {        {
940        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
941        int count = 0;        int count = 0;
942    
943        c = 0;        c = 0;
944        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
945          {          {
946          register int cc = *pt++;          register int cc = *pt++;
947          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
948          count++;          count++;
949    
950  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
951          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
952          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
953  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
954          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
955          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
956  #endif  #endif
957          }          }
958    
959        if (*pt == '}')        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
960          {          {
961          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
962          ptr = pt;          ptr = pt;
# Line 604  else Line 972  else
972      c = 0;      c = 0;
973      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
974        {        {
975        int cc;                               /* Some compilers don't like ++ */        int cc;                                  /* Some compilers don't like */
976        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                           /* ++ in initializers */
977  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
978        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
979        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
980  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
981        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
982        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
983  #endif  #endif
984        }        }
985      break;      break;
986    
987      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
988      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
989        coding is ASCII-specific, but then the whole concept of \cx is
990      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
991    
992      case 'c':      case CHAR_c:
993      c = *(++ptr);      c = *(++ptr);
994      if (c == 0)      if (c == 0)
995        {        {
996        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
997        return 0;        break;
998        }        }
999    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1000  #ifndef EBCDIC  /* ASCII coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
1001      if (c >= 'a' && c <= 'z') c -= 32;        {
1002          *errorcodeptr = ERR68;
1003          break;
1004          }
1005        if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1006      c ^= 0x40;      c ^= 0x40;
1007  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
1008      if (c >= 'a' && c <= 'z') c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1009      c ^= 0xC0;      c ^= 0xC0;
1010  #endif  #endif
1011      break;      break;
1012    
1013      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1014      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
1015      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
1016      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
1017      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
1018    
1019      default:      default:
1020      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 654  else Line 1027  else
1027      }      }
1028    }    }
1029    
1030    /* Perl supports \N{name} for character names, as well as plain \N for "not
1031    newline". PCRE does not support \N{name}. However, it does support
1032    quantification such as \N{2,3}. */
1033    
1034    if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1035         !is_counted_repeat(ptr+2))
1036      *errorcodeptr = ERR37;
1037    
1038    /* If PCRE_UCP is set, we change the values for \d etc. */
1039    
1040    if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1041      c -= (ESC_DU - ESC_D);
1042    
1043    /* Set the pointer to the final character before returning. */
1044    
1045  *ptrptr = ptr;  *ptrptr = ptr;
1046  return c;  return c;
1047  }  }
# Line 680  Returns:         type value from ucp_typ Line 1068  Returns:         type value from ucp_typ
1068  */  */
1069    
1070  static int  static int
1071  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1072  {  {
1073  int c, i, bot, top;  int c, i, bot, top;
1074  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1075  char name[32];  pcre_uchar name[32];
1076    
1077  c = *(++ptr);  c = *(++ptr);
1078  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 694  if (c == 0) goto ERROR_RETURN; Line 1082  if (c == 0) goto ERROR_RETURN;
1082  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1083  negation. */  negation. */
1084    
1085  if (c == '{')  if (c == CHAR_LEFT_CURLY_BRACKET)
1086    {    {
1087    if (ptr[1] == '^')    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1088      {      {
1089      *negptr = TRUE;      *negptr = TRUE;
1090      ptr++;      ptr++;
1091      }      }
1092    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
1093      {      {
1094      c = *(++ptr);      c = *(++ptr);
1095      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
1096      if (c == '}') break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1097      name[i] = c;      name[i] = c;
1098      }      }
1099    if (c !='}') goto ERROR_RETURN;    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1100    name[i] = 0;    name[i] = 0;
1101    }    }
1102    
# Line 725  else Line 1113  else
1113  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1114    
1115  bot = 0;  bot = 0;
1116  top = _pcre_utt_size;  top = PRIV(utt_size);
1117    
1118  while (bot < top)  while (bot < top)
1119    {    {
1120    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1121    c = strcmp(name, _pcre_utt[i].name);    c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1122    if (c == 0)    if (c == 0)
1123      {      {
1124      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1125      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1126      }      }
1127    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
1128    }    }
# Line 754  return -1; Line 1142  return -1;
1142    
1143    
1144  /*************************************************  /*************************************************
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
 if (*p == '}') return TRUE;  
   
 if (*p++ != ',') return FALSE;  
 if (*p == '}') return TRUE;  
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
   
 return (*p == '}');  
 }  
   
   
   
 /*************************************************  
1145  *         Read repeat counts                     *  *         Read repeat counts                     *
1146  *************************************************/  *************************************************/
1147    
# Line 805  Returns:         pointer to '}' on succe Line 1160  Returns:         pointer to '}' on succe
1160                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1161  */  */
1162    
1163  static const uschar *  static const pcre_uchar *
1164  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1165  {  {
1166  int min = 0;  int min = 0;
1167  int max = -1;  int max = -1;
# Line 814  int max = -1; Line 1169  int max = -1;
1169  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1170  an integer overflow. */  an integer overflow. */
1171    
1172  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1173  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1174    {    {
1175    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 824  if (min < 0 || min > 65535) Line 1179  if (min < 0 || min > 65535)
1179  /* Read the maximum value if there is one, and again do a paranoid check on its size.  /* Read the maximum value if there is one, and again do a paranoid check on its size.
1180  Also, max must not be less than min. */  Also, max must not be less than min. */
1181    
1182  if (*p == '}') max = min; else  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1183    {    {
1184    if (*(++p) != '}')    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1185      {      {
1186      max = 0;      max = 0;
1187      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1188      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1189        {        {
1190        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 854  return p; Line 1209  return p;
1209    
1210    
1211  /*************************************************  /*************************************************
1212  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1213  *************************************************/  *************************************************/
1214    
1215  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1216    top-level call starts at the beginning of the pattern. All other calls must
1217    start at a parenthesis. It scans along a pattern's text looking for capturing
1218  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1219  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1220  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. Recursion is used to keep
1221  references to subpatterns. We know that if (?P< is encountered, the name will  track of subpatterns that reset the capturing group numbers - the (?| feature.
1222  be terminated by '>' because that is checked in the first pass.  
1223    This function was originally called only from the second pass, in which we know
1224    that if (?< or (?' or (?P< is encountered, the name will be correctly
1225    terminated because that is checked in the first pass. There is now one call to
1226    this function in the first pass, to check for a recursive back reference by
1227    name (so that we can make the whole group atomic). In this case, we need check
1228    only up to the current position in the pattern, and that is still OK because
1229    any previous occurrences will have been checked. To make this work, the test
1230    for "end of pattern" is a check against cd->end_pattern in the main loop,
1231    instead of looking for a binary zero. This means that the special first-pass
1232    call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1233    processing items within the loop are OK, because afterwards the main loop will
1234    terminate.)
1235    
1236  Arguments:  Arguments:
1237    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1238    count        current count of capturing parens so far encountered    cd           compile background data
1239    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1240    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1241    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1242      utf8         TRUE if we are in UTF-8 mode
1243      count        pointer to the current capturing subpattern number (updated)
1244    
1245  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1246  */  */
1247    
1248  static int  static int
1249  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1250    BOOL xmode)    BOOL xmode, BOOL utf8, int *count)
1251  {  {
1252  const uschar *thisname;  pcre_uchar *ptr = *ptrptr;
1253    int start_count = *count;
1254    int hwm_count = start_count;
1255    BOOL dup_parens = FALSE;
1256    
1257  for (; *ptr != 0; ptr++)  /* If the first character is a parenthesis, check on the type of group we are
1258    dealing with. The very first call may not start with a parenthesis. */
1259    
1260    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1261    {    {
1262    int term;    /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1263    
1264      if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1265    
1266      /* Handle a normal, unnamed capturing parenthesis. */
1267    
1268      else if (ptr[1] != CHAR_QUESTION_MARK)
1269        {
1270        *count += 1;
1271        if (name == NULL && *count == lorn) return *count;
1272        ptr++;
1273        }
1274    
1275      /* All cases now have (? at the start. Remember when we are in a group
1276      where the parenthesis numbers are duplicated. */
1277    
1278      else if (ptr[2] == CHAR_VERTICAL_LINE)
1279        {
1280        ptr += 3;
1281        dup_parens = TRUE;
1282        }
1283    
1284      /* Handle comments; all characters are allowed until a ket is reached. */
1285    
1286      else if (ptr[2] == CHAR_NUMBER_SIGN)
1287        {
1288        for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1289        goto FAIL_EXIT;
1290        }
1291    
1292      /* Handle a condition. If it is an assertion, just carry on so that it
1293      is processed as normal. If not, skip to the closing parenthesis of the
1294      condition (there can't be any nested parens). */
1295    
1296      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1297        {
1298        ptr += 2;
1299        if (ptr[1] != CHAR_QUESTION_MARK)
1300          {
1301          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1302          if (*ptr != 0) ptr++;
1303          }
1304        }
1305    
1306      /* Start with (? but not a condition. */
1307    
1308      else
1309        {
1310        ptr += 2;
1311        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1312    
1313        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1314    
1315        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1316            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1317          {
1318          int term;
1319          const pcre_uchar *thisname;
1320          *count += 1;
1321          if (name == NULL && *count == lorn) return *count;
1322          term = *ptr++;
1323          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1324          thisname = ptr;
1325          while (*ptr != term) ptr++;
1326          if (name != NULL && lorn == ptr - thisname &&
1327              STRNCMP_UC_UC(name, thisname, lorn) == 0)
1328            return *count;
1329          term++;
1330          }
1331        }
1332      }
1333    
1334    /* Past any initial parenthesis handling, scan for parentheses or vertical
1335    bars. Stop if we get to cd->end_pattern. Note that this is important for the
1336    first-pass call when this value is temporarily adjusted to stop at the current
1337    position. So DO NOT change this to a test for binary zero. */
1338    
1339    for (; ptr < cd->end_pattern; ptr++)
1340      {
1341    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1342    
1343    if (*ptr == '\\')    if (*ptr == CHAR_BACKSLASH)
1344      {      {
1345      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1346      if (*ptr == 'Q') for (;;)      if (*ptr == CHAR_Q) for (;;)
1347        {        {
1348        while (*(++ptr) != 0 && *ptr != '\\');        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1349        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1350        if (*(++ptr) == 'E') break;        if (*(++ptr) == CHAR_E) break;
1351        }        }
1352      continue;      continue;
1353      }      }
1354    
1355    /* Skip over character classes */    /* Skip over character classes; this logic must be similar to the way they
1356      are handled for real. If the first character is '^', skip it. Also, if the
1357      first few characters (either before or after ^) are \Q\E or \E we skip them
1358      too. This makes for compatibility with Perl. Note the use of STR macros to
1359      encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1360    
1361      if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1362        {
1363        BOOL negate_class = FALSE;
1364        for (;;)
1365          {
1366          if (ptr[1] == CHAR_BACKSLASH)
1367            {
1368            if (ptr[2] == CHAR_E)
1369              ptr+= 2;
1370            else if (STRNCMP_UC_C8(ptr + 2,
1371                     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1372              ptr += 4;
1373            else
1374              break;
1375            }
1376          else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1377            {
1378            negate_class = TRUE;
1379            ptr++;
1380            }
1381          else break;
1382          }
1383    
1384        /* If the next character is ']', it is a data character that must be
1385        skipped, except in JavaScript compatibility mode. */
1386    
1387    if (*ptr == '[')      if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1388      {          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1389      while (*(++ptr) != ']')        ptr++;
1390    
1391        while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1392        {        {
1393        if (*ptr == '\\')        if (*ptr == 0) return -1;
1394          if (*ptr == CHAR_BACKSLASH)
1395          {          {
1396          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1397          if (*ptr == 'Q') for (;;)          if (*ptr == CHAR_Q) for (;;)
1398            {            {
1399            while (*(++ptr) != 0 && *ptr != '\\');            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1400            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1401            if (*(++ptr) == 'E') break;            if (*(++ptr) == CHAR_E) break;
1402            }            }
1403          continue;          continue;
1404          }          }
# Line 921  for (; *ptr != 0; ptr++) Line 1408  for (; *ptr != 0; ptr++)
1408    
1409    /* Skip comments in /x mode */    /* Skip comments in /x mode */
1410    
1411    if (xmode && *ptr == '#')    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1412      {      {
1413      while (*(++ptr) != 0 && *ptr != '\n');      ptr++;
1414      if (*ptr == 0) return -1;      while (*ptr != 0)
1415          {
1416          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1417          ptr++;
1418    #ifdef SUPPORT_UTF8
1419          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1420    #endif
1421          }
1422        if (*ptr == 0) goto FAIL_EXIT;
1423      continue;      continue;
1424      }      }
1425    
1426    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1427    
1428    if (*ptr != '(') continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
   if (ptr[1] != '?')  
1429      {      {
1430      count++;      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1431      if (name == NULL && count == lorn) return count;      if (rc > 0) return rc;
1432      continue;      if (*ptr == 0) goto FAIL_EXIT;
1433      }      }
1434    
1435    ptr += 2;    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1436    if (*ptr == 'P') ptr++;                      /* Allow optional P */      {
1437        if (dup_parens && *count < hwm_count) *count = hwm_count;
1438        goto FAIL_EXIT;
1439        }
1440    
1441    /* We have to disambiguate (?<! and (?<= from (?<name> */    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1442        {
1443        if (*count > hwm_count) hwm_count = *count;
1444        *count = start_count;
1445        }
1446      }
1447    
1448    FAIL_EXIT:
1449    *ptrptr = ptr;
1450    return -1;
1451    }
1452    
   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&  
        *ptr != '\'')  
     continue;  
1453    
   count++;  
1454    
1455    if (name == NULL && count == lorn) return count;  
1456    term = *ptr++;  /*************************************************
1457    if (term == '<') term = '>';  *       Find forward referenced subpattern       *
1458    thisname = ptr;  *************************************************/
1459    while (*ptr != term) ptr++;  
1460    if (name != NULL && lorn == ptr - thisname &&  /* This function scans along a pattern's text looking for capturing
1461        strncmp((const char *)name, (const char *)thisname, lorn) == 0)  subpatterns, and counting them. If it finds a named pattern that matches the
1462      return count;  name it is given, it returns its number. Alternatively, if the name is NULL, it
1463    returns when it reaches a given numbered subpattern. This is used for forward
1464    references to subpatterns. We used to be able to start this scan from the
1465    current compiling point, using the current count value from cd->bracount, and
1466    do it all in a single loop, but the addition of the possibility of duplicate
1467    subpattern numbers means that we have to scan from the very start, in order to
1468    take account of such duplicates, and to use a recursive function to keep track
1469    of the different types of group.
1470    
1471    Arguments:
1472      cd           compile background data
1473      name         name to seek, or NULL if seeking a numbered subpattern
1474      lorn         name length, or subpattern number if name is NULL
1475      xmode        TRUE if we are in /x mode
1476      utf8         TRUE if we are in UTF-8 mode
1477    
1478    Returns:       the number of the found subpattern, or -1 if not found
1479    */
1480    
1481    static int
1482    find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1483      BOOL utf8)
1484    {
1485    pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1486    int count = 0;
1487    int rc;
1488    
1489    /* If the pattern does not start with an opening parenthesis, the first call
1490    to find_parens_sub() will scan right to the end (if necessary). However, if it
1491    does start with a parenthesis, find_parens_sub() will return when it hits the
1492    matching closing parens. That is why we have to have a loop. */
1493    
1494    for (;;)
1495      {
1496      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1497      if (rc > 0 || *ptr++ == 0) break;
1498    }    }
1499    
1500  return -1;  return rc;
1501  }  }
1502    
1503    
1504    
1505    
1506  /*************************************************  /*************************************************
1507  *      Find first significant op code            *  *      Find first significant op code            *
1508  *************************************************/  *************************************************/
1509    
1510  /* This is called by several functions that scan a compiled expression looking  /* This is called by several functions that scan a compiled expression looking
1511  for a fixed first character, or an anchoring op code etc. It skips over things  for a fixed first character, or an anchoring op code etc. It skips over things
1512  that do not influence this. For some calls, a change of option is important.  that do not influence this. For some calls, it makes sense to skip negative
1513  For some calls, it makes sense to skip negative forward and all backward  forward and all backward assertions, and also the \b assertion; for others it
1514  assertions, and also the \b assertion; for others it does not.  does not.
1515    
1516  Arguments:  Arguments:
1517    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1518    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1519    
1520  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1521  */  */
1522    
1523  static const uschar*  static const pcre_uchar*
1524  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const pcre_uchar *code, BOOL skipassert)
   BOOL skipassert)  
1525  {  {
1526  for (;;)  for (;;)
1527    {    {
1528    switch ((int)*code)    switch ((int)*code)
1529      {      {
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
       *options = (int)code[1];  
     code += 2;  
     break;  
   
1530      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1531      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1532      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1533      if (!skipassert) return code;      if (!skipassert) return code;
1534      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1535      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1536      break;      break;
1537    
1538      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1013  for (;;) Line 1542  for (;;)
1542    
1543      case OP_CALLOUT:      case OP_CALLOUT:
1544      case OP_CREF:      case OP_CREF:
1545        case OP_NCREF:
1546      case OP_RREF:      case OP_RREF:
1547        case OP_NRREF:
1548      case OP_DEF:      case OP_DEF:
1549      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1550      break;      break;
1551    
1552      default:      default:
# Line 1029  for (;;) Line 1560  for (;;)
1560    
1561    
1562  /*************************************************  /*************************************************
1563  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1564  *************************************************/  *************************************************/
1565    
1566  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1567  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1568  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1569    temporarily terminated with OP_END when this function is called.
1570    
1571    This function is called when a backward assertion is encountered, so that if it
1572    fails, the error message can point to the correct place in the pattern.
1573    However, we cannot do this when the assertion contains subroutine calls,
1574    because they can be forward references. We solve this by remembering this case
1575    and doing the check at the end; a flag specifies which mode we are running in.
1576    
1577  Arguments:  Arguments:
1578    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1579    options  the compiling options    utf8     TRUE in UTF-8 mode
1580      atend    TRUE if called when the pattern is complete
1581  Returns:   the fixed length, or -1 if there is no fixed length,    cd       the "compile data" structure
1582               or -2 if \C was encountered  
1583    Returns:   the fixed length,
1584                 or -1 if there is no fixed length,
1585                 or -2 if \C was encountered (in UTF-8 mode only)
1586                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1587                 or -4 if an unknown opcode was encountered (internal error)
1588  */  */
1589    
1590  static int  static int
1591  find_fixedlength(uschar *code, int options)  find_fixedlength(pcre_uchar *code, BOOL utf8, BOOL atend, compile_data *cd)
1592  {  {
1593  int length = -1;  int length = -1;
1594    
1595  register int branchlength = 0;  register int branchlength = 0;
1596  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1597    
1598  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1599  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1058  branch, check the length against that of Line 1601  branch, check the length against that of
1601  for (;;)  for (;;)
1602    {    {
1603    int d;    int d;
1604      pcre_uchar *ce, *cs;
1605    register int op = *cc;    register int op = *cc;
   
1606    switch (op)    switch (op)
1607      {      {
1608        /* We only need to continue for OP_CBRA (normal capturing bracket),
1609        OP_BRA (normal non-capturing bracket), OP_ONCE, OP_ONCE_NC and OP_COND,
1610        because the other variants of these opcodes are all concerned with
1611        unlimited repeated groups, which of course are not of fixed length. */
1612    
1613      case OP_CBRA:      case OP_CBRA:
1614      case OP_BRA:      case OP_BRA:
1615      case OP_ONCE:      case OP_ONCE:
1616        case OP_ONCE_NC:
1617      case OP_COND:      case OP_COND:
1618      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf8, atend, cd);
1619      if (d < 0) return d;      if (d < 0) return d;
1620      branchlength += d;      branchlength += d;
1621      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1622      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1623      break;      break;
1624    
1625      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1626      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1627      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1628        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1629        because they all imply an unlimited repeat. */
1630    
1631      case OP_ALT:      case OP_ALT:
1632      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1633      case OP_END:      case OP_END:
1634        case OP_ACCEPT:
1635        case OP_ASSERT_ACCEPT:
1636      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1637        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1638      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1089  for (;;) Line 1640  for (;;)
1640      branchlength = 0;      branchlength = 0;
1641      break;      break;
1642    
1643        /* A true recursion implies not fixed length, but a subroutine call may
1644        be OK. If the subroutine is a forward reference, we can't deal with
1645        it until the end of the pattern, so return -3. */
1646    
1647        case OP_RECURSE:
1648        if (!atend) return -3;
1649        cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1650        do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1651        if (cc > cs && cc < ce) return -1;                    /* Recursion */
1652        d = find_fixedlength(cs + 2, utf8, atend, cd);
1653        if (d < 0) return d;
1654        branchlength += d;
1655        cc += 1 + LINK_SIZE;
1656        break;
1657    
1658      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1659    
1660      case OP_ASSERT:      case OP_ASSERT:
# Line 1100  for (;;) Line 1666  for (;;)
1666    
1667      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1668    
1669      case OP_REVERSE:      case OP_MARK:
1670        case OP_PRUNE_ARG:
1671        case OP_SKIP_ARG:
1672        case OP_THEN_ARG:
1673        cc += cc[1] + PRIV(OP_lengths)[*cc];
1674        break;
1675    
1676        case OP_CALLOUT:
1677        case OP_CIRC:
1678        case OP_CIRCM:
1679        case OP_CLOSE:
1680        case OP_COMMIT:
1681      case OP_CREF:      case OP_CREF:
     case OP_RREF:  
1682      case OP_DEF:      case OP_DEF:
1683      case OP_OPT:      case OP_DOLL:
1684      case OP_CALLOUT:      case OP_DOLLM:
     case OP_SOD:  
     case OP_SOM:  
1685      case OP_EOD:      case OP_EOD:
1686      case OP_EODN:      case OP_EODN:
1687      case OP_CIRC:      case OP_FAIL:
1688      case OP_DOLL:      case OP_NCREF:
1689        case OP_NRREF:
1690      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1691        case OP_PRUNE:
1692        case OP_REVERSE:
1693        case OP_RREF:
1694        case OP_SET_SOM:
1695        case OP_SKIP:
1696        case OP_SOD:
1697        case OP_SOM:
1698        case OP_THEN:
1699      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1700      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1701      break;      break;
1702    
1703      /* Handle literal characters */      /* Handle literal characters */
1704    
1705      case OP_CHAR:      case OP_CHAR:
1706      case OP_CHARNC:      case OP_CHARI:
1707      case OP_NOT:      case OP_NOT:
1708        case OP_NOTI:
1709      branchlength++;      branchlength++;
1710      cc += 2;      cc += 2;
1711  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1712      if ((options & PCRE_UTF8) != 0)      if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
       {  
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1713  #endif  #endif
1714      break;      break;
1715    
# Line 1136  for (;;) Line 1717  for (;;)
1717      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1718    
1719      case OP_EXACT:      case OP_EXACT:
1720        case OP_EXACTI:
1721        case OP_NOTEXACT:
1722        case OP_NOTEXACTI:
1723      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1724      cc += 4;      cc += 2 + IMM2_SIZE;
1725  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1726      if ((options & PCRE_UTF8) != 0)      if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
       {  
       while((*cc & 0x80) == 0x80) cc++;  
       }  
1727  #endif  #endif
1728      break;      break;
1729    
1730      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1731      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1732      cc += 4;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1733        cc += 1 + IMM2_SIZE + 1;
1734      break;      break;
1735    
1736      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1158  for (;;) Line 1740  for (;;)
1740      cc += 2;      cc += 2;
1741      /* Fall through */      /* Fall through */
1742    
1743        case OP_HSPACE:
1744        case OP_VSPACE:
1745        case OP_NOT_HSPACE:
1746        case OP_NOT_VSPACE:
1747      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1748      case OP_DIGIT:      case OP_DIGIT:
1749      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1165  for (;;) Line 1751  for (;;)
1751      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1752      case OP_WORDCHAR:      case OP_WORDCHAR:
1753      case OP_ANY:      case OP_ANY:
1754        case OP_ALLANY:
1755      branchlength++;      branchlength++;
1756      cc++;      cc++;
1757      break;      break;
1758    
1759      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1760        otherwise \C is coded as OP_ALLANY. */
1761    
1762      case OP_ANYBYTE:      case OP_ANYBYTE:
1763      return -2;      return -2;
1764    
1765      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1766    
1767  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1768      case OP_XCLASS:      case OP_XCLASS:
1769      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1770      /* Fall through */      /* Fall through */
1771  #endif  #endif
1772    
1773      case OP_CLASS:      case OP_CLASS:
1774      case OP_NCLASS:      case OP_NCLASS:
1775      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1776    
1777      switch (*cc)      switch (*cc)
1778        {        {
1779          case OP_CRPLUS:
1780          case OP_CRMINPLUS:
1781        case OP_CRSTAR:        case OP_CRSTAR:
1782        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1783        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1196  for (;;) Line 1786  for (;;)
1786    
1787        case OP_CRRANGE:        case OP_CRRANGE:
1788        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1789        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1790        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
1791        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1792        break;        break;
1793    
1794        default:        default:
# Line 1208  for (;;) Line 1798  for (;;)
1798    
1799      /* Anything else is variable length */      /* Anything else is variable length */
1800    
1801      default:      case OP_ANYNL:
1802        case OP_BRAMINZERO:
1803        case OP_BRAPOS:
1804        case OP_BRAPOSZERO:
1805        case OP_BRAZERO:
1806        case OP_CBRAPOS:
1807        case OP_EXTUNI:
1808        case OP_KETRMAX:
1809        case OP_KETRMIN:
1810        case OP_KETRPOS:
1811        case OP_MINPLUS:
1812        case OP_MINPLUSI:
1813        case OP_MINQUERY:
1814        case OP_MINQUERYI:
1815        case OP_MINSTAR:
1816        case OP_MINSTARI:
1817        case OP_MINUPTO:
1818        case OP_MINUPTOI:
1819        case OP_NOTMINPLUS:
1820        case OP_NOTMINPLUSI:
1821        case OP_NOTMINQUERY:
1822        case OP_NOTMINQUERYI:
1823        case OP_NOTMINSTAR:
1824        case OP_NOTMINSTARI:
1825        case OP_NOTMINUPTO:
1826        case OP_NOTMINUPTOI:
1827        case OP_NOTPLUS:
1828        case OP_NOTPLUSI:
1829        case OP_NOTPOSPLUS:
1830        case OP_NOTPOSPLUSI:
1831        case OP_NOTPOSQUERY:
1832        case OP_NOTPOSQUERYI:
1833        case OP_NOTPOSSTAR:
1834        case OP_NOTPOSSTARI:
1835        case OP_NOTPOSUPTO:
1836        case OP_NOTPOSUPTOI:
1837        case OP_NOTQUERY:
1838        case OP_NOTQUERYI:
1839        case OP_NOTSTAR:
1840        case OP_NOTSTARI:
1841        case OP_NOTUPTO:
1842        case OP_NOTUPTOI:
1843        case OP_PLUS:
1844        case OP_PLUSI:
1845        case OP_POSPLUS:
1846        case OP_POSPLUSI:
1847        case OP_POSQUERY:
1848        case OP_POSQUERYI:
1849        case OP_POSSTAR:
1850        case OP_POSSTARI:
1851        case OP_POSUPTO:
1852        case OP_POSUPTOI:
1853        case OP_QUERY:
1854        case OP_QUERYI:
1855        case OP_REF:
1856        case OP_REFI:
1857        case OP_SBRA:
1858        case OP_SBRAPOS:
1859        case OP_SCBRA:
1860        case OP_SCBRAPOS:
1861        case OP_SCOND:
1862        case OP_SKIPZERO:
1863        case OP_STAR:
1864        case OP_STARI:
1865        case OP_TYPEMINPLUS:
1866        case OP_TYPEMINQUERY:
1867        case OP_TYPEMINSTAR:
1868        case OP_TYPEMINUPTO:
1869        case OP_TYPEPLUS:
1870        case OP_TYPEPOSPLUS:
1871        case OP_TYPEPOSQUERY:
1872        case OP_TYPEPOSSTAR:
1873        case OP_TYPEPOSUPTO:
1874        case OP_TYPEQUERY:
1875        case OP_TYPESTAR:
1876        case OP_TYPEUPTO:
1877        case OP_UPTO:
1878        case OP_UPTOI:
1879      return -1;      return -1;
1880    
1881        /* Catch unrecognized opcodes so that when new ones are added they
1882        are not forgotten, as has happened in the past. */
1883    
1884        default:
1885        return -4;
1886      }      }
1887    }    }
1888  /* Control never gets here */  /* Control never gets here */
# Line 1219  for (;;) Line 1892  for (;;)
1892    
1893    
1894  /*************************************************  /*************************************************
1895  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1896  *************************************************/  *************************************************/
1897    
1898  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1899  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1900    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1901    so that it can be called from pcre_study() when finding the minimum matching
1902    length.
1903    
1904  Arguments:  Arguments:
1905    code        points to start of expression    code        points to start of expression
1906    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1907    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1908    
1909  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1910  */  */
1911    
1912  static const uschar *  const pcre_uchar *
1913  find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf8, int number)
1914  {  {
1915  for (;;)  for (;;)
1916    {    {
1917    register int c = *code;    register int c = *code;
1918    
1919    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1920    
1921    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1247  for (;;) Line 1924  for (;;)
1924    
1925    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1926    
1927      /* Handle lookbehind (OP_REVERSE), which is sought when number is negative */
1928    
1929      else if (c == OP_REVERSE)
1930        {
1931        if (number < 0) return (pcre_uchar *)code;
1932        code += PRIV(OP_lengths)[c];
1933        }
1934    
1935    /* Handle capturing bracket */    /* Handle capturing bracket */
1936    
1937    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
1938               c == OP_CBRAPOS || c == OP_SCBRAPOS)
1939      {      {
1940      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
1941      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
1942      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
1943      }      }
1944    
1945      /* Otherwise, we can get the item's length from the table, except that for
1946      repeated character types, we have to test for \p and \P, which have an extra
1947      two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1948      must add in its length. */
1949    
1950      else
1951        {
1952        switch(c)
1953          {
1954          case OP_TYPESTAR:
1955          case OP_TYPEMINSTAR:
1956          case OP_TYPEPLUS:
1957          case OP_TYPEMINPLUS:
1958          case OP_TYPEQUERY:
1959          case OP_TYPEMINQUERY:
1960          case OP_TYPEPOSSTAR:
1961          case OP_TYPEPOSPLUS:
1962          case OP_TYPEPOSQUERY:
1963          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1964          break;
1965    
1966          case OP_TYPEUPTO:
1967          case OP_TYPEMINUPTO:
1968          case OP_TYPEEXACT:
1969          case OP_TYPEPOSUPTO:
1970          if (code[1 + IMM2_SIZE] == OP_PROP
1971            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
1972          break;
1973    
1974          case OP_MARK:
1975          case OP_PRUNE_ARG:
1976          case OP_SKIP_ARG:
1977          code += code[1];
1978          break;
1979    
1980          case OP_THEN_ARG:
1981          code += code[1];
1982          break;
1983          }
1984    
1985        /* Add in the fixed length from the table */
1986    
1987        code += PRIV(OP_lengths)[c];
1988    
1989    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1990    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
1991    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
1992    
   else  
     {  
     code += _pcre_OP_lengths[c];  
1993  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1994      if (utf8) switch(c)      if (utf8) switch(c)
1995        {        {
1996        case OP_CHAR:        case OP_CHAR:
1997        case OP_CHARNC:        case OP_CHARI:
1998        case OP_EXACT:        case OP_EXACT:
1999          case OP_EXACTI:
2000        case OP_UPTO:        case OP_UPTO:
2001          case OP_UPTOI:
2002        case OP_MINUPTO:        case OP_MINUPTO:
2003          case OP_MINUPTOI:
2004        case OP_POSUPTO:        case OP_POSUPTO:
2005          case OP_POSUPTOI:
2006        case OP_STAR:        case OP_STAR:
2007          case OP_STARI:
2008        case OP_MINSTAR:        case OP_MINSTAR:
2009          case OP_MINSTARI:
2010        case OP_POSSTAR:        case OP_POSSTAR:
2011          case OP_POSSTARI:
2012        case OP_PLUS:        case OP_PLUS:
2013          case OP_PLUSI:
2014        case OP_MINPLUS:        case OP_MINPLUS:
2015          case OP_MINPLUSI:
2016        case OP_POSPLUS:        case OP_POSPLUS:
2017          case OP_POSPLUSI:
2018        case OP_QUERY:        case OP_QUERY:
2019          case OP_QUERYI:
2020        case OP_MINQUERY:        case OP_MINQUERY:
2021          case OP_MINQUERYI:
2022        case OP_POSQUERY:        case OP_POSQUERY:
2023        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2024          if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f];
2025        break;        break;
2026        }        }
2027    #else
2028        (void)(utf8);  /* Keep compiler happy by referencing function argument */
2029  #endif  #endif
2030      }      }
2031    }    }
# Line 1305  Arguments: Line 2047  Arguments:
2047  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2048  */  */
2049    
2050  static const uschar *  static const pcre_uchar *
2051  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf8)
2052  {  {
2053  for (;;)  for (;;)
2054    {    {
# Line 1320  for (;;) Line 2062  for (;;)
2062    
2063    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
2064    
2065    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* Otherwise, we can get the item's length from the table, except that for
2066    that are followed by a character may be followed by a multi-byte character.    repeated character types, we have to test for \p and \P, which have an extra
2067    The length in the table is a minimum, so we have to arrange to skip the extra    two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2068    bytes. */    must add in its length. */
2069    
2070    else    else
2071      {      {
2072      code += _pcre_OP_lengths[c];      switch(c)
2073          {
2074          case OP_TYPESTAR:
2075          case OP_TYPEMINSTAR:
2076          case OP_TYPEPLUS:
2077          case OP_TYPEMINPLUS:
2078          case OP_TYPEQUERY:
2079          case OP_TYPEMINQUERY:
2080          case OP_TYPEPOSSTAR:
2081          case OP_TYPEPOSPLUS:
2082          case OP_TYPEPOSQUERY:
2083          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2084          break;
2085    
2086          case OP_TYPEPOSUPTO:
2087          case OP_TYPEUPTO:
2088          case OP_TYPEMINUPTO:
2089          case OP_TYPEEXACT:
2090          if (code[1 + IMM2_SIZE] == OP_PROP
2091            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2092          break;
2093    
2094          case OP_MARK:
2095          case OP_PRUNE_ARG:
2096          case OP_SKIP_ARG:
2097          code += code[1];
2098          break;
2099    
2100          case OP_THEN_ARG:
2101          code += code[1];
2102          break;
2103          }
2104    
2105        /* Add in the fixed length from the table */
2106    
2107        code += PRIV(OP_lengths)[c];
2108    
2109        /* In UTF-8 mode, opcodes that are followed by a character may be followed
2110        by a multi-byte character. The length in the table is a minimum, so we have
2111        to arrange to skip the extra bytes. */
2112    
2113  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2114      if (utf8) switch(c)      if (utf8) switch(c)
2115        {        {
2116        case OP_CHAR:        case OP_CHAR:
2117        case OP_CHARNC:        case OP_CHARI:
2118        case OP_EXACT:        case OP_EXACT:
2119          case OP_EXACTI:
2120        case OP_UPTO:        case OP_UPTO:
2121          case OP_UPTOI:
2122        case OP_MINUPTO:        case OP_MINUPTO:
2123          case OP_MINUPTOI:
2124        case OP_POSUPTO:        case OP_POSUPTO:
2125          case OP_POSUPTOI:
2126        case OP_STAR:        case OP_STAR:
2127          case OP_STARI:
2128        case OP_MINSTAR:        case OP_MINSTAR:
2129          case OP_MINSTARI:
2130        case OP_POSSTAR:        case OP_POSSTAR:
2131          case OP_POSSTARI:
2132        case OP_PLUS:        case OP_PLUS:
2133          case OP_PLUSI:
2134        case OP_MINPLUS:        case OP_MINPLUS:
2135          case OP_MINPLUSI:
2136        case OP_POSPLUS:        case OP_POSPLUS:
2137          case OP_POSPLUSI:
2138        case OP_QUERY:        case OP_QUERY:
2139          case OP_QUERYI:
2140        case OP_MINQUERY:        case OP_MINQUERY:
2141          case OP_MINQUERYI:
2142        case OP_POSQUERY:        case OP_POSQUERY:
2143        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2144          if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f];
2145        break;        break;
2146        }        }
2147    #else
2148        (void)(utf8);  /* Keep compiler happy by referencing function argument */
2149  #endif  #endif
2150      }      }
2151    }    }
# Line 1364  for (;;) Line 2161  for (;;)
2161  can match the empty string or not. It is called from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
2162  below and from compile_branch() when checking for an unlimited repeat of a  below and from compile_branch() when checking for an unlimited repeat of a
2163  group that can match nothing. Note that first_significant_code() skips over  group that can match nothing. Note that first_significant_code() skips over
2164  assertions. If we hit an unclosed bracket, we return "empty" - this means we've  backward and negative forward assertions when its final argument is TRUE. If we
2165  struck an inner bracket whose current branch will already have been scanned.  hit an unclosed bracket, we return "empty" - this means we've struck an inner
2166    bracket whose current branch will already have been scanned.
2167    
2168  Arguments:  Arguments:
2169    code        points to start of search    code        points to start of search
2170    endcode     points to where to stop    endcode     points to where to stop
2171    utf8        TRUE if in UTF8 mode    utf8        TRUE if in UTF8 mode
2172      cd          contains pointers to tables etc.
2173    
2174  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2175  */  */
2176    
2177  static BOOL  static BOOL
2178  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2179      BOOL utf8, compile_data *cd)
2180  {  {
2181  register int c;  register int c;
2182  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2183       code < endcode;       code < endcode;
2184       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2185    {    {
2186    const uschar *ccode;    const pcre_uchar *ccode;
2187    
2188    c = *code;    c = *code;
2189    
2190    /* Groups with zero repeats can of course be empty; skip them. */    /* Skip over forward assertions; the other assertions are skipped by
2191      first_significant_code() with a TRUE final argument. */
2192    
2193    if (c == OP_BRAZERO || c == OP_BRAMINZERO)    if (c == OP_ASSERT)
2194      {      {
     code += _pcre_OP_lengths[c];  
2195      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
2196      c = *code;      c = *code;
2197      continue;      continue;
2198      }      }
2199    
2200    /* For other groups, scan the branches. */    /* For a recursion/subroutine call, if its end has been reached, which
2201      implies a backward reference subroutine call, we can scan it. If it's a
2202      forward reference subroutine call, we can't. To detect forward reference
2203      we have to scan up the list that is kept in the workspace. This function is
2204      called only when doing the real compile, not during the pre-compile that
2205      measures the size of the compiled pattern. */
2206    
2207    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)    if (c == OP_RECURSE)
2208      {      {
2209        const pcre_uchar *scode;
2210      BOOL empty_branch;      BOOL empty_branch;
     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */  
2211    
2212      /* Scan a closed bracket */      /* Test for forward reference */
2213    
2214        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2215          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2216    
2217        /* Not a forward reference, test for completed backward reference */
2218    
2219      empty_branch = FALSE;      empty_branch = FALSE;
2220        scode = cd->start_code + GET(code, 1);
2221        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2222    
2223        /* Completed backwards reference */
2224    
2225      do      do
2226        {        {
2227        if (!empty_branch && could_be_empty_branch(code, endcode, utf8))        if (could_be_empty_branch(scode, endcode, utf8, cd))
2228            {
2229          empty_branch = TRUE;          empty_branch = TRUE;
2230            break;
2231            }
2232          scode += GET(scode, 1);
2233          }
2234        while (*scode == OP_ALT);
2235    
2236        if (!empty_branch) return FALSE;  /* All branches are non-empty */
2237        continue;
2238        }
2239    
2240      /* Groups with zero repeats can of course be empty; skip them. */
2241    
2242      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2243          c == OP_BRAPOSZERO)
2244        {
2245        code += PRIV(OP_lengths)[c];
2246        do code += GET(code, 1); while (*code == OP_ALT);
2247        c = *code;
2248        continue;
2249        }
2250    
2251      /* A nested group that is already marked as "could be empty" can just be
2252      skipped. */
2253    
2254      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2255          c == OP_SCBRA || c == OP_SCBRAPOS)
2256        {
2257        do code += GET(code, 1); while (*code == OP_ALT);
2258        c = *code;
2259        continue;
2260        }
2261    
2262      /* For other groups, scan the branches. */
2263    
2264      if (c == OP_BRA  || c == OP_BRAPOS ||
2265          c == OP_CBRA || c == OP_CBRAPOS ||
2266          c == OP_ONCE || c == OP_ONCE_NC ||
2267          c == OP_COND)
2268        {
2269        BOOL empty_branch;
2270        if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2271    
2272        /* If a conditional group has only one branch, there is a second, implied,
2273        empty branch, so just skip over the conditional, because it could be empty.
2274        Otherwise, scan the individual branches of the group. */
2275    
2276        if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2277        code += GET(code, 1);        code += GET(code, 1);
2278        else
2279          {
2280          empty_branch = FALSE;
2281          do
2282            {
2283            if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2284              empty_branch = TRUE;
2285            code += GET(code, 1);
2286            }
2287          while (*code == OP_ALT);
2288          if (!empty_branch) return FALSE;   /* All branches are non-empty */
2289        }        }
2290      while (*code == OP_ALT);  
     if (!empty_branch) return FALSE;   /* All branches are non-empty */  
2291      c = *code;      c = *code;
2292      continue;      continue;
2293      }      }
# Line 1423  for (code = first_significant_code(code Line 2296  for (code = first_significant_code(code
2296    
2297    switch (c)    switch (c)
2298      {      {
2299      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
2300        cannot be represented just by a bit map. This includes negated single
2301        high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2302        actual length is stored in the compiled code, so we must update "code"
2303        here. */
2304    
2305  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2306      case OP_XCLASS:      case OP_XCLASS:
2307      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
2308      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
2309  #endif  #endif
2310    
2311      case OP_CLASS:      case OP_CLASS:
2312      case OP_NCLASS:      case OP_NCLASS:
2313      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2314    
2315  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2316      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
# Line 1471  for (code = first_significant_code(code Line 2348  for (code = first_significant_code(code
2348      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2349      case OP_WORDCHAR:      case OP_WORDCHAR:
2350      case OP_ANY:      case OP_ANY:
2351        case OP_ALLANY:
2352      case OP_ANYBYTE:      case OP_ANYBYTE:
2353      case OP_CHAR:      case OP_CHAR:
2354      case OP_CHARNC:      case OP_CHARI:
2355      case OP_NOT:      case OP_NOT:
2356        case OP_NOTI:
2357      case OP_PLUS:      case OP_PLUS:
2358      case OP_MINPLUS:      case OP_MINPLUS:
2359      case OP_POSPLUS:      case OP_POSPLUS:
# Line 1489  for (code = first_significant_code(code Line 2368  for (code = first_significant_code(code
2368      case OP_TYPEEXACT:      case OP_TYPEEXACT:
2369      return FALSE;      return FALSE;
2370    
2371        /* These are going to continue, as they may be empty, but we have to
2372        fudge the length for the \p and \P cases. */
2373    
2374        case OP_TYPESTAR:
2375        case OP_TYPEMINSTAR:
2376        case OP_TYPEPOSSTAR:
2377        case OP_TYPEQUERY:
2378        case OP_TYPEMINQUERY:
2379        case OP_TYPEPOSQUERY:
2380        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2381        break;
2382    
2383        /* Same for these */
2384    
2385        case OP_TYPEUPTO:
2386        case OP_TYPEMINUPTO:
2387        case OP_TYPEPOSUPTO:
2388        if (code[1 + IMM2_SIZE] == OP_PROP
2389          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2390        break;
2391    
2392      /* End of branch */      /* End of branch */
2393    
2394      case OP_KET:      case OP_KET:
2395      case OP_KETRMAX:      case OP_KETRMAX:
2396      case OP_KETRMIN:      case OP_KETRMIN:
2397        case OP_KETRPOS:
2398      case OP_ALT:      case OP_ALT:
2399      return TRUE;      return TRUE;
2400    
# Line 1502  for (code = first_significant_code(code Line 2403  for (code = first_significant_code(code
2403    
2404  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2405      case OP_STAR:      case OP_STAR:
2406        case OP_STARI:
2407      case OP_MINSTAR:      case OP_MINSTAR:
2408        case OP_MINSTARI:
2409      case OP_POSSTAR:      case OP_POSSTAR:
2410        case OP_POSSTARI:
2411      case OP_QUERY:      case OP_QUERY:
2412        case OP_QUERYI:
2413      case OP_MINQUERY:      case OP_MINQUERY:
2414        case OP_MINQUERYI:
2415      case OP_POSQUERY:      case OP_POSQUERY:
2416        case OP_POSQUERYI:
2417        if (utf8 && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
2418        break;
2419    
2420      case OP_UPTO:      case OP_UPTO:
2421        case OP_UPTOI:
2422      case OP_MINUPTO:      case OP_MINUPTO:
2423        case OP_MINUPTOI:
2424      case OP_POSUPTO:      case OP_POSUPTO:
2425      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      case OP_POSUPTOI:
2426        if (utf8 && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
2427      break;      break;
2428  #endif  #endif
2429    
2430        /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2431        string. */
2432    
2433        case OP_MARK:
2434        case OP_PRUNE_ARG:
2435        case OP_SKIP_ARG:
2436        code += code[1];
2437        break;
2438    
2439        case OP_THEN_ARG:
2440        code += code[1];
2441        break;
2442    
2443        /* None of the remaining opcodes are required to match a character. */
2444    
2445        default:
2446        break;
2447      }      }
2448    }    }
2449    
# Line 1529  return TRUE; Line 2460  return TRUE;
2460  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2461  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2462  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2463    This function is called only during the real compile, not during the
2464    pre-compile.
2465    
2466  Arguments:  Arguments:
2467    code        points to start of the recursion    code        points to start of the recursion
2468    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2469    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2470    utf8        TRUE if in UTF-8 mode    utf8        TRUE if in UTF-8 mode
2471      cd          pointers to tables etc
2472    
2473  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2474  */  */
2475    
2476  static BOOL  static BOOL
2477  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2478    BOOL utf8)    branch_chain *bcptr, BOOL utf8, compile_data *cd)
2479  {  {
2480  while (bcptr != NULL && bcptr->current >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2481    {    {
2482    if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2483        return FALSE;
2484    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2485    }    }
2486  return TRUE;  return TRUE;
# Line 1558  return TRUE; Line 2493  return TRUE;
2493  *************************************************/  *************************************************/
2494    
2495  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
2496  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
2497  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2498  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
2499    
2500    Originally, this function only recognized a sequence of letters between the
2501    terminators, but it seems that Perl recognizes any sequence of characters,
2502    though of course unknown POSIX names are subsequently rejected. Perl gives an
2503    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2504    didn't consider this to be a POSIX class. Likewise for [:1234:].
2505    
2506    The problem in trying to be exactly like Perl is in the handling of escapes. We
2507    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2508    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2509    below handles the special case of \], but does not try to do any other escape
2510    processing. This makes it different from Perl for cases such as [:l\ower:]
2511    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2512    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2513    I think.
2514    
2515    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2516    It seems that the appearance of a nested POSIX class supersedes an apparent
2517    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2518    a digit.
2519    
2520    In Perl, unescaped square brackets may also appear as part of class names. For
2521    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2522    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2523    seem right at all. PCRE does not allow closing square brackets in POSIX class
2524    names.
2525    
2526  Argument:  Arguments:
2527    ptr      pointer to the initial [    ptr      pointer to the initial [
2528    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
2529    
2530  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
2531  */  */
2532    
2533  static BOOL  static BOOL
2534  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2535  {  {
2536  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2537  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2538  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
2539    {    {
2540    *endptr = ptr;    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2541    return TRUE;      ptr++;
2542      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2543      else
2544        {
2545        if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2546          {
2547          *endptr = ptr;
2548          return TRUE;
2549          }
2550        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2551             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2552              ptr[1] == CHAR_EQUALS_SIGN) &&
2553            check_posix_syntax(ptr, endptr))
2554          return FALSE;
2555        }
2556    }    }
2557  return FALSE;  return FALSE;
2558  }  }
# Line 1603  Returns:     a value representing the na Line 2575  Returns:     a value representing the na
2575  */  */
2576    
2577  static int  static int
2578  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2579  {  {
2580    const char *pn = posix_names;
2581  register int yield = 0;  register int yield = 0;
2582  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2583    {    {
2584    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2585      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2586      pn += posix_name_lengths[yield] + 1;
2587    yield++;    yield++;
2588    }    }
2589  return -1;  return -1;
# Line 1624  return -1; Line 2598  return -1;
2598  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
2599  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
2600  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
2601  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2602  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
2603  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
2604  offsets adjusted. That is one of the jobs of this function. Before it is called,  have their offsets adjusted. That is one of the jobs of this function. Before it
2605  the partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
2606    OP_END.
2607    
2608  This function has been extended with the possibility of forward references for  This function has been extended with the possibility of forward references for
2609  recursions and subroutine calls. It must also check the list of such references  recursions and subroutine calls. It must also check the list of such references
# Line 1647  Returns:     nothing Line 2622  Returns:     nothing
2622  */  */
2623    
2624  static void  static void
2625  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf8, compile_data *cd,
2626    uschar *save_hwm)    pcre_uchar *save_hwm)
2627  {  {
2628  uschar *ptr = group;  pcre_uchar *ptr = group;
2629  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  
2630    while ((ptr = (pcre_uchar *)find_recurse(ptr, utf8)) != NULL)
2631    {    {
2632    int offset;    int offset;
2633    uschar *hc;    pcre_uchar *hc;
2634    
2635    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2636    reference. */    reference. */
# Line 1699  Arguments: Line 2675  Arguments:
2675  Returns:         new code pointer  Returns:         new code pointer
2676  */  */
2677    
2678  static uschar *  static pcre_uchar *
2679  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2680  {  {
2681  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2682  *code++ = 255;  *code++ = 255;
2683  PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2684  PUT(code, LINK_SIZE, 0);                /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2685  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2686  }  }
2687    
2688    
# Line 1728  Returns:             nothing Line 2704  Returns:             nothing
2704  */  */
2705    
2706  static void  static void
2707  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2708  {  {
2709  int length = ptr - cd->start_pattern - GET(previous_callout, 2);  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2710  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
2711  }  }
2712    
# Line 1762  get_othercase_range(unsigned int *cptr, Line 2738  get_othercase_range(unsigned int *cptr,
2738  unsigned int c, othercase, next;  unsigned int c, othercase, next;
2739    
2740  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2741    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2742    
2743  if (c > d) return FALSE;  if (c > d) return FALSE;
2744    
# Line 1771  next = othercase + 1; Line 2747  next = othercase + 1;
2747    
2748  for (++c; c <= d; c++)  for (++c; c <= d; c++)
2749    {    {
2750    if (_pcre_ucp_othercase(c) != next) break;    if (UCD_OTHERCASE(c) != next) break;
2751    next++;    next++;
2752      }
2753    
2754    *odptr = next - 1;
2755    *cptr = c;
2756    
2757    return TRUE;
2758    }
2759    
2760    
2761    
2762    /*************************************************
2763    *        Check a character and a property        *
2764    *************************************************/
2765    
2766    /* This function is called by check_auto_possessive() when a property item
2767    is adjacent to a fixed character.
2768    
2769    Arguments:
2770      c            the character
2771      ptype        the property type
2772      pdata        the data for the type
2773      negated      TRUE if it's a negated property (\P or \p{^)
2774    
2775    Returns:       TRUE if auto-possessifying is OK
2776    */
2777    
2778    static BOOL
2779    check_char_prop(int c, int ptype, int pdata, BOOL negated)
2780    {
2781    const ucd_record *prop = GET_UCD(c);
2782    switch(ptype)
2783      {
2784      case PT_LAMP:
2785      return (prop->chartype == ucp_Lu ||
2786              prop->chartype == ucp_Ll ||
2787              prop->chartype == ucp_Lt) == negated;
2788    
2789      case PT_GC:
2790      return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2791    
2792      case PT_PC:
2793      return (pdata == prop->chartype) == negated;
2794    
2795      case PT_SC:
2796      return (pdata == prop->script) == negated;
2797    
2798      /* These are specials */
2799    
2800      case PT_ALNUM:
2801      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2802              PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2803    
2804      case PT_SPACE:    /* Perl space */
2805      return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2806              c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2807              == negated;
2808    
2809      case PT_PXSPACE:  /* POSIX space */
2810      return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2811              c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2812              c == CHAR_FF || c == CHAR_CR)
2813              == negated;
2814    
2815      case PT_WORD:
2816      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2817              PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2818              c == CHAR_UNDERSCORE) == negated;
2819    }    }
2820    return FALSE;
 *odptr = next - 1;  
 *cptr = c;  
   
 return TRUE;  
2821  }  }
2822  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2823    
# Line 1793  whether the next thing could possibly ma Line 2832  whether the next thing could possibly ma
2832  sense to automatically possessify the repeated item.  sense to automatically possessify the repeated item.
2833    
2834  Arguments:  Arguments:
2835    op_code       the repeated op code    previous      pointer to the repeated opcode
   this          data for this item, depends on the opcode  
2836    utf8          TRUE in UTF-8 mode    utf8          TRUE in UTF-8 mode
   utf8_char     used for utf8 character bytes, NULL if not relevant  
2837    ptr           next character in pattern    ptr           next character in pattern
2838    options       options bits    options       options bits
2839    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 1805  Returns:        TRUE if possessifying is Line 2842  Returns:        TRUE if possessifying is
2842  */  */
2843    
2844  static BOOL  static BOOL
2845  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,  check_auto_possessive(const pcre_uchar *previous, BOOL utf8,
2846    const uschar *ptr, int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2847  {  {
2848  int next;  int c, next;
2849    int op_code = *previous++;
2850    
2851  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
2852    
# Line 1817  if ((options & PCRE_EXTENDED) != 0) Line 2855  if ((options & PCRE_EXTENDED) != 0)
2855    for (;;)    for (;;)
2856      {      {
2857      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2858      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
2859        {        {
2860        while (*(++ptr) != 0)        ptr++;
2861          while (*ptr != 0)
2862            {
2863          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2864            ptr++;
2865    #ifdef SUPPORT_UTF8
2866            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2867    #endif
2868            }
2869        }        }
2870      else break;      else break;
2871      }      }
# Line 1829  if ((options & PCRE_EXTENDED) != 0) Line 2874  if ((options & PCRE_EXTENDED) != 0)
2874  /* If the next item is one that we can handle, get its value. A non-negative  /* If the next item is one that we can handle, get its value. A non-negative
2875  value is a character, a negative value is an escape value. */  value is a character, a negative value is an escape value. */
2876    
2877  if (*ptr == '\\')  if (*ptr == CHAR_BACKSLASH)
2878    {    {
2879    int temperrorcode = 0;    int temperrorcode = 0;
2880    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
# Line 1854  if ((options & PCRE_EXTENDED) != 0) Line 2899  if ((options & PCRE_EXTENDED) != 0)
2899    for (;;)    for (;;)
2900      {      {
2901      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2902      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
2903        {        {
2904        while (*(++ptr) != 0)        ptr++;
2905          while (*ptr != 0)
2906            {
2907          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2908            ptr++;
2909    #ifdef SUPPORT_UTF8
2910            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2911    #endif
2912            }
2913        }        }
2914      else break;      else break;
2915      }      }
# Line 1865  if ((options & PCRE_EXTENDED) != 0) Line 2917  if ((options & PCRE_EXTENDED) != 0)
2917    
2918  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
2919    
2920  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2921    return FALSE;    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2922        return FALSE;
 /* Now compare the next item with the previous opcode. If the previous is a  
 positive single character match, "item" either contains the character or, if  
 "item" is greater than 127 in utf8 mode, the character's bytes are in  
 utf8_char. */  
   
2923    
2924  /* Handle cases when the next item is a character. */  /* Now compare the next item with the previous opcode. First, handle cases when
2925    the next item is a character. */
2926    
2927  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
2928    {    {
2929    case OP_CHAR:    case OP_CHAR:
2930  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2931    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
2932    #else
2933      c = *previous;
2934  #endif  #endif
2935    return item != next;    return c != next;
2936    
2937    /* For CHARNC (caseless character) we must check the other case. If we have    /* For CHARI (caseless character) we must check the other case. If we have
2938    Unicode property support, we can use it to test the other case of    Unicode property support, we can use it to test the other case of
2939    high-valued characters. */    high-valued characters. */
2940    
2941    case OP_CHARNC:    case OP_CHARI:
2942  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2943    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
2944    #else
2945      c = *previous;
2946  #endif  #endif
2947    if (item == next) return FALSE;    if (c == next) return FALSE;
2948  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2949    if (utf8)    if (utf8)
2950      {      {
2951      unsigned int othercase;      unsigned int othercase;
2952      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
2953  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2954      othercase = _pcre_ucp_othercase((unsigned int)next);      othercase = UCD_OTHERCASE((unsigned int)next);
2955  #else  #else
2956      othercase = NOTACHAR;      othercase = NOTACHAR;
2957  #endif  #endif
2958      return (unsigned int)item != othercase;      return (unsigned int)c != othercase;
2959      }      }
2960    else    else
2961  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
2962    return (item != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
2963    
2964    /* For OP_NOT, "item" must be a single-byte character. */    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2965      opcodes are not used for multi-byte characters, because they are coded using
2966      an XCLASS instead. */
2967    
2968    case OP_NOT:    case OP_NOT:
2969    if (next < 0) return FALSE;  /* Not a character */    return (c = *previous) == next;
2970    if (item == next) return TRUE;  
2971    if ((options & PCRE_CASELESS) == 0) return FALSE;    case OP_NOTI:
2972      if ((c = *previous) == next) return TRUE;
2973  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2974    if (utf8)    if (utf8)
2975      {      {
2976      unsigned int othercase;      unsigned int othercase;
2977      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
2978  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2979      othercase = _pcre_ucp_othercase(next);      othercase = UCD_OTHERCASE(next);
2980  #else  #else
2981      othercase = NOTACHAR;      othercase = NOTACHAR;
2982  #endif  #endif
2983      return (unsigned int)item == othercase;      return (unsigned int)c == othercase;
2984      }      }
2985    else    else
2986  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
2987    return (item == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
2988    
2989      /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2990      When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2991    
2992    case OP_DIGIT:    case OP_DIGIT:
2993    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
# Line 1972  if (next >= 0) switch(op_code) Line 3030  if (next >= 0) switch(op_code)
3030      case 0x202f:      case 0x202f:
3031      case 0x205f:      case 0x205f:
3032      case 0x3000:      case 0x3000:
3033      return op_code != OP_HSPACE;      return op_code == OP_NOT_HSPACE;
3034      default:      default:
3035      return op_code == OP_HSPACE;      return op_code != OP_NOT_HSPACE;
3036      }      }
3037    
3038      case OP_ANYNL:
3039    case OP_VSPACE:    case OP_VSPACE:
3040    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3041    switch(next)    switch(next)
# Line 1988  if (next >= 0) switch(op_code) Line 3047  if (next >= 0) switch(op_code)
3047      case 0x85:      case 0x85:
3048      case 0x2028:      case 0x2028:
3049      case 0x2029:      case 0x2029:
3050      return op_code != OP_VSPACE;      return op_code == OP_NOT_VSPACE;
3051      default:      default:
3052      return op_code == OP_VSPACE;      return op_code != OP_NOT_VSPACE;
3053      }      }
3054    
3055    #ifdef SUPPORT_UCP
3056      case OP_PROP:
3057      return check_char_prop(next, previous[0], previous[1], FALSE);
3058    
3059      case OP_NOTPROP:
3060      return check_char_prop(next, previous[0], previous[1], TRUE);
3061    #endif
3062    
3063    default:    default:
3064    return FALSE;    return FALSE;
3065    }    }
3066    
3067    
3068  /* Handle the case when the next item is \d, \s, etc. */  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3069    is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3070    generated only when PCRE_UCP is *not* set, that is, when only ASCII
3071    characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3072    replaced by OP_PROP codes when PCRE_UCP is set. */
3073    
3074  switch(op_code)  switch(op_code)
3075    {    {
3076    case OP_CHAR:    case OP_CHAR:
3077    case OP_CHARNC:    case OP_CHARI:
3078  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3079    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
3080    #else
3081      c = *previous;
3082  #endif  #endif
3083    switch(-next)    switch(-next)
3084      {      {
3085      case ESC_d:      case ESC_d:
3086      return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;      return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3087    
3088      case ESC_D:      case ESC_D:
3089      return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3090    
3091      case ESC_s:      case ESC_s:
3092      return item > 127 || (cd->ctypes[item] & ctype_space) == 0;      return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3093    
3094      case ESC_S:      case ESC_S:
3095      return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3096    
3097      case ESC_w:      case ESC_w:
3098      return item > 127 || (cd->ctypes[item] & ctype_word) == 0;      return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3099    
3100      case ESC_W:      case ESC_W:
3101      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3102    
3103      case ESC_h:      case ESC_h:
3104      case ESC_H:      case ESC_H:
3105      switch(item)      switch(c)
3106        {        {
3107        case 0x09:        case 0x09:
3108        case 0x20:        case 0x20:
# Line 2053  switch(op_code) Line 3126  switch(op_code)
3126        return -next != ESC_h;        return -next != ESC_h;
3127        default:        default:
3128        return -next == ESC_h;        return -next == ESC_h;
3129        }        }
3130    
3131      case ESC_v:      case ESC_v:
3132      case ESC_V:      case ESC_V:
3133      switch(item)      switch(c)
3134        {        {
3135        case 0x0a:        case 0x0a:
3136        case 0x0b:        case 0x0b:
# Line 2069  switch(op_code) Line 3142  switch(op_code)
3142        return -next != ESC_v;        return -next != ESC_v;
3143        default:        default:
3144        return -next == ESC_v;        return -next == ESC_v;
3145        }        }
3146    
3147        /* When PCRE_UCP is set, these values get generated for \d etc. Find
3148        their substitutions and process them. The result will always be either
3149        -ESC_p or -ESC_P. Then fall through to process those values. */
3150    
3151    #ifdef SUPPORT_UCP
3152        case ESC_du:
3153        case ESC_DU:
3154        case ESC_wu:
3155        case ESC_WU:
3156        case ESC_su:
3157        case ESC_SU:
3158          {
3159          int temperrorcode = 0;
3160          ptr = substitutes[-next - ESC_DU];
3161          next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3162          if (temperrorcode != 0) return FALSE;
3163          ptr++;    /* For compatibility */
3164          }
3165        /* Fall through */
3166    
3167        case ESC_p:
3168        case ESC_P:
3169          {
3170          int ptype, pdata, errorcodeptr;
3171          BOOL negated;
3172    
3173          ptr--;      /* Make ptr point at the p or P */
3174          ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3175          if (ptype < 0) return FALSE;
3176          ptr++;      /* Point past the final curly ket */
3177    
3178          /* If the property item is optional, we have to give up. (When generated
3179          from \d etc by PCRE_UCP, this test will have been applied much earlier,
3180          to the original \d etc. At this point, ptr will point to a zero byte.) */
3181    
3182          if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3183            STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3184              return FALSE;
3185    
3186          /* Do the property check. */
3187    
3188          return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3189          }
3190    #endif
3191    
3192      default:      default:
3193      return FALSE;      return FALSE;
3194      }      }
3195    
3196      /* In principle, support for Unicode properties should be integrated here as
3197      well. It means re-organizing the above code so as to get hold of the property
3198      values before switching on the op-code. However, I wonder how many patterns
3199      combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3200      these op-codes are never generated.) */
3201    
3202    case OP_DIGIT:    case OP_DIGIT:
3203    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3204           next == -ESC_h || next == -ESC_v;           next == -ESC_h || next == -ESC_v || next == -ESC_R;
3205    
3206    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3207    return next == -ESC_d;    return next == -ESC_d;
3208    
3209    case OP_WHITESPACE:    case OP_WHITESPACE:
3210    return next == -ESC_S || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3211    
3212    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3213    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3214    
3215    case OP_HSPACE:    case OP_HSPACE:
3216    return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3217             next == -ESC_w || next == -ESC_v || next == -ESC_R;
3218    
3219    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
3220    return next == -ESC_h;    return next == -ESC_h;
3221    
3222    /* Can't have \S in here because VT matches \S (Perl anomaly) */    /* Can't have \S in here because VT matches \S (Perl anomaly) */
3223    case OP_VSPACE:    case OP_ANYNL:
3224      case OP_VSPACE:
3225    return next == -ESC_V || next == -ESC_d || next == -ESC_w;    return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3226    
3227    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3228    return next == -ESC_v;    return next == -ESC_v || next == -ESC_R;
3229    
3230    case OP_WORDCHAR:    case OP_WORDCHAR:
3231    return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3232             next == -ESC_v || next == -ESC_R;
3233    
3234    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3235    return next == -ESC_w || next == -ESC_d;    return next == -ESC_w || next == -ESC_d;
3236    
3237    default:    default:
3238    return FALSE;    return FALSE;
3239    }    }
# Line 2134  Arguments: Line 3261  Arguments:
3261    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3262    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
3263    bcptr          points to current branch chain    bcptr          points to current branch chain
3264      cond_depth     conditional nesting depth
3265    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3266    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3267                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 2143  Returns:         TRUE on success Line 3271  Returns:         TRUE on success
3271  */  */
3272    
3273  static BOOL  static BOOL
3274  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3275    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, int *firstbyteptr,
3276    compile_data *cd, int *lengthptr)    int *reqbyteptr, branch_chain *bcptr, int cond_depth, compile_data *cd,
3277      int *lengthptr)
3278  {  {
3279  int repeat_type, op_type;  int repeat_type, op_type;
3280  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 2154  int greedy_default, greedy_non_default; Line 3283  int greedy_default, greedy_non_default;
3283  int firstbyte, reqbyte;  int firstbyte, reqbyte;
3284  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
3285  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
3286  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3287  int after_manual_callout = 0;  int after_manual_callout = 0;
3288  int length_prevgroup = 0;  int length_prevgroup = 0;
3289  register int c;  register int c;
3290  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3291  uschar *last_code = code;  pcre_uchar *last_code = code;
3292  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3293  uschar *tempcode;  pcre_uchar *tempcode;
3294  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3295  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
3296  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3297  const uschar *tempptr;  const pcre_uchar *tempptr;
3298  uschar *previous = NULL;  const pcre_uchar *nestptr = NULL;
3299  uschar *previous_callout = NULL;  pcre_uchar *previous = NULL;
3300  uschar *save_hwm = NULL;  pcre_uchar *previous_callout = NULL;
3301  uschar classbits[32];  pcre_uchar *save_hwm = NULL;
3302    pcre_uint8 classbits[32];
3303    
3304    /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3305    must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3306    dynamically as we process the pattern. */
3307    
3308  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
 BOOL class_utf8;  
3309  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
3310  uschar *class_utf8data;  pcre_uint8 utf8_char[6];
 uschar utf8_char[6];  
3311  #else  #else
3312  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
 uschar *utf8_char = NULL;  
3313  #endif  #endif
3314    
3315  #ifdef DEBUG  /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3316    
3317    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3318    BOOL xclass;
3319    pcre_uchar *class_uchardata;
3320    pcre_uchar *class_uchardata_base;
3321    #endif
3322    
3323    #ifdef PCRE_DEBUG
3324  if (lengthptr != NULL) DPRINTF((">> start branch\n"));  if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3325  #endif  #endif
3326    
# Line 2214  req_caseopt = ((options & PCRE_CASELESS) Line 3353  req_caseopt = ((options & PCRE_CASELESS)
3353  for (;; ptr++)  for (;; ptr++)
3354    {    {
3355    BOOL negate_class;    BOOL negate_class;
3356      BOOL should_flip_negation;
3357    BOOL possessive_quantifier;    BOOL possessive_quantifier;
3358    BOOL is_quantifier;    BOOL is_quantifier;
3359    BOOL is_recurse;    BOOL is_recurse;
# Line 2228  for (;; ptr++) Line 3368  for (;; ptr++)
3368    int subfirstbyte;    int subfirstbyte;
3369    int terminator;    int terminator;
3370    int mclength;    int mclength;
3371    uschar mcbuffer[8];    int tempbracount;
3372      pcre_uchar mcbuffer[8];
3373    
3374    /* Get next byte in the pattern */    /* Get next byte in the pattern */
3375    
3376    c = *ptr;    c = *ptr;
3377    
3378      /* If we are at the end of a nested substitution, revert to the outer level
3379      string. Nesting only happens one level deep. */
3380    
3381      if (c == 0 && nestptr != NULL)
3382        {
3383        ptr = nestptr;
3384        nestptr = NULL;
3385        c = *ptr;
3386        }
3387    
3388    /* If we are in the pre-compile phase, accumulate the length used for the    /* If we are in the pre-compile phase, accumulate the length used for the
3389    previous cycle of this loop. */    previous cycle of this loop. */
3390    
3391    if (lengthptr != NULL)    if (lengthptr != NULL)
3392      {      {
3393  #ifdef DEBUG  #ifdef PCRE_DEBUG
3394      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3395  #endif  #endif
3396      if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */
3397        {        {
3398        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3399        goto FAILED;        goto FAILED;
# Line 2255  for (;; ptr++) Line 3406  for (;; ptr++)
3406      */      */
3407    
3408      if (code < last_code) code = last_code;      if (code < last_code) code = last_code;
3409      *lengthptr += code - last_code;  
3410      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      /* Paranoid check for integer overflow */
3411    
3412        if (OFLOW_MAX - *lengthptr < code - last_code)
3413          {
3414          *errorcodeptr = ERR20;
3415          goto FAILED;
3416          }
3417    
3418        *lengthptr += (int)(code - last_code);
3419        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3420          c));
3421    
3422      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3423      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 2266  for (;; ptr++) Line 3427  for (;; ptr++)
3427        {        {
3428        if (previous > orig_code)        if (previous > orig_code)
3429          {          {
3430          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3431          code -= previous - orig_code;          code -= previous - orig_code;
3432          previous = orig_code;          previous = orig_code;
3433          }          }
# Line 2282  for (;; ptr++) Line 3443  for (;; ptr++)
3443    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3444    reference list. */    reference list. */
3445    
3446    else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3447      {      {
3448      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3449      goto FAILED;      goto FAILED;
# Line 2292  for (;; ptr++) Line 3453  for (;; ptr++)
3453    
3454    if (inescq && c != 0)    if (inescq && c != 0)
3455      {      {
3456      if (c == '\\' && ptr[1] == 'E')      if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3457        {        {
3458        inescq = FALSE;        inescq = FALSE;
3459        ptr++;        ptr++;
# Line 2318  for (;; ptr++) Line 3479  for (;; ptr++)
3479    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
3480    a quantifier. */    a quantifier. */
3481    
3482    is_quantifier = c == '*' || c == '+' || c == '?' ||    is_quantifier =
3483      (c == '{' && is_counted_repeat(ptr+1));      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3484        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3485    
3486    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
3487         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
# Line 2329  for (;; ptr++) Line 3491  for (;; ptr++)
3491      previous_callout = NULL;      previous_callout = NULL;
3492      }      }
3493    
3494    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3495    
3496    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3497      {      {
3498      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
3499      if (c == '#')      if (c == CHAR_NUMBER_SIGN)
3500        {        {
3501        while (*(++ptr) != 0)        ptr++;
3502          while (*ptr != 0)
3503          {          {
3504          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3505            ptr++;
3506    #ifdef SUPPORT_UTF8
3507            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3508    #endif
3509          }          }
3510        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3511    
# Line 2359  for (;; ptr++) Line 3526  for (;; ptr++)
3526      {      {
3527      /* ===================================================================*/      /* ===================================================================*/
3528      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3529      case '|':                      /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3530      case ')':      case CHAR_RIGHT_PARENTHESIS:
3531      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
3532      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
3533      *codeptr = code;      *codeptr = code;
3534      *ptrptr = ptr;      *ptrptr = ptr;
3535      if (lengthptr != NULL)      if (lengthptr != NULL)
3536        {        {
3537        *lengthptr += code - last_code;   /* To include callout length */        if (OFLOW_MAX - *lengthptr < code - last_code)
3538            {
3539            *errorcodeptr = ERR20;
3540            goto FAILED;
3541            }
3542          *lengthptr += (int)(code - last_code);   /* To include callout length */
3543        DPRINTF((">> end branch\n"));        DPRINTF((">> end branch\n"));
3544        }        }
3545      return TRUE;      return TRUE;
# Line 2377  for (;; ptr++) Line 3549  for (;; ptr++)
3549      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
3550      the setting of any following char as a first character. */      the setting of any following char as a first character. */
3551    
3552      case '^':      case CHAR_CIRCUMFLEX_ACCENT:
3553        previous = NULL;
3554      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3555        {        {
3556        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3557          *code++ = OP_CIRCM;
3558        }        }
3559      previous = NULL;      else *code++ = OP_CIRC;
     *code++ = OP_CIRC;  
3560      break;      break;
3561    
3562      case '$':      case CHAR_DOLLAR_SIGN:
3563      previous = NULL;      previous = NULL;
3564      *code++ = OP_DOLL;      *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3565      break;      break;
3566    
3567      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3568      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqbyte doesn't change either. */
3569    
3570      case '.':      case CHAR_DOT:
3571      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3572      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
3573      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3574      previous = code;      previous = code;
3575      *code++ = OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3576      break;      break;
3577    
3578    
# Line 2414  for (;; ptr++) Line 3587  for (;; ptr++)
3587      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
3588      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
3589      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
     */  
3590    
3591      case '[':      In JavaScript compatibility mode, an isolated ']' causes an error. In
3592        default (Perl) mode, it is treated as a data character. */
3593    
3594        case CHAR_RIGHT_SQUARE_BRACKET:
3595        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3596          {
3597          *errorcodeptr = ERR64;
3598          goto FAILED;
3599          }
3600        goto NORMAL_CHAR;
3601    
3602        case CHAR_LEFT_SQUARE_BRACKET:
3603      previous = code;      previous = code;
3604    
3605      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3606      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
3607    
3608      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3609          check_posix_syntax(ptr, &tempptr, cd))           ptr[1] == CHAR_EQUALS_SIGN) &&
3610            check_posix_syntax(ptr, &tempptr))
3611        {        {
3612        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3613        goto FAILED;        goto FAILED;
3614        }        }
3615    
3616      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
3617        if the first few characters (either before or after ^) are \Q\E or \E we
3618        skip them too. This makes for compatibility with Perl. */
3619    
3620      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
3621        for (;;)
3622        {        {
       negate_class = TRUE;  
3623        c = *(++ptr);        c = *(++ptr);
3624          if (c == CHAR_BACKSLASH)
3625            {
3626            if (ptr[1] == CHAR_E)
3627              ptr++;
3628            else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3629              ptr += 3;
3630            else
3631              break;
3632            }
3633          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3634            negate_class = TRUE;
3635          else break;
3636        }        }
3637      else  
3638        /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3639        an initial ']' is taken as a data character -- the code below handles
3640        that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3641        [^] must match any character, so generate OP_ALLANY. */
3642    
3643        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3644            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3645        {        {
3646        negate_class = FALSE;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3647          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3648          zerofirstbyte = firstbyte;
3649          break;
3650        }        }
3651    
3652        /* If a class contains a negative special such as \S, we need to flip the
3653        negation flag at the end, so that support for characters > 255 works
3654        correctly (they are all included in the class). */
3655    
3656        should_flip_negation = FALSE;
3657    
3658      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
3659      of just a single character (as long as it's < 256). However, for higher      of just a single character (as long as it's < 256). However, for higher
3660      valued UTF-8 characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
# Line 2453  for (;; ptr++) Line 3667  for (;; ptr++)
3667      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3668      */      */
3669    
3670      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3671    
3672  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3673      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;                           /* No chars >= 256 */
3674      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3675        class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3676  #endif  #endif
3677    
3678      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 2466  for (;; ptr++) Line 3681  for (;; ptr++)
3681    
3682      if (c != 0) do      if (c != 0) do
3683        {        {
3684        const uschar *oldptr;        const pcre_uchar *oldptr;
3685    
3686  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3687        if (utf8 && c > 127)        if (utf8 && c > 127)
# Line 2475  for (;; ptr++) Line 3690  for (;; ptr++)
3690          }          }
3691  #endif  #endif
3692    
3693    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3694          /* In the pre-compile phase, accumulate the length of any extra
3695          data and reset the pointer. This is so that very large classes that
3696          contain a zillion > 255 characters no longer overwrite the work space
3697          (which is on the stack). */
3698    
3699          if (lengthptr != NULL)
3700            {
3701            *lengthptr += class_uchardata - class_uchardata_base;
3702            class_uchardata = class_uchardata_base;
3703            }
3704    #endif
3705    
3706        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
3707    
3708        if (inescq)        if (inescq)
3709          {          {
3710          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */          if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3711            {            {
3712            inescq = FALSE;                   /* Reset literal state */            inescq = FALSE;                   /* Reset literal state */
3713            ptr++;                            /* Skip the 'E' */            ptr++;                            /* Skip the 'E' */
# Line 2494  for (;; ptr++) Line 3722  for (;; ptr++)
3722        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3723        5.6 and 5.8 do. */        5.6 and 5.8 do. */
3724    
3725        if (c == '[' &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
3726            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3727            check_posix_syntax(ptr, &tempptr, cd))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3728          {          {
3729          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3730          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3731          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3732          uschar pbits[32];          pcre_uint8 pbits[32];
3733    
3734          if (ptr[1] != ':')          if (ptr[1] != CHAR_COLON)
3735            {            {
3736            *errorcodeptr = ERR31;            *errorcodeptr = ERR31;
3737            goto FAILED;            goto FAILED;
3738            }            }
3739    
3740          ptr += 2;          ptr += 2;
3741          if (*ptr == '^')          if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3742            {            {
3743            local_negate = TRUE;            local_negate = TRUE;
3744              should_flip_negation = TRUE;  /* Note negative special */
3745            ptr++;            ptr++;
3746            }            }
3747    
3748          posix_class = check_posix_name(ptr, tempptr - ptr);          posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3749          if (posix_class < 0)          if (posix_class < 0)
3750            {            {
3751            *errorcodeptr = ERR30;            *errorcodeptr = ERR30;
# Line 2530  for (;; ptr++) Line 3759  for (;; ptr++)
3759          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3760            posix_class = 0;            posix_class = 0;
3761    
3762          /* We build the bit map for the POSIX class in a chunk of local store          /* When PCRE_UCP is set, some of the POSIX classes are converted to
3763          because we may be adding and subtracting from it, and we don't want to          different escape sequences that use Unicode properties. */
3764          subtract bits that may be in the main map already. At the end we or the  
3765          result into the bit map that is being built. */  #ifdef SUPPORT_UCP
3766            if ((options & PCRE_UCP) != 0)
3767              {
3768              int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3769              if (posix_substitutes[pc] != NULL)
3770                {
3771                nestptr = tempptr + 1;
3772                ptr = posix_substitutes[pc] - 1;
3773                continue;
3774                }
3775              }
3776    #endif
3777            /* In the non-UCP case, we build the bit map for the POSIX class in a
3778            chunk of local store because we may be adding and subtracting from it,
3779            and we don't want to subtract bits that may be in the main map already.
3780            At the end we or the result into the bit map that is being built. */
3781    
3782          posix_class *= 3;          posix_class *= 3;
3783    
3784          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3785    
3786          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3787            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3788    
3789          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3790    
# Line 2577  for (;; ptr++) Line 3821  for (;; ptr++)
3821    
3822        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3823        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3824        case. Inside a class (and only there) it is treated as backspace.        case. Inside a class (and only there) it is treated as backspace. We
3825        Elsewhere it marks a word boundary. Other escapes have preset maps ready        assume that other escapes have more than one character in them, so set
3826        to or into the one we are building. We assume they have more than one        class_charcount bigger than one. Unrecognized escapes fall through and
3827        character in them, so set class_charcount bigger than one. */        are either treated as literal characters (by default), or are faulted if
3828          PCRE_EXTRA is set. */
3829    
3830        if (c == '\\')        if (c == CHAR_BACKSLASH)
3831          {          {
3832          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3833          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3834    
3835          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */  
         else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */  
3836          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3837            {            {
3838            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3839              {              {
3840              ptr += 2; /* avoid empty string */              ptr += 2; /* avoid empty string */
3841              }              }
3842            else inescq = TRUE;            else inescq = TRUE;
3843            continue;            continue;
3844            }            }
3845            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
3846    
3847          if (c < 0)          if (c < 0)
3848            {            {
3849            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3850            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
3851    
3852            /* Save time by not doing this in the pre-compile phase. */            switch (-c)
   
           if (lengthptr == NULL) switch (-c)  
3853              {              {
3854    #ifdef SUPPORT_UCP
3855                case ESC_du:     /* These are the values given for \d etc */
3856                case ESC_DU:     /* when PCRE_UCP is set. We replace the */
3857                case ESC_wu:     /* escape sequence with an appropriate \p */
3858                case ESC_WU:     /* or \P to test Unicode properties instead */
3859                case ESC_su:     /* of the default ASCII testing. */
3860                case ESC_SU:
3861                nestptr = ptr;
3862                ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3863                class_charcount -= 2;                /* Undo! */
3864                continue;
3865    #endif
3866              case ESC_d:              case ESC_d:
3867              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3868              continue;              continue;
3869    
3870              case ESC_D:              case ESC_D:
3871                should_flip_negation = TRUE;
3872              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3873              continue;              continue;
3874    
# Line 2622  for (;; ptr++) Line 3877  for (;; ptr++)
3877              continue;              continue;
3878    
3879              case ESC_W:              case ESC_W:
3880                should_flip_negation = TRUE;
3881              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3882              continue;              continue;
3883    
3884                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3885                if it was previously set by something earlier in the character
3886                class. */
3887    
3888              case ESC_s:              case ESC_s:
3889              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
3890              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
3891                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3892              continue;              continue;
3893    
3894              case ESC_S:              case ESC_S:
3895                should_flip_negation = TRUE;
3896              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3897              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3898              continue;              continue;
3899    
3900              case ESC_E: /* Perl ignores an orphan \E */              case ESC_h:
             continue;  
   
             default:    /* Not recognized; fall through */  
             break;      /* Need "default" setting to stop compiler warning. */  
             }  
   
           /* In the pre-compile phase, just do the recognition. */  
   
           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||  
                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;  
   
           /* We need to deal with \H, \h, \V, and \v in both phases because  
           they use extra memory. */  
   
           if (-c == ESC_h)  
             {  
3901              SETBIT(classbits, 0x09); /* HT */              SETBIT(classbits, 0x09); /* HT */
3902              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
3903              SETBIT(classbits, 0xa0); /* NBSP */              SETBIT(classbits, 0xa0); /* NBSP */
3904  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3905              if (utf8)              if (utf8)
3906                {                {
3907                class_utf8 = TRUE;                xclass = TRUE;
3908                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
3909                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x1680, class_uchardata);
3910                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
3911                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x180e, class_uchardata);
3912                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
3913                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x2000, class_uchardata);
3914                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x200A, class_uchardata);
3915                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
3916                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x202f, class_uchardata);
3917                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
3918                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x205f, class_uchardata);
3919                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
3920               &n