/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 360 by ph10, Wed Jul 9 20:00:28 2008 UTC revision 978 by ph10, Sun Jun 17 16:55:07 2012 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When DEBUG is defined, we need the pcre_printint() function, which is also  /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57  used by pcretest. DEBUG is not defined when building a production library. */  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58    library. We do not need to select pcre16_printint.c specially, because the
59  #ifdef DEBUG  COMPILE_PCREx macro will already be appropriately set. */
60  #include "pcre_printint.src"  
61    #ifdef PCRE_DEBUG
62    /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
# Line 87  so this number is very generous. Line 92  so this number is very generous.
92  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
93  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
94  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
96    filled up by repetitions of forward references, for example patterns like
97    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98    that the workspace is expanded using malloc() in this situation. The value
99    below is therefore a minimum, and we put a maximum on it for safety. The
100    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101    kicks in at the same number of forward references in all cases. */
102    
103    #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
104    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
105    
106    /* The overrun tests check for a slightly smaller size so that they detect the
107    overrun before it actually does run off the end of the data block. */
108    
109    #define WORK_SIZE_SAFETY_MARGIN (100)
110    
111  #define COMPILE_WORK_SIZE (4096)  /* Private flags added to firstchar and reqchar. */
112    
113    #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
114    #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
115    
116    /* Repeated character flags. */
117    
118    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
119    
120  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
121  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
122  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
123  is invalid. */  is invalid. */
124    
125  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC
126    
127    /* This is the "normal" table for ASCII systems or for EBCDIC systems running
128    in UTF-8 mode. */
129    
130  static const short int escapes[] = {  static const short int escapes[] = {
131       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,                       0,
132       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,                       0,
133     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */       0,                       0,
134  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */       0,                       0,
135  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */       0,                       0,
136  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */       CHAR_COLON,              CHAR_SEMICOLON,
137     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
138  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
139  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */       CHAR_COMMERCIAL_AT,      -ESC_A,
140       0,      0, -ESC_z                                            /* x - z */       -ESC_B,                  -ESC_C,
141         -ESC_D,                  -ESC_E,
142         0,                       -ESC_G,
143         -ESC_H,                  0,
144         0,                       -ESC_K,
145         0,                       0,
146         -ESC_N,                  0,
147         -ESC_P,                  -ESC_Q,
148         -ESC_R,                  -ESC_S,
149         0,                       0,
150         -ESC_V,                  -ESC_W,
151         -ESC_X,                  0,
152         -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
153         CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
154         CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
155         CHAR_GRAVE_ACCENT,       7,
156         -ESC_b,                  0,
157         -ESC_d,                  ESC_e,
158         ESC_f,                   0,
159         -ESC_h,                  0,
160         0,                       -ESC_k,
161         0,                       0,
162         ESC_n,                   0,
163         -ESC_p,                  0,
164         ESC_r,                   -ESC_s,
165         ESC_tee,                 0,
166         -ESC_v,                  -ESC_w,
167         0,                       0,
168         -ESC_z
169  };  };
170    
171  #else           /* This is the "abnormal" table for EBCDIC systems */  #else
172    
173    /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
174    
175  static const short int escapes[] = {  static const short int escapes[] = {
176  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
177  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 130  static const short int escapes[] = { Line 190  static const short int escapes[] = {
190  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
191  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
192  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
193  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
194  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
195  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
196  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
# Line 142  static const short int escapes[] = { Line 202  static const short int escapes[] = {
202    
203  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
204  searched linearly. Put all the names into a single string, in order to reduce  searched linearly. Put all the names into a single string, in order to reduce
205  the number of relocations when a shared library is dynamically linked. */  the number of relocations when a shared library is dynamically linked. The
206    string is built from string macros so that it works in UTF-8 mode on EBCDIC
207    platforms. */
208    
209  typedef struct verbitem {  typedef struct verbitem {
210    int   len;    int   len;                 /* Length of verb name */
211    int   op;    int   op;                  /* Op when no arg, or -1 if arg mandatory */
212      int   op_arg;              /* Op when arg present, or -1 if not allowed */
213  } verbitem;  } verbitem;
214    
215  static const char verbnames[] =  static const char verbnames[] =
216    "ACCEPT\0"    "\0"                       /* Empty name is a shorthand for MARK */
217    "COMMIT\0"    STRING_MARK0
218    "F\0"    STRING_ACCEPT0
219    "FAIL\0"    STRING_COMMIT0
220    "PRUNE\0"    STRING_F0
221    "SKIP\0"    STRING_FAIL0
222    "THEN";    STRING_PRUNE0
223      STRING_SKIP0
224      STRING_THEN;
225    
226  static const verbitem verbs[] = {  static const verbitem verbs[] = {
227    { 6, OP_ACCEPT },    { 0, -1,        OP_MARK },
228    { 6, OP_COMMIT },    { 4, -1,        OP_MARK },
229    { 1, OP_FAIL },    { 6, OP_ACCEPT, -1 },
230    { 4, OP_FAIL },    { 6, OP_COMMIT, -1 },
231    { 5, OP_PRUNE },    { 1, OP_FAIL,   -1 },
232    { 4, OP_SKIP  },    { 4, OP_FAIL,   -1 },
233    { 4, OP_THEN  }    { 5, OP_PRUNE,  OP_PRUNE_ARG },
234      { 4, OP_SKIP,   OP_SKIP_ARG  },
235      { 4, OP_THEN,   OP_THEN_ARG  }
236  };  };
237    
238  static const int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
# Line 178  length entry. The first three must be al Line 245  length entry. The first three must be al
245  for handling case independence. */  for handling case independence. */
246    
247  static const char posix_names[] =  static const char posix_names[] =
248    "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
249    "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"    STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
250    "word\0"   "xdigit";    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
251      STRING_word0  STRING_xdigit;
252    
253  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
254    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
255    
256  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 212  static const int posix_class_maps[] = { Line 280  static const int posix_class_maps[] = {
280    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
281  };  };
282    
283    /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
284    substitutes must be in the order of the names, defined above, and there are
285    both positive and negative cases. NULL means no substitute. */
286    
287    #ifdef SUPPORT_UCP
288    static const pcre_uchar string_PNd[]  = {
289      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
290      CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
291    static const pcre_uchar string_pNd[]  = {
292      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
293      CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294    static const pcre_uchar string_PXsp[] = {
295      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
296      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297    static const pcre_uchar string_pXsp[] = {
298      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300    static const pcre_uchar string_PXwd[] = {
301      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
302      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303    static const pcre_uchar string_pXwd[] = {
304      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    
307    static const pcre_uchar *substitutes[] = {
308      string_PNd,           /* \D */
309      string_pNd,           /* \d */
310      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
311      string_pXsp,          /* \s */
312      string_PXwd,          /* \W */
313      string_pXwd           /* \w */
314    };
315    
316    static const pcre_uchar string_pL[] =   {
317      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
318      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319    static const pcre_uchar string_pLl[] =  {
320      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322    static const pcre_uchar string_pLu[] =  {
323      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325    static const pcre_uchar string_pXan[] = {
326      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328    static const pcre_uchar string_h[] =    {
329      CHAR_BACKSLASH, CHAR_h, '\0' };
330    static const pcre_uchar string_pXps[] = {
331      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333    static const pcre_uchar string_PL[] =   {
334      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
335      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336    static const pcre_uchar string_PLl[] =  {
337      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339    static const pcre_uchar string_PLu[] =  {
340      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342    static const pcre_uchar string_PXan[] = {
343      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345    static const pcre_uchar string_H[] =    {
346      CHAR_BACKSLASH, CHAR_H, '\0' };
347    static const pcre_uchar string_PXps[] = {
348      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
349      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350    
351    static const pcre_uchar *posix_substitutes[] = {
352      string_pL,            /* alpha */
353      string_pLl,           /* lower */
354      string_pLu,           /* upper */
355      string_pXan,          /* alnum */
356      NULL,                 /* ascii */
357      string_h,             /* blank */
358      NULL,                 /* cntrl */
359      string_pNd,           /* digit */
360      NULL,                 /* graph */
361      NULL,                 /* print */
362      NULL,                 /* punct */
363      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
364      string_pXwd,          /* word */
365      NULL,                 /* xdigit */
366      /* Negated cases */
367      string_PL,            /* ^alpha */
368      string_PLl,           /* ^lower */
369      string_PLu,           /* ^upper */
370      string_PXan,          /* ^alnum */
371      NULL,                 /* ^ascii */
372      string_H,             /* ^blank */
373      NULL,                 /* ^cntrl */
374      string_PNd,           /* ^digit */
375      NULL,                 /* ^graph */
376      NULL,                 /* ^print */
377      NULL,                 /* ^punct */
378      string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
379      string_PXwd,          /* ^word */
380      NULL                  /* ^xdigit */
381    };
382    #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
383    #endif
384    
385  #define STRING(a)  # a  #define STRING(a)  # a
386  #define XSTRING(s) STRING(s)  #define XSTRING(s) STRING(s)
# Line 224  the number of relocations needed when a Line 393  the number of relocations needed when a
393  it is now one long string. We cannot use a table of offsets, because the  it is now one long string. We cannot use a table of offsets, because the
394  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
395  simply count through to the one we want - this isn't a performance issue  simply count through to the one we want - this isn't a performance issue
396  because these strings are used only when there is a compilation error. */  because these strings are used only when there is a compilation error.
397    
398    Each substring ends with \0 to insert a null character. This includes the final
399    substring, so that the whole string ends with \0\0, which can be detected when
400    counting through. */
401    
402  static const char error_texts[] =  static const char error_texts[] =
403    "no error\0"    "no error\0"
# Line 265  static const char error_texts[] = Line 438  static const char error_texts[] =
438    /* 30 */    /* 30 */
439    "unknown POSIX class name\0"    "unknown POSIX class name\0"
440    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
441    "this version of PCRE is not compiled with PCRE_UTF8 support\0"    "this version of PCRE is compiled without UTF support\0"
442    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
443    "character value in \\x{...} sequence is too large\0"    "character value in \\x{...} sequence is too large\0"
444    /* 35 */    /* 35 */
445    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
446    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
447    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"    "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
448    "number after (?C is > 255\0"    "number after (?C is > 255\0"
449    "closing ) for (?C expected\0"    "closing ) for (?C expected\0"
450    /* 40 */    /* 40 */
# Line 288  static const char error_texts[] = Line 461  static const char error_texts[] =
461    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
462    /* 50 */    /* 50 */
463    "repeated subpattern is too long\0"    /** DEAD **/    "repeated subpattern is too long\0"    /** DEAD **/
464    "octal value is greater than \\377 (not in UTF-8 mode)\0"    "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
465    "internal error: overran compiling workspace\0"    "internal error: overran compiling workspace\0"
466    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
467    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
468    /* 55 */    /* 55 */
469    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
470    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
471    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
472    "a numbered reference must not be zero\0"    "a numbered reference must not be zero\0"
473    "(*VERB) with an argument is not supported\0"    "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
474    /* 60 */    /* 60 */
475    "(*VERB) not recognized\0"    "(*VERB) not recognized\0"
476    "number is too big\0"    "number is too big\0"
477    "subpattern name expected\0"    "subpattern name expected\0"
478    "digit expected after (?+\0"    "digit expected after (?+\0"
479    "] is an invalid data character in JavaScript compatibility mode";    "] is an invalid data character in JavaScript compatibility mode\0"
480      /* 65 */
481      "different names for subpatterns of the same number are not allowed\0"
482      "(*MARK) must have an argument\0"
483      "this version of PCRE is not compiled with Unicode property support\0"
484      "\\c must be followed by an ASCII character\0"
485      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486      /* 70 */
487      "internal error: unknown opcode in find_fixedlength()\0"
488      "\\N is not supported in a class\0"
489      "too many forward references\0"
490      "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
491      "invalid UTF-16 string\0"
492      /* 75 */
493      "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
494      "character value in \\u.... sequence is too large\0"
495      ;
496    
497  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
498  patterns. Note that the tables in chartables are dependent on the locale, and  patterns. Note that the tables in chartables are dependent on the locale, and
# Line 322  For convenience, we use the same bit def Line 510  For convenience, we use the same bit def
510    
511  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
512    
513  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */  /* Using a simple comparison for decimal numbers rather than a memory read
514  static const unsigned char digitab[] =  is much faster, and the resulting code is simpler (the compiler turns it
515    into a subtraction and unsigned comparison). */
516    
517    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
518    
519    #ifndef EBCDIC
520    
521    /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
522    UTF-8 mode. */
523    
524    static const pcre_uint8 digitab[] =
525    {    {
526    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
527    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 358  static const unsigned char digitab[] = Line 556  static const unsigned char digitab[] =
556    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
557    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
558    
559  #else           /* This is the "abnormal" case, for EBCDIC systems */  #else
560  static const unsigned char digitab[] =  
561    /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
562    
563    static const pcre_uint8 digitab[] =
564    {    {
565    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
566    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 394  static const unsigned char digitab[] = Line 595  static const unsigned char digitab[] =
595    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
596    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
597    
598  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
599    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
600    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
601    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 433  static const unsigned char ebcdic_charta Line 634  static const unsigned char ebcdic_charta
634  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
635    
636  static BOOL  static BOOL
637    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
638      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
639    
640    
# Line 455  static const char * Line 656  static const char *
656  find_error_text(int n)  find_error_text(int n)
657  {  {
658  const char *s = error_texts;  const char *s = error_texts;
659  for (; n > 0; n--) while (*s++ != 0);  for (; n > 0; n--)
660      {
661      while (*s++ != 0) {};
662      if (*s == 0) return "Error text not found (please report)";
663      }
664  return s;  return s;
665  }  }
666    
667    
668  /*************************************************  /*************************************************
669    *           Expand the workspace                 *
670    *************************************************/
671    
672    /* This function is called during the second compiling phase, if the number of
673    forward references fills the existing workspace, which is originally a block on
674    the stack. A larger block is obtained from malloc() unless the ultimate limit
675    has been reached or the increase will be rather small.
676    
677    Argument: pointer to the compile data block
678    Returns:  0 if all went well, else an error number
679    */
680    
681    static int
682    expand_workspace(compile_data *cd)
683    {
684    pcre_uchar *newspace;
685    int newsize = cd->workspace_size * 2;
686    
687    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
688    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
689        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
690     return ERR72;
691    
692    newspace = (PUBL(malloc))(IN_UCHARS(newsize));
693    if (newspace == NULL) return ERR21;
694    memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
695    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
696    if (cd->workspace_size > COMPILE_WORK_SIZE)
697      (PUBL(free))((void *)cd->start_workspace);
698    cd->start_workspace = newspace;
699    cd->workspace_size = newsize;
700    return 0;
701    }
702    
703    
704    
705    /*************************************************
706    *            Check for counted repeat            *
707    *************************************************/
708    
709    /* This function is called when a '{' is encountered in a place where it might
710    start a quantifier. It looks ahead to see if it really is a quantifier or not.
711    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
712    where the ddds are digits.
713    
714    Arguments:
715      p         pointer to the first char after '{'
716    
717    Returns:    TRUE or FALSE
718    */
719    
720    static BOOL
721    is_counted_repeat(const pcre_uchar *p)
722    {
723    if (!IS_DIGIT(*p)) return FALSE;
724    p++;
725    while (IS_DIGIT(*p)) p++;
726    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
727    
728    if (*p++ != CHAR_COMMA) return FALSE;
729    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
730    
731    if (!IS_DIGIT(*p)) return FALSE;
732    p++;
733    while (IS_DIGIT(*p)) p++;
734    
735    return (*p == CHAR_RIGHT_CURLY_BRACKET);
736    }
737    
738    
739    
740    /*************************************************
741  *            Handle escapes                      *  *            Handle escapes                      *
742  *************************************************/  *************************************************/
743    
# Line 485  Returns:         zero or positive => a d Line 762  Returns:         zero or positive => a d
762  */  */
763    
764  static int  static int
765  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
766    int options, BOOL isclass)    int options, BOOL isclass)
767  {  {
768  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
769  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
770  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
771    pcre_int32 c;
772    int i;
773    
774  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
775  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
# Line 503  if (c == 0) *errorcodeptr = ERR1; Line 782  if (c == 0) *errorcodeptr = ERR1;
782  in a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
783  Otherwise further processing may be required. */  Otherwise further processing may be required. */
784    
785  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
786  else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */  /* Not alphanumeric */
787  else if ((i = escapes[c - '0']) != 0) c = i;  else if (c < CHAR_0 || c > CHAR_z) {}
788    else if ((i = escapes[c - CHAR_0]) != 0) c = i;
789    
790  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
791  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
792    else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
793  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
794  #endif  #endif
795    
# Line 516  else if ((i = escapes[c - 0x48]) != 0) Line 797  else if ((i = escapes[c - 0x48]) != 0)
797    
798  else  else
799    {    {
800    const uschar *oldptr;    const pcre_uchar *oldptr;
801    BOOL braced, negated;    BOOL braced, negated;
802    
803    switch (c)    switch (c)
# Line 524  else Line 805  else
805      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
806      error. */      error. */
807    
808      case 'l':      case CHAR_l:
809      case 'L':      case CHAR_L:
     case 'N':  
     case 'u':  
     case 'U':  
810      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
811      break;      break;
812    
813      /* \g must be followed by one of a number of specific things:      case CHAR_u:
814        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
815          {
816          /* In JavaScript, \u must be followed by four hexadecimal digits.
817          Otherwise it is a lowercase u letter. */
818          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
819            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
820            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
821            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
822            {
823            c = 0;
824            for (i = 0; i < 4; ++i)
825              {
826              register int cc = *(++ptr);
827    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
828              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
829              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
830    #else           /* EBCDIC coding */
831              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
832              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
833    #endif
834              }
835    
836    #ifdef COMPILE_PCRE8
837            if (c > (utf ? 0x10ffff : 0xff))
838    #else
839    #ifdef COMPILE_PCRE16
840            if (c > (utf ? 0x10ffff : 0xffff))
841    #endif
842    #endif
843              {
844              *errorcodeptr = ERR76;
845              }
846            else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
847            }
848          }
849        else
850          *errorcodeptr = ERR37;
851        break;
852    
853        case CHAR_U:
854        /* In JavaScript, \U is an uppercase U letter. */
855        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
856        break;
857    
858        /* In a character class, \g is just a literal "g". Outside a character
859        class, \g must be followed by one of a number of specific things:
860    
861      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
862      backreference. If negative, it is a relative backreference. This is a Perl      backreference. If negative, it is a relative backreference. This is a Perl
# Line 548  else Line 872  else
872      (possibly recursive) subroutine calls, _not_ backreferences. Just return      (possibly recursive) subroutine calls, _not_ backreferences. Just return
873      the -ESC_g code (cf \k). */      the -ESC_g code (cf \k). */
874    
875      case 'g':      case CHAR_g:
876      if (ptr[1] == '<' || ptr[1] == '\'')      if (isclass) break;
877        if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
878        {        {
879        c = -ESC_g;        c = -ESC_g;
880        break;        break;
# Line 557  else Line 882  else
882    
883      /* Handle the Perl-compatible cases */      /* Handle the Perl-compatible cases */
884    
885      if (ptr[1] == '{')      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
886        {        {
887        const uschar *p;        const pcre_uchar *p;
888        for (p = ptr+2; *p != 0 && *p != '}'; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
889          if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
890        if (*p != 0 && *p != '}')        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
891          {          {
892          c = -ESC_k;          c = -ESC_k;
893          break;          break;
# Line 572  else Line 897  else
897        }        }
898      else braced = FALSE;      else braced = FALSE;
899    
900      if (ptr[1] == '-')      if (ptr[1] == CHAR_MINUS)
901        {        {
902        negated = TRUE;        negated = TRUE;
903        ptr++;        ptr++;
904        }        }
905      else negated = FALSE;      else negated = FALSE;
906    
907        /* The integer range is limited by the machine's int representation. */
908      c = 0;      c = 0;
909      while ((digitab[ptr[1]] & ctype_digit) != 0)      while (IS_DIGIT(ptr[1]))
910        c = c * 10 + *(++ptr) - '0';        {
911          if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
912      if (c < 0)   /* Integer overflow */          {
913            c = -1;
914            break;
915            }
916          c = c * 10 + *(++ptr) - CHAR_0;
917          }
918        if (((unsigned int)c) > INT_MAX) /* Integer overflow */
919        {        {
920          while (IS_DIGIT(ptr[1]))
921            ptr++;
922        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
923        break;        break;
924        }        }
925    
926      if (braced && *(++ptr) != '}')      if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
927        {        {
928        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
929        break;        break;
# Line 626  else Line 960  else
960      value is greater than 377, the least significant 8 bits are taken. Inside a      value is greater than 377, the least significant 8 bits are taken. Inside a
961      character class, \ followed by a digit is always an octal number. */      character class, \ followed by a digit is always an octal number. */
962    
963      case '1': case '2': case '3': case '4': case '5':      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
964      case '6': case '7': case '8': case '9':      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
965    
966      if (!isclass)      if (!isclass)
967        {        {
968        oldptr = ptr;        oldptr = ptr;
969        c -= '0';        /* The integer range is limited by the machine's int representation. */
970        while ((digitab[ptr[1]] & ctype_digit) != 0)        c -= CHAR_0;
971          c = c * 10 + *(++ptr) - '0';        while (IS_DIGIT(ptr[1]))
972        if (c < 0)    /* Integer overflow */          {
973            if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
974              {
975              c = -1;
976              break;
977              }
978            c = c * 10 + *(++ptr) - CHAR_0;
979            }
980          if (((unsigned int)c) > INT_MAX) /* Integer overflow */
981          {          {
982            while (IS_DIGIT(ptr[1]))
983              ptr++;
984          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
985          break;          break;
986          }          }
# Line 652  else Line 996  else
996      generates a binary zero byte and treats the digit as a following literal.      generates a binary zero byte and treats the digit as a following literal.
997      Thus we have to pull back the pointer by one. */      Thus we have to pull back the pointer by one. */
998    
999      if ((c = *ptr) >= '8')      if ((c = *ptr) >= CHAR_8)
1000        {        {
1001        ptr--;        ptr--;
1002        c = 0;        c = 0;
# Line 662  else Line 1006  else
1006      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1007      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
1008      significant 8 bits of octal numbers (I think this is what early Perls used      significant 8 bits of octal numbers (I think this is what early Perls used
1009      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1010      than 3 octal digits. */      but no more than 3 octal digits. */
1011    
1012      case '0':      case CHAR_0:
1013      c -= '0';      c -= CHAR_0;
1014      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1015          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - CHAR_0;
1016      if (!utf8 && c > 255) *errorcodeptr = ERR51;  #ifdef COMPILE_PCRE8
1017        if (!utf && c > 0xff) *errorcodeptr = ERR51;
1018    #endif
1019      break;      break;
1020    
1021      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
1022      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1023      treated as a data character. */      If not, { is treated as a data character. */
1024    
1025      case 'x':      case CHAR_x:
1026      if (ptr[1] == '{')      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1027        {        {
1028        const uschar *pt = ptr + 2;        /* In JavaScript, \x must be followed by two hexadecimal digits.
1029        int count = 0;        Otherwise it is a lowercase x letter. */
1030          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1031            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1032            {
1033            c = 0;
1034            for (i = 0; i < 2; ++i)
1035              {
1036              register int cc = *(++ptr);
1037    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1038              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1039              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1040    #else           /* EBCDIC coding */
1041              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1042              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1043    #endif
1044              }
1045            }
1046          break;
1047          }
1048    
1049        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1050          {
1051          const pcre_uchar *pt = ptr + 2;
1052    
1053        c = 0;        c = 0;
1054        while ((digitab[*pt] & ctype_xdigit) != 0)        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1055          {          {
1056          register int cc = *pt++;          register int cc = *pt++;
1057          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
         count++;  
1058    
1059  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1060          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1061          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1062  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1063          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1064          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1065    #endif
1066    
1067    #ifdef COMPILE_PCRE8
1068            if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1069    #else
1070    #ifdef COMPILE_PCRE16
1071            if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1072    #endif
1073  #endif  #endif
1074          }          }
1075    
1076        if (*pt == '}')        if (c < 0)
1077          {          {
1078          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1079            *errorcodeptr = ERR34;
1080            }
1081    
1082          if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1083            {
1084            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1085          ptr = pt;          ptr = pt;
1086          break;          break;
1087          }          }
# Line 712  else Line 1093  else
1093      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1094    
1095      c = 0;      c = 0;
1096      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1097        {        {
1098        int cc;                               /* Some compilers don't like ++ */        int cc;                                  /* Some compilers don't like */
1099        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                           /* ++ in initializers */
1100  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1101        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1102        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1103  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1104        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1105        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1106  #endif  #endif
1107        }        }
1108      break;      break;
1109    
1110      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1111      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
1112        coding is ASCII-specific, but then the whole concept of \cx is
1113      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1114    
1115      case 'c':      case CHAR_c:
1116      c = *(++ptr);      c = *(++ptr);
1117      if (c == 0)      if (c == 0)
1118        {        {
1119        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
1120        break;        break;
1121        }        }
1122    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1123  #ifndef EBCDIC  /* ASCII coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
1124      if (c >= 'a' && c <= 'z') c -= 32;        {
1125          *errorcodeptr = ERR68;
1126          break;
1127          }
1128        if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1129      c ^= 0x40;      c ^= 0x40;
1130  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
1131      if (c >= 'a' && c <= 'z') c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1132      c ^= 0xC0;      c ^= 0xC0;
1133  #endif  #endif
1134      break;      break;
# Line 764  else Line 1150  else
1150      }      }
1151    }    }
1152    
1153    /* Perl supports \N{name} for character names, as well as plain \N for "not
1154    newline". PCRE does not support \N{name}. However, it does support
1155    quantification such as \N{2,3}. */
1156    
1157    if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1158         !is_counted_repeat(ptr+2))
1159      *errorcodeptr = ERR37;
1160    
1161    /* If PCRE_UCP is set, we change the values for \d etc. */
1162    
1163    if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1164      c -= (ESC_DU - ESC_D);
1165    
1166    /* Set the pointer to the final character before returning. */
1167    
1168  *ptrptr = ptr;  *ptrptr = ptr;
1169  return c;  return c;
1170  }  }
# Line 790  Returns:         type value from ucp_typ Line 1191  Returns:         type value from ucp_typ
1191  */  */
1192    
1193  static int  static int
1194  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1195  {  {
1196  int c, i, bot, top;  int c, i, bot, top;
1197  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1198  char name[32];  pcre_uchar name[32];
1199    
1200  c = *(++ptr);  c = *(++ptr);
1201  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 804  if (c == 0) goto ERROR_RETURN; Line 1205  if (c == 0) goto ERROR_RETURN;
1205  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1206  negation. */  negation. */
1207    
1208  if (c == '{')  if (c == CHAR_LEFT_CURLY_BRACKET)
1209    {    {
1210    if (ptr[1] == '^')    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1211      {      {
1212      *negptr = TRUE;      *negptr = TRUE;
1213      ptr++;      ptr++;
1214      }      }
1215    for (i = 0; i < (int)sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1216      {      {
1217      c = *(++ptr);      c = *(++ptr);
1218      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
1219      if (c == '}') break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1220      name[i] = c;      name[i] = c;
1221      }      }
1222    if (c !='}') goto ERROR_RETURN;    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1223    name[i] = 0;    name[i] = 0;
1224    }    }
1225    
# Line 835  else Line 1236  else
1236  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1237    
1238  bot = 0;  bot = 0;
1239  top = _pcre_utt_size;  top = PRIV(utt_size);
1240    
1241  while (bot < top)  while (bot < top)
1242    {    {
1243    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1244    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1245    if (c == 0)    if (c == 0)
1246      {      {
1247      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1248      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1249      }      }
1250    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
1251    }    }
# Line 864  return -1; Line 1265  return -1;
1265    
1266    
1267  /*************************************************  /*************************************************
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
 if (*p == '}') return TRUE;  
   
 if (*p++ != ',') return FALSE;  
 if (*p == '}') return TRUE;  
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
   
 return (*p == '}');  
 }  
   
   
   
 /*************************************************  
1268  *         Read repeat counts                     *  *         Read repeat counts                     *
1269  *************************************************/  *************************************************/
1270    
# Line 915  Returns:         pointer to '}' on succe Line 1283  Returns:         pointer to '}' on succe
1283                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1284  */  */
1285    
1286  static const uschar *  static const pcre_uchar *
1287  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1288  {  {
1289  int min = 0;  int min = 0;
1290  int max = -1;  int max = -1;
# Line 924  int max = -1; Line 1292  int max = -1;
1292  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1293  an integer overflow. */  an integer overflow. */
1294    
1295  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1296  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1297    {    {
1298    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 934  if (min < 0 || min > 65535) Line 1302  if (min < 0 || min > 65535)
1302  /* Read the maximum value if there is one, and again do a paranoid check on its size.  /* Read the maximum value if there is one, and again do a paranoid check on its size.
1303  Also, max must not be less than min. */  Also, max must not be less than min. */
1304    
1305  if (*p == '}') max = min; else  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1306    {    {
1307    if (*(++p) != '}')    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1308      {      {
1309      max = 0;      max = 0;
1310      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1311      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1312        {        {
1313        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 964  return p; Line 1332  return p;
1332    
1333    
1334  /*************************************************  /*************************************************
1335  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1336  *************************************************/  *************************************************/
1337    
1338  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1339    top-level call starts at the beginning of the pattern. All other calls must
1340    start at a parenthesis. It scans along a pattern's text looking for capturing
1341  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1342  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1343  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. Recursion is used to keep
1344  references to subpatterns. We know that if (?P< is encountered, the name will  track of subpatterns that reset the capturing group numbers - the (?| feature.
1345  be terminated by '>' because that is checked in the first pass.  
1346    This function was originally called only from the second pass, in which we know
1347    that if (?< or (?' or (?P< is encountered, the name will be correctly
1348    terminated because that is checked in the first pass. There is now one call to
1349    this function in the first pass, to check for a recursive back reference by
1350    name (so that we can make the whole group atomic). In this case, we need check
1351    only up to the current position in the pattern, and that is still OK because
1352    any previous occurrences will have been checked. To make this work, the test
1353    for "end of pattern" is a check against cd->end_pattern in the main loop,
1354    instead of looking for a binary zero. This means that the special first-pass
1355    call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1356    processing items within the loop are OK, because afterwards the main loop will
1357    terminate.)
1358    
1359  Arguments:  Arguments:
1360    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1361    cd           compile background data    cd           compile background data
1362    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1363    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1364    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1365      utf          TRUE if we are in UTF-8 / UTF-16 mode
1366      count        pointer to the current capturing subpattern number (updated)
1367    
1368  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1369  */  */
1370    
1371  static int  static int
1372  find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1373    BOOL xmode)    BOOL xmode, BOOL utf, int *count)
1374  {  {
1375  const uschar *thisname;  pcre_uchar *ptr = *ptrptr;
1376  int count = cd->bracount;  int start_count = *count;
1377    int hwm_count = start_count;
1378    BOOL dup_parens = FALSE;
1379    
1380    /* If the first character is a parenthesis, check on the type of group we are
1381    dealing with. The very first call may not start with a parenthesis. */
1382    
1383  for (; *ptr != 0; ptr++)  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1384    {    {
1385    int term;    /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1386    
1387      if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1388    
1389      /* Handle a normal, unnamed capturing parenthesis. */
1390    
1391      else if (ptr[1] != CHAR_QUESTION_MARK)
1392        {
1393        *count += 1;
1394        if (name == NULL && *count == lorn) return *count;
1395        ptr++;
1396        }
1397    
1398      /* All cases now have (? at the start. Remember when we are in a group
1399      where the parenthesis numbers are duplicated. */
1400    
1401      else if (ptr[2] == CHAR_VERTICAL_LINE)
1402        {
1403        ptr += 3;
1404        dup_parens = TRUE;
1405        }
1406    
1407      /* Handle comments; all characters are allowed until a ket is reached. */
1408    
1409      else if (ptr[2] == CHAR_NUMBER_SIGN)
1410        {
1411        for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1412        goto FAIL_EXIT;
1413        }
1414    
1415      /* Handle a condition. If it is an assertion, just carry on so that it
1416      is processed as normal. If not, skip to the closing parenthesis of the
1417      condition (there can't be any nested parens). */
1418    
1419      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1420        {
1421        ptr += 2;
1422        if (ptr[1] != CHAR_QUESTION_MARK)
1423          {
1424          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1425          if (*ptr != 0) ptr++;
1426          }
1427        }
1428    
1429      /* Start with (? but not a condition. */
1430    
1431      else
1432        {
1433        ptr += 2;
1434        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1435    
1436        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1437    
1438        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1439            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1440          {
1441          int term;
1442          const pcre_uchar *thisname;
1443          *count += 1;
1444          if (name == NULL && *count == lorn) return *count;
1445          term = *ptr++;
1446          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1447          thisname = ptr;
1448          while (*ptr != term) ptr++;
1449          if (name != NULL && lorn == ptr - thisname &&
1450              STRNCMP_UC_UC(name, thisname, lorn) == 0)
1451            return *count;
1452          term++;
1453          }
1454        }
1455      }
1456    
1457    /* Past any initial parenthesis handling, scan for parentheses or vertical
1458    bars. Stop if we get to cd->end_pattern. Note that this is important for the
1459    first-pass call when this value is temporarily adjusted to stop at the current
1460    position. So DO NOT change this to a test for binary zero. */
1461    
1462    for (; ptr < cd->end_pattern; ptr++)
1463      {
1464    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1465    
1466    if (*ptr == '\\')    if (*ptr == CHAR_BACKSLASH)
1467      {      {
1468      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1469      if (*ptr == 'Q') for (;;)      if (*ptr == CHAR_Q) for (;;)
1470        {        {
1471        while (*(++ptr) != 0 && *ptr != '\\');        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1472        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1473        if (*(++ptr) == 'E') break;        if (*(++ptr) == CHAR_E) break;
1474        }        }
1475      continue;      continue;
1476      }      }
# Line 1012  for (; *ptr != 0; ptr++) Line 1478  for (; *ptr != 0; ptr++)
1478    /* Skip over character classes; this logic must be similar to the way they    /* Skip over character classes; this logic must be similar to the way they
1479    are handled for real. If the first character is '^', skip it. Also, if the    are handled for real. If the first character is '^', skip it. Also, if the
1480    first few characters (either before or after ^) are \Q\E or \E we skip them    first few characters (either before or after ^) are \Q\E or \E we skip them
1481    too. This makes for compatibility with Perl. */    too. This makes for compatibility with Perl. Note the use of STR macros to
1482      encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1483    
1484    if (*ptr == '[')    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1485      {      {
1486      BOOL negate_class = FALSE;      BOOL negate_class = FALSE;
1487      for (;;)      for (;;)
1488        {        {
1489        int c = *(++ptr);        if (ptr[1] == CHAR_BACKSLASH)
       if (c == '\\')  
1490          {          {
1491          if (ptr[1] == 'E') ptr++;          if (ptr[2] == CHAR_E)
1492            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;            ptr+= 2;
1493              else break;          else if (STRNCMP_UC_C8(ptr + 2,
1494                     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1495              ptr += 4;
1496            else
1497              break;
1498          }          }
1499        else if (!negate_class && c == '^')        else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1500            {
1501          negate_class = TRUE;          negate_class = TRUE;
1502            ptr++;
1503            }
1504        else break;        else break;
1505        }        }
1506    
1507      /* If the next character is ']', it is a data character that must be      /* If the next character is ']', it is a data character that must be
1508      skipped, except in JavaScript compatibility mode. */      skipped, except in JavaScript compatibility mode. */
1509    
1510      if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)      if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1511            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1512        ptr++;        ptr++;
1513    
1514      while (*(++ptr) != ']')      while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1515        {        {
1516        if (*ptr == 0) return -1;        if (*ptr == 0) return -1;
1517        if (*ptr == '\\')        if (*ptr == CHAR_BACKSLASH)
1518          {          {
1519          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1520          if (*ptr == 'Q') for (;;)          if (*ptr == CHAR_Q) for (;;)
1521            {            {
1522            while (*(++ptr) != 0 && *ptr != '\\');            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1523            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1524            if (*(++ptr) == 'E') break;            if (*(++ptr) == CHAR_E) break;
1525            }            }
1526          continue;          continue;
1527          }          }
# Line 1057  for (; *ptr != 0; ptr++) Line 1531  for (; *ptr != 0; ptr++)
1531    
1532    /* Skip comments in /x mode */    /* Skip comments in /x mode */
1533    
1534    if (xmode && *ptr == '#')    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1535      {      {
1536      while (*(++ptr) != 0 && *ptr != '\n');      ptr++;
1537      if (*ptr == 0) return -1;      while (*ptr != 0)
1538          {
1539          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1540          ptr++;
1541    #ifdef SUPPORT_UTF
1542          if (utf) FORWARDCHAR(ptr);
1543    #endif
1544          }
1545        if (*ptr == 0) goto FAIL_EXIT;
1546      continue;      continue;
1547      }      }
1548    
1549    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1550    
1551    if (*ptr != '(') continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
   if (ptr[1] != '?' && ptr[1] != '*')  
1552      {      {
1553      count++;      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1554      if (name == NULL && count == lorn) return count;      if (rc > 0) return rc;
1555      continue;      if (*ptr == 0) goto FAIL_EXIT;
1556      }      }
1557    
1558    ptr += 2;    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1559    if (*ptr == 'P') ptr++;                      /* Allow optional P */      {
1560        if (dup_parens && *count < hwm_count) *count = hwm_count;
1561        goto FAIL_EXIT;
1562        }
1563    
1564    /* We have to disambiguate (?<! and (?<= from (?<name> */    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1565        {
1566        if (*count > hwm_count) hwm_count = *count;
1567        *count = start_count;
1568        }
1569      }
1570    
1571    FAIL_EXIT:
1572    *ptrptr = ptr;
1573    return -1;
1574    }
1575    
   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&  
        *ptr != '\'')  
     continue;  
1576    
   count++;  
1577    
1578    if (name == NULL && count == lorn) return count;  
1579    term = *ptr++;  /*************************************************
1580    if (term == '<') term = '>';  *       Find forward referenced subpattern       *
1581    thisname = ptr;  *************************************************/
1582    while (*ptr != term) ptr++;  
1583    if (name != NULL && lorn == ptr - thisname &&  /* This function scans along a pattern's text looking for capturing
1584        strncmp((const char *)name, (const char *)thisname, lorn) == 0)  subpatterns, and counting them. If it finds a named pattern that matches the
1585      return count;  name it is given, it returns its number. Alternatively, if the name is NULL, it
1586    returns when it reaches a given numbered subpattern. This is used for forward
1587    references to subpatterns. We used to be able to start this scan from the
1588    current compiling point, using the current count value from cd->bracount, and
1589    do it all in a single loop, but the addition of the possibility of duplicate
1590    subpattern numbers means that we have to scan from the very start, in order to
1591    take account of such duplicates, and to use a recursive function to keep track
1592    of the different types of group.
1593    
1594    Arguments:
1595      cd           compile background data
1596      name         name to seek, or NULL if seeking a numbered subpattern
1597      lorn         name length, or subpattern number if name is NULL
1598      xmode        TRUE if we are in /x mode
1599      utf          TRUE if we are in UTF-8 / UTF-16 mode
1600    
1601    Returns:       the number of the found subpattern, or -1 if not found
1602    */
1603    
1604    static int
1605    find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1606      BOOL utf)
1607    {
1608    pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1609    int count = 0;
1610    int rc;
1611    
1612    /* If the pattern does not start with an opening parenthesis, the first call
1613    to find_parens_sub() will scan right to the end (if necessary). However, if it
1614    does start with a parenthesis, find_parens_sub() will return when it hits the
1615    matching closing parens. That is why we have to have a loop. */
1616    
1617    for (;;)
1618      {
1619      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1620      if (rc > 0 || *ptr++ == 0) break;
1621    }    }
1622    
1623  return -1;  return rc;
1624  }  }
1625    
1626    
1627    
1628    
1629  /*************************************************  /*************************************************
1630  *      Find first significant op code            *  *      Find first significant op code            *
1631  *************************************************/  *************************************************/
1632    
1633  /* This is called by several functions that scan a compiled expression looking  /* This is called by several functions that scan a compiled expression looking
1634  for a fixed first character, or an anchoring op code etc. It skips over things  for a fixed first character, or an anchoring op code etc. It skips over things
1635  that do not influence this. For some calls, a change of option is important.  that do not influence this. For some calls, it makes sense to skip negative
1636  For some calls, it makes sense to skip negative forward and all backward  forward and all backward assertions, and also the \b assertion; for others it
1637  assertions, and also the \b assertion; for others it does not.  does not.
1638    
1639  Arguments:  Arguments:
1640    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1641    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1642    
1643  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1644  */  */
1645    
1646  static const uschar*  static const pcre_uchar*
1647  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const pcre_uchar *code, BOOL skipassert)
   BOOL skipassert)  
1648  {  {
1649  for (;;)  for (;;)
1650    {    {
1651    switch ((int)*code)    switch ((int)*code)
1652      {      {
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
       *options = (int)code[1];  
     code += 2;  
     break;  
   
1653      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1654      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1655      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1656      if (!skipassert) return code;      if (!skipassert) return code;
1657      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1658      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1659      break;      break;
1660    
1661      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1149  for (;;) Line 1665  for (;;)
1665    
1666      case OP_CALLOUT:      case OP_CALLOUT:
1667      case OP_CREF:      case OP_CREF:
1668        case OP_NCREF:
1669      case OP_RREF:      case OP_RREF:
1670        case OP_NRREF:
1671      case OP_DEF:      case OP_DEF:
1672      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1673      break;      break;
1674    
1675      default:      default:
# Line 1165  for (;;) Line 1683  for (;;)
1683    
1684    
1685  /*************************************************  /*************************************************
1686  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1687  *************************************************/  *************************************************/
1688    
1689  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1690  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1691  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1692    temporarily terminated with OP_END when this function is called.
1693    
1694    This function is called when a backward assertion is encountered, so that if it
1695    fails, the error message can point to the correct place in the pattern.
1696    However, we cannot do this when the assertion contains subroutine calls,
1697    because they can be forward references. We solve this by remembering this case
1698    and doing the check at the end; a flag specifies which mode we are running in.
1699    
1700  Arguments:  Arguments:
1701    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1702    options  the compiling options    utf      TRUE in UTF-8 / UTF-16 mode
1703      atend    TRUE if called when the pattern is complete
1704  Returns:   the fixed length, or -1 if there is no fixed length,    cd       the "compile data" structure
1705               or -2 if \C was encountered  
1706    Returns:   the fixed length,
1707                 or -1 if there is no fixed length,
1708                 or -2 if \C was encountered (in UTF-8 mode only)
1709                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1710                 or -4 if an unknown opcode was encountered (internal error)
1711  */  */
1712    
1713  static int  static int
1714  find_fixedlength(uschar *code, int options)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1715  {  {
1716  int length = -1;  int length = -1;
1717    
1718  register int branchlength = 0;  register int branchlength = 0;
1719  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1720    
1721  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1722  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1194  branch, check the length against that of Line 1724  branch, check the length against that of
1724  for (;;)  for (;;)
1725    {    {
1726    int d;    int d;
1727      pcre_uchar *ce, *cs;
1728    register int op = *cc;    register int op = *cc;
1729    
1730    switch (op)    switch (op)
1731      {      {
1732        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1733        OP_BRA (normal non-capturing bracket) because the other variants of these
1734        opcodes are all concerned with unlimited repeated groups, which of course
1735        are not of fixed length. */
1736    
1737      case OP_CBRA:      case OP_CBRA:
1738      case OP_BRA:      case OP_BRA:
1739      case OP_ONCE:      case OP_ONCE:
1740        case OP_ONCE_NC:
1741      case OP_COND:      case OP_COND:
1742      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1743      if (d < 0) return d;      if (d < 0) return d;
1744      branchlength += d;      branchlength += d;
1745      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1746      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1747      break;      break;
1748    
1749      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1750      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1751      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1752        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1753        because they all imply an unlimited repeat. */
1754    
1755      case OP_ALT:      case OP_ALT:
1756      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1757      case OP_END:      case OP_END:
1758        case OP_ACCEPT:
1759        case OP_ASSERT_ACCEPT:
1760      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1761        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1762      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1224  for (;;) Line 1764  for (;;)
1764      branchlength = 0;      branchlength = 0;
1765      break;      break;
1766    
1767        /* A true recursion implies not fixed length, but a subroutine call may
1768        be OK. If the subroutine is a forward reference, we can't deal with
1769        it until the end of the pattern, so return -3. */
1770    
1771        case OP_RECURSE:
1772        if (!atend) return -3;
1773        cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1774        do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1775        if (cc > cs && cc < ce) return -1;                    /* Recursion */
1776        d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1777        if (d < 0) return d;
1778        branchlength += d;
1779        cc += 1 + LINK_SIZE;
1780        break;
1781    
1782      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1783    
1784      case OP_ASSERT:      case OP_ASSERT:
# Line 1231  for (;;) Line 1786  for (;;)
1786      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1787      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1788      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1789      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1790        break;
1791    
1792      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1793    
1794      case OP_REVERSE:      case OP_MARK:
1795        case OP_PRUNE_ARG:
1796        case OP_SKIP_ARG:
1797        case OP_THEN_ARG:
1798        cc += cc[1] + PRIV(OP_lengths)[*cc];
1799        break;
1800    
1801        case OP_CALLOUT:
1802        case OP_CIRC:
1803        case OP_CIRCM:
1804        case OP_CLOSE:
1805        case OP_COMMIT:
1806      case OP_CREF:      case OP_CREF:
     case OP_RREF:  
1807      case OP_DEF:      case OP_DEF:
1808      case OP_OPT:      case OP_DOLL:
1809      case OP_CALLOUT:      case OP_DOLLM:
     case OP_SOD:  
     case OP_SOM:  
1810      case OP_EOD:      case OP_EOD:
1811      case OP_EODN:      case OP_EODN:
1812      case OP_CIRC:      case OP_FAIL:
1813      case OP_DOLL:      case OP_NCREF:
1814        case OP_NRREF:
1815      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1816        case OP_PRUNE:
1817        case OP_REVERSE:
1818        case OP_RREF:
1819        case OP_SET_SOM:
1820        case OP_SKIP:
1821        case OP_SOD:
1822        case OP_SOM:
1823        case OP_THEN:
1824      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1825      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1826      break;      break;
1827    
1828      /* Handle literal characters */      /* Handle literal characters */
1829    
1830      case OP_CHAR:      case OP_CHAR:
1831      case OP_CHARNC:      case OP_CHARI:
1832      case OP_NOT:      case OP_NOT:
1833        case OP_NOTI:
1834      branchlength++;      branchlength++;
1835      cc += 2;      cc += 2;
1836  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1837      if ((options & PCRE_UTF8) != 0)      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
       {  
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1838  #endif  #endif
1839      break;      break;
1840    
# Line 1271  for (;;) Line 1842  for (;;)
1842      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1843    
1844      case OP_EXACT:      case OP_EXACT:
1845        case OP_EXACTI:
1846        case OP_NOTEXACT:
1847        case OP_NOTEXACTI:
1848      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1849      cc += 4;      cc += 2 + IMM2_SIZE;
1850  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1851      if ((options & PCRE_UTF8) != 0)      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
       {  
       while((*cc & 0x80) == 0x80) cc++;  
       }  
1852  #endif  #endif
1853      break;      break;
1854    
1855      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1856      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1857      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1858      cc += 4;      cc += 1 + IMM2_SIZE + 1;
1859      break;      break;
1860    
1861      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1294  for (;;) Line 1865  for (;;)
1865      cc += 2;      cc += 2;
1866      /* Fall through */      /* Fall through */
1867    
1868        case OP_HSPACE:
1869        case OP_VSPACE:
1870        case OP_NOT_HSPACE:
1871        case OP_NOT_VSPACE:
1872      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1873      case OP_DIGIT:      case OP_DIGIT:
1874      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1306  for (;;) Line 1881  for (;;)
1881      cc++;      cc++;
1882      break;      break;
1883    
1884      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1885        otherwise \C is coded as OP_ALLANY. */
1886    
1887      case OP_ANYBYTE:      case OP_ANYBYTE:
1888      return -2;      return -2;
1889    
1890      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1891    
1892  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1893      case OP_XCLASS:      case OP_XCLASS:
1894      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1895      /* Fall through */      /* Fall through */
1896  #endif  #endif
1897    
1898      case OP_CLASS:      case OP_CLASS:
1899      case OP_NCLASS:      case OP_NCLASS:
1900      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1901    
1902      switch (*cc)      switch (*cc)
1903        {        {
1904          case OP_CRPLUS:
1905          case OP_CRMINPLUS:
1906        case OP_CRSTAR:        case OP_CRSTAR:
1907        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1908        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1333  for (;;) Line 1911  for (;;)
1911    
1912        case OP_CRRANGE:        case OP_CRRANGE:
1913        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1914        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1915        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
1916        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1917        break;        break;
1918    
1919        default:        default:
# Line 1345  for (;;) Line 1923  for (;;)
1923    
1924      /* Anything else is variable length */      /* Anything else is variable length */
1925    
1926      default:      case OP_ANYNL:
1927        case OP_BRAMINZERO:
1928        case OP_BRAPOS:
1929        case OP_BRAPOSZERO:
1930        case OP_BRAZERO:
1931        case OP_CBRAPOS:
1932        case OP_EXTUNI:
1933        case OP_KETRMAX:
1934        case OP_KETRMIN:
1935        case OP_KETRPOS:
1936        case OP_MINPLUS:
1937        case OP_MINPLUSI:
1938        case OP_MINQUERY:
1939        case OP_MINQUERYI:
1940        case OP_MINSTAR:
1941        case OP_MINSTARI:
1942        case OP_MINUPTO:
1943        case OP_MINUPTOI:
1944        case OP_NOTMINPLUS:
1945        case OP_NOTMINPLUSI:
1946        case OP_NOTMINQUERY:
1947        case OP_NOTMINQUERYI:
1948        case OP_NOTMINSTAR:
1949        case OP_NOTMINSTARI:
1950        case OP_NOTMINUPTO:
1951        case OP_NOTMINUPTOI:
1952        case OP_NOTPLUS:
1953        case OP_NOTPLUSI:
1954        case OP_NOTPOSPLUS:
1955        case OP_NOTPOSPLUSI:
1956        case OP_NOTPOSQUERY:
1957        case OP_NOTPOSQUERYI:
1958        case OP_NOTPOSSTAR:
1959        case OP_NOTPOSSTARI:
1960        case OP_NOTPOSUPTO:
1961        case OP_NOTPOSUPTOI:
1962        case OP_NOTQUERY:
1963        case OP_NOTQUERYI:
1964        case OP_NOTSTAR:
1965        case OP_NOTSTARI:
1966        case OP_NOTUPTO:
1967        case OP_NOTUPTOI:
1968        case OP_PLUS:
1969        case OP_PLUSI:
1970        case OP_POSPLUS:
1971        case OP_POSPLUSI:
1972        case OP_POSQUERY:
1973        case OP_POSQUERYI:
1974        case OP_POSSTAR:
1975        case OP_POSSTARI:
1976        case OP_POSUPTO:
1977        case OP_POSUPTOI:
1978        case OP_QUERY:
1979        case OP_QUERYI:
1980        case OP_REF:
1981        case OP_REFI:
1982        case OP_SBRA:
1983        case OP_SBRAPOS:
1984        case OP_SCBRA:
1985        case OP_SCBRAPOS:
1986        case OP_SCOND:
1987        case OP_SKIPZERO:
1988        case OP_STAR:
1989        case OP_STARI:
1990        case OP_TYPEMINPLUS:
1991        case OP_TYPEMINQUERY:
1992        case OP_TYPEMINSTAR:
1993        case OP_TYPEMINUPTO:
1994        case OP_TYPEPLUS:
1995        case OP_TYPEPOSPLUS:
1996        case OP_TYPEPOSQUERY:
1997        case OP_TYPEPOSSTAR:
1998        case OP_TYPEPOSUPTO:
1999        case OP_TYPEQUERY:
2000        case OP_TYPESTAR:
2001        case OP_TYPEUPTO:
2002        case OP_UPTO:
2003        case OP_UPTOI:
2004      return -1;      return -1;
2005    
2006        /* Catch unrecognized opcodes so that when new ones are added they
2007        are not forgotten, as has happened in the past. */
2008    
2009        default:
2010        return -4;
2011      }      }
2012    }    }
2013  /* Control never gets here */  /* Control never gets here */
# Line 1356  for (;;) Line 2017  for (;;)
2017    
2018    
2019  /*************************************************  /*************************************************
2020  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
2021  *************************************************/  *************************************************/
2022    
2023  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
2024  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
2025    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2026    so that it can be called from pcre_study() when finding the minimum matching
2027    length.
2028    
2029  Arguments:  Arguments:
2030    code        points to start of expression    code        points to start of expression
2031    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2032    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
2033    
2034  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2035  */  */
2036    
2037  static const uschar *  const pcre_uchar *
2038  find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2039  {  {
2040  for (;;)  for (;;)
2041    {    {
2042    register int c = *code;    register int c = *code;
2043    
2044    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
2045    
2046    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1384  for (;;) Line 2049  for (;;)
2049    
2050    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
2051    
2052      /* Handle recursion */
2053    
2054      else if (c == OP_REVERSE)
2055        {
2056        if (number < 0) return (pcre_uchar *)code;
2057        code += PRIV(OP_lengths)[c];
2058        }
2059    
2060    /* Handle capturing bracket */    /* Handle capturing bracket */
2061    
2062    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
2063               c == OP_CBRAPOS || c == OP_SCBRAPOS)
2064      {      {
2065      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
2066      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2067      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2068      }      }
2069    
2070    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
2071    repeated character types, we have to test for \p and \P, which have an extra    repeated character types, we have to test for \p and \P, which have an extra
2072    two bytes of parameters. */    two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2073      must add in its length. */
2074    
2075    else    else
2076      {      {
# Line 1417  for (;;) Line 2092  for (;;)
2092        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2093        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2094        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2095        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2096            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2097          break;
2098    
2099          case OP_MARK:
2100          case OP_PRUNE_ARG:
2101          case OP_SKIP_ARG:
2102          code += code[1];
2103          break;
2104    
2105          case OP_THEN_ARG:
2106          code += code[1];
2107        break;        break;
2108        }        }
2109    
2110      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2111    
2112      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2113    
2114    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2115    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2116    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2117    
2118  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2119      if (utf8) switch(c)      if (utf) switch(c)
2120        {        {
2121        case OP_CHAR:        case OP_CHAR:
2122        case OP_CHARNC:        case OP_CHARI:
2123        case OP_EXACT:        case OP_EXACT:
2124          case OP_EXACTI:
2125        case OP_UPTO:        case OP_UPTO:
2126          case OP_UPTOI:
2127        case OP_MINUPTO:        case OP_MINUPTO:
2128          case OP_MINUPTOI:
2129        case OP_POSUPTO:        case OP_POSUPTO:
2130          case OP_POSUPTOI:
2131        case OP_STAR:        case OP_STAR:
2132          case OP_STARI:
2133        case OP_MINSTAR:        case OP_MINSTAR:
2134          case OP_MINSTARI:
2135        case OP_POSSTAR:        case OP_POSSTAR:
2136          case OP_POSSTARI:
2137        case OP_PLUS:        case OP_PLUS:
2138          case OP_PLUSI:
2139        case OP_MINPLUS:        case OP_MINPLUS:
2140          case OP_MINPLUSI:
2141        case OP_POSPLUS:        case OP_POSPLUS:
2142          case OP_POSPLUSI:
2143        case OP_QUERY:        case OP_QUERY:
2144          case OP_QUERYI:
2145        case OP_MINQUERY:        case OP_MINQUERY:
2146          case OP_MINQUERYI:
2147        case OP_POSQUERY:        case OP_POSQUERY:
2148        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2149          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2150        break;        break;
2151        }        }
2152    #else
2153        (void)(utf);  /* Keep compiler happy by referencing function argument */
2154  #endif  #endif
2155      }      }
2156    }    }
# Line 1466  instance of OP_RECURSE. Line 2167  instance of OP_RECURSE.
2167    
2168  Arguments:  Arguments:
2169    code        points to start of expression    code        points to start of expression
2170    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2171    
2172  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2173  */  */
2174    
2175  static const uschar *  static const pcre_uchar *
2176  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2177  {  {
2178  for (;;)  for (;;)
2179    {    {
# Line 1488  for (;;) Line 2189  for (;;)
2189    
2190    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
2191    repeated character types, we have to test for \p and \P, which have an extra    repeated character types, we have to test for \p and \P, which have an extra
2192    two bytes of parameters. */    two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2193      must add in its length. */
2194    
2195    else    else
2196      {      {
# Line 1510  for (;;) Line 2212  for (;;)
2212        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2213        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2214        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2215        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2216            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2217          break;
2218    
2219          case OP_MARK:
2220          case OP_PRUNE_ARG:
2221          case OP_SKIP_ARG:
2222          code += code[1];
2223          break;
2224    
2225          case OP_THEN_ARG:
2226          code += code[1];
2227        break;        break;
2228        }        }
2229    
2230      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2231    
2232      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2233    
2234      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2235      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
2236      to arrange to skip the extra bytes. */      to arrange to skip the extra bytes. */
2237    
2238  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2239      if (utf8) switch(c)      if (utf) switch(c)
2240        {        {
2241        case OP_CHAR:        case OP_CHAR:
2242        case OP_CHARNC:        case OP_CHARI:
2243          case OP_NOT:
2244          case OP_NOTI:
2245        case OP_EXACT:        case OP_EXACT:
2246          case OP_EXACTI:
2247          case OP_NOTEXACT:
2248          case OP_NOTEXACTI:
2249        case OP_UPTO:        case OP_UPTO:
2250          case OP_UPTOI:
2251          case OP_NOTUPTO:
2252          case OP_NOTUPTOI:
2253        case OP_MINUPTO:        case OP_MINUPTO:
2254          case OP_MINUPTOI:
2255          case OP_NOTMINUPTO:
2256          case OP_NOTMINUPTOI:
2257        case OP_POSUPTO:        case OP_POSUPTO:
2258          case OP_POSUPTOI:
2259          case OP_NOTPOSUPTO:
2260          case OP_NOTPOSUPTOI:
2261        case OP_STAR:        case OP_STAR:
2262          case OP_STARI:
2263          case OP_NOTSTAR:
2264          case OP_NOTSTARI:
2265        case OP_MINSTAR:        case OP_MINSTAR:
2266          case OP_MINSTARI:
2267          case OP_NOTMINSTAR:
2268          case OP_NOTMINSTARI:
2269        case OP_POSSTAR:        case OP_POSSTAR:
2270          case OP_POSSTARI:
2271          case OP_NOTPOSSTAR:
2272          case OP_NOTPOSSTARI:
2273        case OP_PLUS:        case OP_PLUS:
2274          case OP_PLUSI:
2275          case OP_NOTPLUS:
2276          case OP_NOTPLUSI:
2277        case OP_MINPLUS:        case OP_MINPLUS:
2278          case OP_MINPLUSI:
2279          case OP_NOTMINPLUS:
2280          case OP_NOTMINPLUSI:
2281        case OP_POSPLUS:        case OP_POSPLUS:
2282          case OP_POSPLUSI:
2283          case OP_NOTPOSPLUS:
2284          case OP_NOTPOSPLUSI:
2285        case OP_QUERY:        case OP_QUERY:
2286          case OP_QUERYI:
2287          case OP_NOTQUERY:
2288          case OP_NOTQUERYI:
2289        case OP_MINQUERY:        case OP_MINQUERY:
2290          case OP_MINQUERYI:
2291          case OP_NOTMINQUERY:
2292          case OP_NOTMINQUERYI:
2293        case OP_POSQUERY:        case OP_POSQUERY:
2294        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2295          case OP_NOTPOSQUERY:
2296          case OP_NOTPOSQUERYI:
2297          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2298        break;        break;
2299        }        }
2300    #else
2301        (void)(utf);  /* Keep compiler happy by referencing function argument */
2302  #endif  #endif
2303      }      }
2304    }    }
# Line 1565  bracket whose current branch will alread Line 2321  bracket whose current branch will alread
2321  Arguments:  Arguments:
2322    code        points to start of search    code        points to start of search
2323    endcode     points to where to stop    endcode     points to where to stop
2324    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2325      cd          contains pointers to tables etc.
2326    
2327  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2328  */  */
2329    
2330  static BOOL  static BOOL
2331  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2332      BOOL utf, compile_data *cd)
2333  {  {
2334  register int c;  register int c;
2335  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2336       code < endcode;       code < endcode;
2337       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2338    {    {
2339    const uschar *ccode;    const pcre_uchar *ccode;
2340    
2341    c = *code;    c = *code;
2342    
# Line 1592  for (code = first_significant_code(code Line 2350  for (code = first_significant_code(code
2350      continue;      continue;
2351      }      }
2352    
2353      /* For a recursion/subroutine call, if its end has been reached, which
2354      implies a backward reference subroutine call, we can scan it. If it's a
2355      forward reference subroutine call, we can't. To detect forward reference
2356      we have to scan up the list that is kept in the workspace. This function is
2357      called only when doing the real compile, not during the pre-compile that
2358      measures the size of the compiled pattern. */
2359    
2360      if (c == OP_RECURSE)
2361        {
2362        const pcre_uchar *scode;
2363        BOOL empty_branch;
2364    
2365        /* Test for forward reference */
2366    
2367        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2368          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2369    
2370        /* Not a forward reference, test for completed backward reference */
2371    
2372        empty_branch = FALSE;
2373        scode = cd->start_code + GET(code, 1);
2374        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2375    
2376        /* Completed backwards reference */
2377    
2378        do
2379          {
2380          if (could_be_empty_branch(scode, endcode, utf, cd))
2381            {
2382            empty_branch = TRUE;
2383            break;
2384            }
2385          scode += GET(scode, 1);
2386          }
2387        while (*scode == OP_ALT);
2388    
2389        if (!empty_branch) return FALSE;  /* All branches are non-empty */
2390        continue;
2391        }
2392    
2393    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
2394    
2395    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2396          c == OP_BRAPOSZERO)
2397        {
2398        code += PRIV(OP_lengths)[c];
2399        do code += GET(code, 1); while (*code == OP_ALT);
2400        c = *code;
2401        continue;
2402        }
2403    
2404      /* A nested group that is already marked as "could be empty" can just be
2405      skipped. */
2406    
2407      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2408          c == OP_SCBRA || c == OP_SCBRAPOS)
2409      {      {
     code += _pcre_OP_lengths[c];  
2410      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
2411      c = *code;      c = *code;
2412      continue;      continue;
# Line 1604  for (code = first_significant_code(code Line 2414  for (code = first_significant_code(code
2414    
2415    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2416    
2417    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2418          c == OP_CBRA || c == OP_CBRAPOS ||
2419          c == OP_ONCE || c == OP_ONCE_NC ||
2420          c == OP_COND)
2421      {      {
2422      BOOL empty_branch;      BOOL empty_branch;
2423      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2424    
2425      /* Scan a closed bracket */      /* If a conditional group has only one branch, there is a second, implied,
2426        empty branch, so just skip over the conditional, because it could be empty.
2427        Otherwise, scan the individual branches of the group. */
2428    
2429      empty_branch = FALSE;      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
     do  
       {  
       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))  
         empty_branch = TRUE;  
2430        code += GET(code, 1);        code += GET(code, 1);
2431        else
2432          {
2433          empty_branch = FALSE;
2434          do
2435            {
2436            if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2437              empty_branch = TRUE;
2438            code += GET(code, 1);
2439            }
2440          while (*code == OP_ALT);
2441          if (!empty_branch) return FALSE;   /* All branches are non-empty */
2442        }        }
2443      while (*code == OP_ALT);  
     if (!empty_branch) return FALSE;   /* All branches are non-empty */  
2444      c = *code;      c = *code;
2445      continue;      continue;
2446      }      }
# Line 1630  for (code = first_significant_code(code Line 2451  for (code = first_significant_code(code
2451      {      {
2452      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2453      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2454      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2455      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2456      here. */      here. */
2457    
2458  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2459      case OP_XCLASS:      case OP_XCLASS:
2460      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2461      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 1642  for (code = first_significant_code(code Line 2463  for (code = first_significant_code(code
2463    
2464      case OP_CLASS:      case OP_CLASS:
2465      case OP_NCLASS:      case OP_NCLASS:
2466      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2467    
2468  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2469      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2470  #endif  #endif
2471    
# Line 1683  for (code = first_significant_code(code Line 2504  for (code = first_significant_code(code
2504      case OP_ALLANY:      case OP_ALLANY:
2505      case OP_ANYBYTE:      case OP_ANYBYTE:
2506      case OP_CHAR:      case OP_CHAR:
2507      case OP_CHARNC:      case OP_CHARI:
2508      case OP_NOT:      case OP_NOT:
2509        case OP_NOTI:
2510      case OP_PLUS:      case OP_PLUS:
2511      case OP_MINPLUS:      case OP_MINPLUS:
2512      case OP_POSPLUS:      case OP_POSPLUS:
# Line 1716  for (code = first_significant_code(code Line 2538  for (code = first_significant_code(code
2538      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2539      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2540      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2541      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP
2542          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2543      break;      break;
2544    
2545      /* End of branch */      /* End of branch */
# Line 1724  for (code = first_significant_code(code Line 2547  for (code = first_significant_code(code
2547      case OP_KET:      case OP_KET:
2548      case OP_KETRMAX:      case OP_KETRMAX:
2549      case OP_KETRMIN:      case OP_KETRMIN:
2550        case OP_KETRPOS:
2551      case OP_ALT:      case OP_ALT:
2552      return TRUE;      return TRUE;
2553    
2554      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2555      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
2556    
2557  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2558      case OP_STAR:      case OP_STAR:
2559        case OP_STARI:
2560      case OP_MINSTAR:      case OP_MINSTAR:
2561        case OP_MINSTARI:
2562      case OP_POSSTAR:      case OP_POSSTAR:
2563        case OP_POSSTARI:
2564      case OP_QUERY:      case OP_QUERY:
2565        case OP_QUERYI:
2566      case OP_MINQUERY:      case OP_MINQUERY:
2567        case OP_MINQUERYI:
2568      case OP_POSQUERY:      case OP_POSQUERY:
2569        case OP_POSQUERYI:
2570        if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2571        break;
2572    
2573      case OP_UPTO:      case OP_UPTO:
2574        case OP_UPTOI:
2575      case OP_MINUPTO:      case OP_MINUPTO:
2576        case OP_MINUPTOI:
2577      case OP_POSUPTO:      case OP_POSUPTO:
2578      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      case OP_POSUPTOI:
2579        if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2580      break;      break;
2581  #endif  #endif
2582    
2583        /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2584        string. */
2585    
2586        case OP_MARK:
2587        case OP_PRUNE_ARG:
2588        case OP_SKIP_ARG:
2589        code += code[1];
2590        break;
2591    
2592        case OP_THEN_ARG:
2593        code += code[1];
2594        break;
2595    
2596        /* None of the remaining opcodes are required to match a character. */
2597    
2598        default:
2599        break;
2600      }      }
2601    }    }
2602    
# Line 1759  return TRUE; Line 2613  return TRUE;
2613  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2614  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2615  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2616    This function is called only during the real compile, not during the
2617    pre-compile.
2618    
2619  Arguments:  Arguments:
2620    code        points to start of the recursion    code        points to start of the recursion
2621    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2622    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2623    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2624      cd          pointers to tables etc
2625    
2626  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2627  */  */
2628    
2629  static BOOL  static BOOL
2630  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2631    BOOL utf8)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2632  {  {
2633  while (bcptr != NULL && bcptr->current >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2634    {    {
2635    if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2636        return FALSE;
2637    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2638    }    }
2639  return TRUE;  return TRUE;
# Line 1807  where Perl recognizes it as the POSIX cl Line 2665  where Perl recognizes it as the POSIX cl
2665  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2666  I think.  I think.
2667    
2668    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2669    It seems that the appearance of a nested POSIX class supersedes an apparent
2670    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2671    a digit.
2672    
2673    In Perl, unescaped square brackets may also appear as part of class names. For
2674    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2675    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2676    seem right at all. PCRE does not allow closing square brackets in POSIX class
2677    names.
2678    
2679  Arguments:  Arguments:
2680    ptr      pointer to the initial [    ptr      pointer to the initial [
2681    endptr   where to return the end pointer    endptr   where to return the end pointer
# Line 1815  Returns:   TRUE or FALSE Line 2684  Returns:   TRUE or FALSE
2684  */  */
2685    
2686  static BOOL  static BOOL
2687  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2688  {  {
2689  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2690  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2691  for (++ptr; *ptr != 0; ptr++)  for (++ptr; *ptr != 0; ptr++)
2692    {    {
2693    if (*ptr == '\\' && ptr[1] == ']') ptr++; else    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2694        ptr++;
2695      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2696      else
2697      {      {
2698      if (*ptr == ']') return FALSE;      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
     if (*ptr == terminator && ptr[1] == ']')  
2699        {        {
2700        *endptr = ptr;        *endptr = ptr;
2701        return TRUE;        return TRUE;
2702        }        }
2703        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2704             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2705              ptr[1] == CHAR_EQUALS_SIGN) &&
2706            check_posix_syntax(ptr, endptr))
2707          return FALSE;
2708      }      }
2709    }    }
2710  return FALSE;  return FALSE;
# Line 1852  Returns:     a value representing the na Line 2728  Returns:     a value representing the na
2728  */  */
2729    
2730  static int  static int
2731  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2732  {  {
2733  const char *pn = posix_names;  const char *pn = posix_names;
2734  register int yield = 0;  register int yield = 0;
2735  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2736    {    {
2737    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2738      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2739    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2740    yield++;    yield++;
2741    }    }
# Line 1891  value in the reference (which is a group Line 2767  value in the reference (which is a group
2767  Arguments:  Arguments:
2768    group      points to the start of the group    group      points to the start of the group
2769    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
2770    utf8       TRUE in UTF-8 mode    utf        TRUE in UTF-8 / UTF-16 mode
2771    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
2772    save_hwm   the hwm forward reference pointer at the start of the group    save_hwm   the hwm forward reference pointer at the start of the group
2773    
# Line 1899  Returns:     nothing Line 2775  Returns:     nothing
2775  */  */
2776    
2777  static void  static void
2778  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2779    uschar *save_hwm)    pcre_uchar *save_hwm)
2780  {  {
2781  uschar *ptr = group;  pcre_uchar *ptr = group;
2782    
2783  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2784    {    {
2785    int offset;    int offset;
2786    uschar *hc;    pcre_uchar *hc;
2787    
2788    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2789    reference. */    reference. */
# Line 1952  Arguments: Line 2828  Arguments:
2828  Returns:         new code pointer  Returns:         new code pointer
2829  */  */
2830    
2831  static uschar *  static pcre_uchar *
2832  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2833  {  {
2834  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2835  *code++ = 255;  *code++ = 255;
2836  PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2837  PUT(code, LINK_SIZE, 0);                /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2838  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2839  }  }
2840    
2841    
# Line 1981  Returns:             nothing Line 2857  Returns:             nothing
2857  */  */
2858    
2859  static void  static void
2860  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2861  {  {
2862  int length = ptr - cd->start_pattern - GET(previous_callout, 2);  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2863  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
2864  }  }
2865    
# Line 2033  for (++c; c <= d; c++) Line 2909  for (++c; c <= d; c++)
2909    
2910  return TRUE;  return TRUE;
2911  }  }
2912    
2913    
2914    
2915    /*************************************************
2916    *        Check a character and a property        *
2917    *************************************************/
2918    
2919    /* This function is called by check_auto_possessive() when a property item
2920    is adjacent to a fixed character.
2921    
2922    Arguments:
2923      c            the character
2924      ptype        the property type
2925      pdata        the data for the type
2926      negated      TRUE if it's a negated property (\P or \p{^)
2927    
2928    Returns:       TRUE if auto-possessifying is OK
2929    */
2930    
2931    static BOOL
2932    check_char_prop(int c, int ptype, int pdata, BOOL negated)
2933    {
2934    const ucd_record *prop = GET_UCD(c);
2935    switch(ptype)
2936      {
2937      case PT_LAMP:
2938      return (prop->chartype == ucp_Lu ||
2939              prop->chartype == ucp_Ll ||
2940              prop->chartype == ucp_Lt) == negated;
2941    
2942      case PT_GC:
2943      return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2944    
2945      case PT_PC:
2946      return (pdata == prop->chartype) == negated;
2947    
2948      case PT_SC:
2949      return (pdata == prop->script) == negated;
2950    
2951      /* These are specials */
2952    
2953      case PT_ALNUM:
2954      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2955              PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2956    
2957      case PT_SPACE:    /* Perl space */
2958      return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2959              c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2960              == negated;
2961    
2962      case PT_PXSPACE:  /* POSIX space */
2963      return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2964              c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2965              c == CHAR_FF || c == CHAR_CR)
2966              == negated;
2967    
2968      case PT_WORD:
2969      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2970              PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2971              c == CHAR_UNDERSCORE) == negated;
2972      }
2973    return FALSE;
2974    }
2975  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2976    
2977    
# Line 2046  whether the next thing could possibly ma Line 2985  whether the next thing could possibly ma
2985  sense to automatically possessify the repeated item.  sense to automatically possessify the repeated item.
2986    
2987  Arguments:  Arguments:
2988    op_code       the repeated op code    previous      pointer to the repeated opcode
2989    this          data for this item, depends on the opcode    utf           TRUE in UTF-8 / UTF-16 mode
   utf8          TRUE in UTF-8 mode  
   utf8_char     used for utf8 character bytes, NULL if not relevant  
2990    ptr           next character in pattern    ptr           next character in pattern
2991    options       options bits    options       options bits
2992    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 2058  Returns:        TRUE if possessifying is Line 2995  Returns:        TRUE if possessifying is
2995  */  */
2996    
2997  static BOOL  static BOOL
2998  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2999    const uschar *ptr, int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
3000  {  {
3001  int next;  pcre_int32 c, next;
3002    int op_code = *previous++;
3003    
3004  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
3005    
# Line 2069  if ((options & PCRE_EXTENDED) != 0) Line 3007  if ((options & PCRE_EXTENDED) != 0)
3007    {    {
3008    for (;;)    for (;;)
3009      {      {
3010      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3011      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
3012        {        {
3013        while (*(++ptr) != 0)        ptr++;
3014          while (*ptr != 0)
3015            {
3016          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3017            ptr++;
3018    #ifdef SUPPORT_UTF
3019            if (utf) FORWARDCHAR(ptr);
3020    #endif
3021            }
3022        }        }
3023      else break;      else break;
3024      }      }
# Line 2082  if ((options & PCRE_EXTENDED) != 0) Line 3027  if ((options & PCRE_EXTENDED) != 0)
3027  /* If the next item is one that we can handle, get its value. A non-negative  /* If the next item is one that we can handle, get its value. A non-negative
3028  value is a character, a negative value is an escape value. */  value is a character, a negative value is an escape value. */
3029    
3030  if (*ptr == '\\')  if (*ptr == CHAR_BACKSLASH)
3031    {    {
3032    int temperrorcode = 0;    int temperrorcode = 0;
3033    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
3034    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
3035    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
3036    }    }
3037    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
3038    {    {
3039  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3040    if (utf8) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
3041  #endif  #endif
3042    next = *ptr++;    next = *ptr++;
3043    }    }
   
3044  else return FALSE;  else return FALSE;
3045    
3046  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2106  if ((options & PCRE_EXTENDED) != 0) Line 3049  if ((options & PCRE_EXTENDED) != 0)
3049    {    {
3050    for (;;)    for (;;)
3051      {      {
3052      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3053      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
3054        {        {
3055        while (*(++ptr) != 0)        ptr++;
3056          while (*ptr != 0)
3057            {
3058          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3059            ptr++;
3060    #ifdef SUPPORT_UTF
3061            if (utf) FORWARDCHAR(ptr);
3062    #endif
3063            }
3064        }        }
3065      else break;      else break;
3066      }      }
# Line 2118  if ((options & PCRE_EXTENDED) != 0) Line 3068  if ((options & PCRE_EXTENDED) != 0)
3068    
3069  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
3070    
3071  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3072    return FALSE;    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3073        return FALSE;
 /* Now compare the next item with the previous opcode. If the previous is a  
 positive single character match, "item" either contains the character or, if  
 "item" is greater than 127 in utf8 mode, the character's bytes are in  
 utf8_char. */  
   
3074    
3075  /* Handle cases when the next item is a character. */  /* Now compare the next item with the previous opcode. First, handle cases when
3076    the next item is a character. */
3077    
3078  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3079    {    {
3080    case OP_CHAR:    case OP_CHAR:
3081  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3082    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
3083    #else
3084      c = *previous;
3085  #endif  #endif
3086    return item != next;    return c != next;
3087    
3088    /* For CHARNC (caseless character) we must check the other case. If we have    /* For CHARI (caseless character) we must check the other case. If we have
3089    Unicode property support, we can use it to test the other case of    Unicode property support, we can use it to test the other case of
3090    high-valued characters. */    high-valued characters. */
3091    
3092    case OP_CHARNC:    case OP_CHARI:
3093  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3094    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
3095  #endif  #else
3096    if (item == next) return FALSE;    c = *previous;
3097  #ifdef SUPPORT_UTF8  #endif
3098    if (utf8)    if (c == next) return FALSE;
3099    #ifdef SUPPORT_UTF
3100      if (utf)
3101      {      {
3102      unsigned int othercase;      unsigned int othercase;
3103      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2156  if (next >= 0) switch(op_code) Line 3106  if (next >= 0) switch(op_code)
3106  #else  #else
3107      othercase = NOTACHAR;      othercase = NOTACHAR;
3108  #endif  #endif
3109      return (unsigned int)item != othercase;      return (unsigned int)c != othercase;
3110      }      }
3111    else    else
3112  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3113    return (item != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
   
   /* For OP_NOT, "item" must be a single-byte character. */  
3114    
3115    case OP_NOT:    case OP_NOT:
3116    if (item == next) return TRUE;  #ifdef SUPPORT_UTF
3117    if ((options & PCRE_CASELESS) == 0) return FALSE;    GETCHARTEST(c, previous);
3118  #ifdef SUPPORT_UTF8  #else
3119    if (utf8)    c = *previous;
3120    #endif
3121      return c == next;
3122    
3123      case OP_NOTI:
3124    #ifdef SUPPORT_UTF
3125      GETCHARTEST(c, previous);
3126    #else
3127      c = *previous;
3128    #endif
3129      if (c == next) return TRUE;
3130    #ifdef SUPPORT_UTF
3131      if (utf)
3132      {      {
3133      unsigned int othercase;      unsigned int othercase;
3134      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
3135  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3136      othercase = UCD_OTHERCASE(next);      othercase = UCD_OTHERCASE((unsigned int)next);
3137  #else  #else
3138      othercase = NOTACHAR;      othercase = NOTACHAR;
3139  #endif  #endif
3140      return (unsigned int)item == othercase;      return (unsigned int)c == othercase;
3141      }      }
3142    else    else
3143  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3144    return (item == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3145    
3146      /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3147      When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3148    
3149    case OP_DIGIT:    case OP_DIGIT:
3150    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3151    
3152    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3153    return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3154    
3155    case OP_WHITESPACE:    case OP_WHITESPACE:
3156    return next > 127 || (cd->ctypes[next] & ctype_space) == 0;    return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3157    
3158    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3159    return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3160    
3161    case OP_WORDCHAR:    case OP_WORDCHAR:
3162    return next > 127 || (cd->ctypes[next] & ctype_word) == 0;    return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3163    
3164    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3165    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3166    
3167    case OP_HSPACE:    case OP_HSPACE:
3168    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
# Line 2224  if (next >= 0) switch(op_code) Line 3187  if (next >= 0) switch(op_code)
3187      case 0x202f:      case 0x202f:
3188      case 0x205f:      case 0x205f:
3189      case 0x3000:      case 0x3000:
3190      return op_code != OP_HSPACE;      return op_code == OP_NOT_HSPACE;
3191      default:      default:
3192      return op_code == OP_HSPACE;      return op_code != OP_NOT_HSPACE;
3193      }      }
3194    
3195      case OP_ANYNL:
3196    case OP_VSPACE:    case OP_VSPACE:
3197    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3198    switch(next)    switch(next)
# Line 2240  if (next >= 0) switch(op_code) Line 3204  if (next >= 0) switch(op_code)
3204      case 0x85:      case 0x85:
3205      case 0x2028:      case 0x2028:
3206      case 0x2029:      case 0x2029:
3207      return op_code != OP_VSPACE;      return op_code == OP_NOT_VSPACE;
3208      default:      default:
3209      return op_code == OP_VSPACE;      return op_code != OP_NOT_VSPACE;
3210      }      }
3211    
3212    #ifdef SUPPORT_UCP
3213      case OP_PROP:
3214      return check_char_prop(next, previous[0], previous[1], FALSE);
3215    
3216      case OP_NOTPROP:
3217      return check_char_prop(next, previous[0], previous[1], TRUE);
3218    #endif
3219    
3220    default:    default:
3221    return FALSE;    return FALSE;
3222    }    }
3223    
3224    
3225  /* Handle the case when the next item is \d, \s, etc. */  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3226    is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3227    generated only when PCRE_UCP is *not* set, that is, when only ASCII
3228    characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3229    replaced by OP_PROP codes when PCRE_UCP is set. */
3230    
3231  switch(op_code)  switch(op_code)
3232    {    {
3233    case OP_CHAR:    case OP_CHAR:
3234    case OP_CHARNC:    case OP_CHARI:
3235  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3236    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
3237    #else
3238      c = *previous;
3239  #endif  #endif
3240    switch(-next)    switch(-next)
3241      {      {
3242      case ESC_d:      case ESC_d:
3243      return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;      return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3244    
3245      case ESC_D:      case ESC_D:
3246      return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3247    
3248      case ESC_s:      case ESC_s:
3249      return item > 127 || (cd->ctypes[item] & ctype_space) == 0;      return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3250    
3251      case ESC_S:      case ESC_S:
3252      return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3253    
3254      case ESC_w:      case ESC_w:
3255      return item > 127 || (cd->ctypes[item] & ctype_word) == 0;      return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3256    
3257      case ESC_W:      case ESC_W:
3258      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3259    
3260      case ESC_h:      case ESC_h:
3261      case ESC_H:      case ESC_H:
3262      switch(item)      switch(c)
3263        {        {
3264        case 0x09:        case 0x09:
3265        case 0x20:        case 0x20:
# Line 2309  switch(op_code) Line 3287  switch(op_code)
3287    
3288      case ESC_v:      case ESC_v:
3289      case ESC_V:      case ESC_V:
3290      switch(item)      switch(c)
3291        {        {
3292        case 0x0a:        case 0x0a:
3293        case 0x0b:        case 0x0b:
# Line 2323  switch(op_code) Line 3301  switch(op_code)
3301        return -next == ESC_v;        return -next == ESC_v;
3302        }        }
3303    
3304        /* When PCRE_UCP is set, these values get generated for \d etc. Find
3305        their substitutions and process them. The result will always be either
3306        -ESC_p or -ESC_P. Then fall through to process those values. */
3307    
3308    #ifdef SUPPORT_UCP
3309        case ESC_du:
3310        case ESC_DU:
3311        case ESC_wu:
3312        case ESC_WU:
3313        case ESC_su:
3314        case ESC_SU:
3315          {
3316          int temperrorcode = 0;
3317          ptr = substitutes[-next - ESC_DU];
3318          next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3319          if (temperrorcode != 0) return FALSE;
3320          ptr++;    /* For compatibility */
3321          }
3322        /* Fall through */
3323    
3324        case ESC_p:
3325        case ESC_P:
3326          {
3327          int ptype, pdata, errorcodeptr;
3328          BOOL negated;
3329    
3330          ptr--;      /* Make ptr point at the p or P */
3331          ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3332          if (ptype < 0) return FALSE;
3333          ptr++;      /* Point past the final curly ket */
3334    
3335          /* If the property item is optional, we have to give up. (When generated
3336          from \d etc by PCRE_UCP, this test will have been applied much earlier,
3337          to the original \d etc. At this point, ptr will point to a zero byte.) */
3338    
3339          if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3340            STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3341              return FALSE;
3342    
3343          /* Do the property check. */
3344    
3345          return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3346          }
3347    #endif
3348    
3349      default:      default:
3350      return FALSE;      return FALSE;
3351      }      }
3352    
3353      /* In principle, support for Unicode properties should be integrated here as
3354      well. It means re-organizing the above code so as to get hold of the property
3355      values before switching on the op-code. However, I wonder how many patterns
3356      combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3357      these op-codes are never generated.) */
3358    
3359    case OP_DIGIT:    case OP_DIGIT:
3360    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3361           next == -ESC_h || next == -ESC_v;           next == -ESC_h || next == -ESC_v || next == -ESC_R;
3362    
3363    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3364    return next == -ESC_d;    return next == -ESC_d;
# Line 2338  switch(op_code) Line 3367  switch(op_code)
3367    return next == -ESC_S || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_d || next == -ESC_w;
3368    
3369    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3370    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
3371    
3372    case OP_HSPACE:    case OP_HSPACE:
3373    return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3374             next == -ESC_w || next == -ESC_v || next == -ESC_R;
3375    
3376    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
3377    return next == -ESC_h;    return next == -ESC_h;
3378    
3379    /* Can't have \S in here because VT matches \S (Perl anomaly) */    /* Can't have \S in here because VT matches \S (Perl anomaly) */
3380      case OP_ANYNL:
3381    case OP_VSPACE:    case OP_VSPACE:
3382    return next == -ESC_V || next == -ESC_d || next == -ESC_w;    return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3383    
3384    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3385    return next == -ESC_v;    return next == -ESC_v || next == -ESC_R;
3386    
3387    case OP_WORDCHAR:    case OP_WORDCHAR:
3388    return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3389             next == -ESC_v || next == -ESC_R;
3390    
3391    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3392    return next == -ESC_w || next == -ESC_d;    return next == -ESC_w || next == -ESC_d;
# Line 2383  Arguments: Line 3415  Arguments:
3415    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3416    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3417    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3418    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3419    reqbyteptr     set to the last literal character required, else < 0    reqcharptr     set to the last literal character required, else < 0
3420    bcptr          points to current branch chain    bcptr          points to current branch chain
3421      cond_depth     conditional nesting depth
3422    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3423    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3424                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 2395  Returns:         TRUE on success Line 3428  Returns:         TRUE on success
3428  */  */
3429    
3430  static BOOL  static BOOL
3431  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3432    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3433      pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3434    compile_data *cd, int *lengthptr)    compile_data *cd, int *lengthptr)
3435  {  {
3436  int repeat_type, op_type;  int repeat_type, op_type;
3437  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3438  int bravalue = 0;  int bravalue = 0;
3439  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3440  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
3441  int zeroreqbyte, zerofirstbyte;  pcre_int32 zeroreqchar, zerofirstchar;
3442  int req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
3443  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3444  int after_manual_callout = 0;  int after_manual_callout = 0;
3445  int length_prevgroup = 0;  int length_prevgroup = 0;
3446  register int c;  register int c;
3447  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3448  uschar *last_code = code;  pcre_uchar *last_code = code;
3449  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3450  uschar *tempcode;  pcre_uchar *tempcode;
3451  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3452  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3453  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3454  const uschar *tempptr;  const pcre_uchar *tempptr;
3455  uschar *previous = NULL;  const pcre_uchar *nestptr = NULL;
3456  uschar *previous_callout = NULL;  pcre_uchar *previous = NULL;
3457  uschar *save_hwm = NULL;  pcre_uchar *previous_callout = NULL;
3458  uschar classbits[32];  pcre_uchar *save_hwm = NULL;
3459    pcre_uint8 classbits[32];
3460  #ifdef SUPPORT_UTF8  
3461  BOOL class_utf8;  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3462  BOOL utf8 = (options & PCRE_UTF8) != 0;  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3463  uschar *class_utf8data;  dynamically as we process the pattern. */
3464  uschar *class_utf8data_base;  
3465  uschar utf8_char[6];  #ifdef SUPPORT_UTF
3466    /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3467    BOOL utf = (options & PCRE_UTF8) != 0;
3468    pcre_uchar utf_chars[6];
3469  #else  #else
3470  BOOL utf8 = FALSE;  BOOL utf = FALSE;
3471  uschar *utf8_char = NULL;  #endif
3472    
3473    /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3474    
3475    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3476    BOOL xclass;
3477    pcre_uchar *class_uchardata;
3478    pcre_uchar *class_uchardata_base;
3479  #endif  #endif
3480    
3481  #ifdef DEBUG  #ifdef PCRE_DEBUG
3482  if (lengthptr != NULL) DPRINTF((">> start branch\n"));  if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3483  #endif  #endif
3484    
# Line 2445  greedy_non_default = greedy_default ^ 1; Line 3489  greedy_non_default = greedy_default ^ 1;
3489    
3490  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3491  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3492  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3493  find one.  find one.
3494    
3495  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3496  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3497  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3498  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3499    
3500  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3501    
3502  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3503  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3504  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS leaves the lower 28 bits empty. It is added into the
3505  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3506    value. This is used only for ASCII characters. */
3507    
3508  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3509    
3510  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3511    
# Line 2472  for (;; ptr++) Line 3517  for (;; ptr++)
3517    BOOL is_quantifier;    BOOL is_quantifier;
3518    BOOL is_recurse;    BOOL is_recurse;
3519    BOOL reset_bracount;    BOOL reset_bracount;
3520    int class_charcount;    int class_has_8bitchar;
3521    int class_lastchar;    int class_single_char;
3522    int newoptions;    int newoptions;
3523    int recno;    int recno;
3524    int refsign;    int refsign;
3525    int skipbytes;    int skipbytes;
3526    int subreqbyte;    int subreqchar;
3527    int subfirstbyte;    int subfirstchar;
3528    int terminator;    int terminator;
3529    int mclength;    int mclength;
3530    uschar mcbuffer[8];    int tempbracount;
3531      pcre_uchar mcbuffer[8];
3532    
3533    /* Get next byte in the pattern */    /* Get next character in the pattern */
3534    
3535    c = *ptr;    c = *ptr;
3536    
3537      /* If we are at the end of a nested substitution, revert to the outer level
3538      string. Nesting only happens one level deep. */
3539    
3540      if (c == 0 && nestptr != NULL)
3541        {
3542        ptr = nestptr;
3543        nestptr = NULL;
3544        c = *ptr;
3545        }
3546    
3547    /* If we are in the pre-compile phase, accumulate the length used for the    /* If we are in the pre-compile phase, accumulate the length used for the
3548    previous cycle of this loop. */    previous cycle of this loop. */
3549    
3550    if (lengthptr != NULL)    if (lengthptr != NULL)
3551      {      {
3552  #ifdef DEBUG  #ifdef PCRE_DEBUG
3553      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3554  #endif  #endif
3555      if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3556            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3557        {        {
3558        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3559        goto FAILED;        goto FAILED;
# Line 2518  for (;; ptr++) Line 3575  for (;; ptr++)
3575        goto FAILED;        goto FAILED;
3576        }        }
3577    
3578      *lengthptr += code - last_code;      *lengthptr += (int)(code - last_code);
3579      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3580          (int)(code - last_code), c, c));
3581    
3582      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3583      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 2529  for (;; ptr++) Line 3587  for (;; ptr++)
3587        {        {
3588        if (previous > orig_code)        if (previous > orig_code)
3589          {          {
3590          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3591          code -= previous - orig_code;          code -= previous - orig_code;
3592          previous = orig_code;          previous = orig_code;
3593          }          }
# Line 2545  for (;; ptr++) Line 3603  for (;; ptr++)
3603    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3604    reference list. */    reference list. */
3605    
3606    else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3607               WORK_SIZE_SAFETY_MARGIN)
3608      {      {
3609      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3610      goto FAILED;      goto FAILED;
# Line 2555  for (;; ptr++) Line 3614  for (;; ptr++)
3614    
3615    if (inescq && c != 0)    if (inescq && c != 0)
3616      {      {
3617      if (c == '\\' && ptr[1] == 'E')      if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3618        {        {
3619        inescq = FALSE;        inescq = FALSE;
3620        ptr++;        ptr++;
# Line 2581  for (;; ptr++) Line 3640  for (;; ptr++)
3640    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
3641    a quantifier. */    a quantifier. */
3642    
3643    is_quantifier = c == '*' || c == '+' || c == '?' ||    is_quantifier =
3644      (c == '{' && is_counted_repeat(ptr+1));      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3645        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3646    
3647    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
3648         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
# Line 2592  for (;; ptr++) Line 3652  for (;; ptr++)
3652      previous_callout = NULL;      previous_callout = NULL;
3653      }      }
3654    
3655    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3656    
3657    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3658      {      {
3659      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3660      if (c == '#')      if (c == CHAR_NUMBER_SIGN)
3661        {        {
3662        while (*(++ptr) != 0)        ptr++;
3663          while (*ptr != 0)
3664          {          {
3665          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3666            ptr++;
3667    #ifdef SUPPORT_UTF
3668            if (utf) FORWARDCHAR(ptr);
3669    #endif
3670          }          }
3671        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3672    
# Line 2622  for (;; ptr++) Line 3687  for (;; ptr++)
3687      {      {
3688      /* ===================================================================*/      /* ===================================================================*/
3689      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3690      case '|':                      /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3691      case ')':      case CHAR_RIGHT_PARENTHESIS:
3692      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3693      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
3694      *codeptr = code;      *codeptr = code;
3695      *ptrptr = ptr;      *ptrptr = ptr;
3696      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 2635  for (;; ptr++) Line 3700  for (;; ptr++)
3700          *errorcodeptr = ERR20;          *errorcodeptr = ERR20;
3701          goto FAILED;          goto FAILED;
3702          }          }
3703        *lengthptr += code - last_code;   /* To include callout length */        *lengthptr += (int)(code - last_code);   /* To include callout length */
3704        DPRINTF((">> end branch\n"));        DPRINTF((">> end branch\n"));
3705        }        }
3706      return TRUE;      return TRUE;
# Line 2645  for (;; ptr++) Line 3710  for (;; ptr++)
3710      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
3711      the setting of any following char as a first character. */      the setting of any following char as a first character. */
3712    
3713      case '^':      case CHAR_CIRCUMFLEX_ACCENT:
3714        previous = NULL;
3715      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3716        {        {
3717        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3718          *code++ = OP_CIRCM;
3719        }        }
3720      previous = NULL;      else *code++ = OP_CIRC;
     *code++ = OP_CIRC;  
3721      break;      break;
3722    
3723      case '$':      case CHAR_DOLLAR_SIGN:
3724      previous = NULL;      previous = NULL;
3725      *code++ = OP_DOLL;      *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3726      break;      break;
3727    
3728      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3729      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3730    
3731      case '.':      case CHAR_DOT:
3732      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3733      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
3734      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
3735      previous = code;      previous = code;
3736      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3737      break;      break;
# Line 2686  for (;; ptr++) Line 3752  for (;; ptr++)
3752      In JavaScript compatibility mode, an isolated ']' causes an error. In      In JavaScript compatibility mode, an isolated ']' causes an error. In
3753      default (Perl) mode, it is treated as a data character. */      default (Perl) mode, it is treated as a data character. */
3754    
3755      case ']':      case CHAR_RIGHT_SQUARE_BRACKET:
3756      if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3757        {        {
3758        *errorcodeptr = ERR64;        *errorcodeptr = ERR64;
# Line 2694  for (;; ptr++) Line 3760  for (;; ptr++)
3760        }        }
3761      goto NORMAL_CHAR;      goto NORMAL_CHAR;
3762    
3763      case '[':      case CHAR_LEFT_SQUARE_BRACKET:
3764      previous = code;      previous = code;
3765    
3766      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3767      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
3768    
3769      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3770             ptr[1] == CHAR_EQUALS_SIGN) &&
3771          check_posix_syntax(ptr, &tempptr))          check_posix_syntax(ptr, &tempptr))
3772        {        {
3773        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3774        goto FAILED;        goto FAILED;
3775        }        }
3776    
# Line 2715  for (;; ptr++) Line 3782  for (;; ptr++)
3782      for (;;)      for (;;)
3783        {        {
3784        c = *(++ptr);        c = *(++ptr);
3785        if (c == '\\')        if (c == CHAR_BACKSLASH)
3786          {          {
3787          if (ptr[1] == 'E') ptr++;          if (ptr[1] == CHAR_E)
3788            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;            ptr++;
3789              else break;          else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3790              ptr += 3;
3791            else
3792              break;
3793          }          }
3794        else if (!negate_class && c == '^')        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3795          negate_class = TRUE;          negate_class = TRUE;
3796        else break;        else break;
3797        }        }
# Line 2731  for (;; ptr++) Line 3801  for (;; ptr++)
3801      that. In JS mode, [] must always fail, so generate OP_FAIL, whereas      that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3802      [^] must match any character, so generate OP_ALLANY. */      [^] must match any character, so generate OP_ALLANY. */
3803    
3804      if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)      if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3805            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3806        {        {
3807        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3808        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3809        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
3810        break;        break;
3811        }        }
3812    
# Line 2745  for (;; ptr++) Line 3816  for (;; ptr++)
3816    
3817      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3818    
3819      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3820      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3821      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3822        a single character. */
3823    
3824      class_charcount = 0;      class_has_8bitchar = 0;
3825      class_lastchar = -1;      class_single_char = 0;
3826    
3827      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3828      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
3829      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3830      */      */
3831    
3832      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3833    
3834  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3835      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;                           /* No chars >= 256 */
3836      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3837      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3838  #endif  #endif
3839    
3840      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 2771  for (;; ptr++) Line 3843  for (;; ptr++)
3843    
3844      if (c != 0) do      if (c != 0) do
3845        {        {
3846        const uschar *oldptr;        const pcre_uchar *oldptr;
3847    
3848  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3849        if (utf8 && c > 127)        if (utf && HAS_EXTRALEN(c))
3850          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3851          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3852          }          }
3853    #endif
3854    
3855        /* In the pre-compile phase, accumulate the length of any UTF-8 extra  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3856          /* In the pre-compile phase, accumulate the length of any extra
3857        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
3858        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
3859        (which is on the stack). */        (which is on the stack). */
3860    
3861        if (lengthptr != NULL)        if (lengthptr != NULL)
3862          {          {
3863          *lengthptr += class_utf8data - class_utf8data_base;          *lengthptr += class_uchardata - class_uchardata_base;
3864          class_utf8data = class_utf8data_base;          class_uchardata = class_uchardata_base;
3865          }          }
   
3866  #endif  #endif
3867    
3868        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
3869    
3870        if (inescq)        if (inescq)
3871          {          {
3872          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */          if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3873            {            {
3874            inescq = FALSE;                   /* Reset literal state */            inescq = FALSE;                   /* Reset literal state */
3875            ptr++;                            /* Skip the 'E' */            ptr++;                            /* Skip the 'E' */
# Line 2811  for (;; ptr++) Line 3884  for (;; ptr++)
3884        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3885        5.6 and 5.8 do. */        5.6 and 5.8 do. */
3886    
3887        if (c == '[' &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
3888            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3889            check_posix_syntax(ptr, &tempptr))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3890          {          {
3891          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3892          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3893          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3894          uschar pbits[32];          pcre_uint8 pbits[32];
3895    
3896          if (ptr[1] != ':')          if (ptr[1] != CHAR_COLON)
3897            {            {
3898            *errorcodeptr = ERR31;            *errorcodeptr = ERR31;
3899            goto FAILED;            goto FAILED;
3900            }            }
3901    
3902          ptr += 2;          ptr += 2;
3903          if (*ptr == '^')          if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3904            {            {
3905            local_negate = TRUE;            local_negate = TRUE;
3906            should_flip_negation = TRUE;  /* Note negative special */            should_flip_negation = TRUE;  /* Note negative special */
3907            ptr++;            ptr++;
3908            }            }
3909    
3910          posix_class = check_posix_name(ptr, tempptr - ptr);          posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3911          if (posix_class < 0)          if (posix_class < 0)
3912            {            {
3913            *errorcodeptr = ERR30;            *errorcodeptr = ERR30;
# Line 2848  for (;; ptr++) Line 3921  for (;; ptr++)
3921          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3922            posix_class = 0;            posix_class = 0;
3923    
3924          /* We build the bit map for the POSIX class in a chunk of local store          /* When PCRE_UCP is set, some of the POSIX classes are converted to
3925          because we may be adding and subtracting from it, and we don't want to          different escape sequences that use Unicode properties. */
3926          subtract bits that may be in the main map already. At the end we or the  
3927          result into the bit map that is being built. */  #ifdef SUPPORT_UCP
3928            if ((options & PCRE_UCP) != 0)
3929              {
3930              int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3931              if (posix_substitutes[pc] != NULL)
3932                {
3933                nestptr = tempptr + 1;
3934                ptr = posix_substitutes[pc] - 1;
3935                continue;
3936                }
3937              }
3938    #endif
3939            /* In the non-UCP case, we build the bit map for the POSIX class in a
3940            chunk of local store because we may be adding and subtracting from it,
3941            and we don't want to subtract bits that may be in the main map already.
3942            At the end we or the result into the bit map that is being built. */
3943    
3944          posix_class *= 3;          posix_class *= 3;
3945    
3946          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3947    
3948          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3949            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3950    
3951          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3952    
# Line 2889  for (;; ptr++) Line 3977  for (;; ptr++)
3977            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3978    
3979          ptr = tempptr + 1;          ptr = tempptr + 1;
3980          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3981            class_has_8bitchar = 1;
3982            /* Every class contains at least two characters. */
3983            class_single_char = 2;
3984          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3985          }          }
3986    
3987        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3988        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3989        case. Inside a class (and only there) it is treated as backspace.        case. Inside a class (and only there) it is treated as backspace. We
3990        Elsewhere it marks a word boundary. Other escapes have preset maps ready        assume that other escapes have more than one character in them, so
3991        to 'or' into the one we are building. We assume they have more than one        speculatively set both class_has_8bitchar and class_single_char bigger
3992        character in them, so set class_charcount bigger than one. */        than one. Unrecognized escapes fall through and are either treated
3993          as literal characters (by default), or are faulted if
3994          PCRE_EXTRA is set. */
3995    
3996        if (c == '\\')        if (c == CHAR_BACKSLASH)
3997          {          {
3998          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3999          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
4000    
4001          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
4002          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_N)            /* \N is not supported in a class */
4003          else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */            {
4004              *errorcodeptr = ERR71;
4005              goto FAILED;
4006              }
4007          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
4008            {            {
4009            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4010              {              {
4011              ptr += 2; /* avoid empty string */              ptr += 2; /* avoid empty string */
4012              }              }
# Line 2921  for (;; ptr++) Line 4017  for (;; ptr++)
4017    
4018          if (c < 0)          if (c < 0)
4019            {            {
4020            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
4021            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
4022              class_has_8bitchar++;
4023              /* Every class contains at least two characters. */
4024              class_single_char += 2;
4025    
4026            /* Save time by not doing this in the pre-compile phase. */            switch (-c)
   
           if (lengthptr == NULL) switch (-c)  
4027              {              {
4028    #ifdef SUPPORT_UCP
4029                case ESC_du:     /* These are the values given for \d etc */
4030                case ESC_DU:     /* when PCRE_UCP is set. We replace the */
4031                case ESC_wu:     /* escape sequence with an appropriate \p */
4032                case ESC_WU:     /* or \P to test Unicode properties instead */
4033                case ESC_su:     /* of the default ASCII testing. */
4034                case ESC_SU:
4035                nestptr = ptr;
4036                ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
4037                class_has_8bitchar--;                /* Undo! */
4038                continue;
4039    #endif
4040              case ESC_d:              case ESC_d:
4041              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4042              continue;              continue;
# Line 2946  for (;; ptr++) Line 4055  for (;; ptr++)
4055              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4056              continue;              continue;
4057    
4058                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4059                if it was previously set by something earlier in the character
4060                class. */
4061    
4062              case ESC_s:              case ESC_s:
4063              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
4064              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
4065                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4066              continue;              continue;
4067    
4068              case ESC_S:              case ESC_S:
# Line 2957  for (;; ptr++) Line 4071  for (;; ptr++)
4071              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
4072              continue;              continue;
4073    
4074              default:    /* Not recognized; fall through */              case ESC_h:
             break;      /* Need "default" setting to stop compiler warning. */  
             }  
   
           /* In the pre-compile phase, just do the recognition. */  
   
           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||  
                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;  
   
           /* We need to deal with \H, \h, \V, and \v in both phases because  
           they use extra memory. */  
   
           if (-c == ESC_h)  
             {  
4075              SETBIT(classbits, 0x09); /* HT */
4076              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
4077              SETBIT(classbits, 0xa0); /* NBSP */
4078  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4079              if (utf8)              xclass = TRUE;
4080                *class_uchardata++ = XCL_SINGLE;
4081                *class_uchardata++ = 0x1680;
4082                *class_uchardata++ = XCL_SINGLE;
4083                *class_uchardata++ = 0x180e;
4084                *class_uchardata++ = XCL_RANGE;
4085                *class_uchardata++ = 0x2000;
4086                *class_uchardata++ = 0x200a;
4087                *class_uchardata++ = XCL_SINGLE;
4088                *class_uchardata++ = 0x202f;
4089                *class_uchardata++ = XCL_SINGLE;
4090                *class_uchardata++ = 0x205f;
4091                *class_uchardata++ = XCL_SINGLE;
4092                *class_uchardata++ = 0x3000;
4093    #elif defined SUPPORT_UTF
4094                if (utf)
4095                {                {
4096                class_utf8 = TRUE;                xclass = TRUE;
4097                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4098                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4099                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4100                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4101                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4102                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4103                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4104                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4105                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4106                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4107                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4108                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4109                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4110                }                }
4111  #endif  #endif
4112              continue;              continue;
             }  
4113    
4114            if (-c == ESC_H)              case ESC_H:
             {  
4115              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
4116                {                {
4117                int x = 0xff;                int x = 0xff;
# Line 3010  for (;; ptr++) Line 4124  for (;; ptr++)
4124                  }                  }
4125                classbits[c] |= x;                classbits[c] |= x;
4126                }                }
4127    #ifndef COMPILE_PCRE8
4128  #ifdef SUPPORT_UTF8              xclass = TRUE;
4129              if (utf8)              *class_uchardata++ = XCL_RANGE;
4130                *class_uchardata++ = 0x0100;
4131                *class_uchardata++ = 0x167f;
4132                *class_uchardata++ = XCL_RANGE;
4133                *class_uchardata++ = 0x1681;
4134                *class_uchardata++ = 0x180d;
4135                *class_uchardata++ = XCL_RANGE;
4136                *class_uchardata++ = 0x180f;
4137                *class_uchardata++ = 0x1fff;
4138                *class_uchardata++ = XCL_RANGE;
4139                *class_uchardata++ = 0x200b;
4140                *class_uchardata++ = 0x202e;
4141                *class_uchardata++ = XCL_RANGE;
4142                *class_uchardata++ = 0x2030;
4143                *class_uchardata++ = 0x205e;
4144                *class_uchardata++ = XCL_RANGE;
4145                *class_uchardata++ = 0x2060;
4146                *class_uchardata++ = 0x2fff;
4147                *class_uchardata++ = XCL_RANGE;
4148                *class_uchardata++ = 0x3001;
4149    #ifdef SUPPORT_UTF
4150                if (utf)
4151                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4152                else
4153    #endif
4154                  *class_uchardata++ = 0xffff;
4155    #elif defined SUPPORT_UTF
4156                if (utf)
4157                {                {
4158                class_utf8 = TRUE;                xclass = TRUE;
4159                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4160                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4161                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4162                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4163                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4164                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4165                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4166                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4167                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4168                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4169                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4170                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4171                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4172                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4173                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4174                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4175                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4176                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4177                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4178                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);