/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log | View Patch

code/trunk/pcre_compile.c revision 459 by ph10, Sun Oct 4 09:21:39 2009 UTC code/branches/pcre16/pcre_compile.c revision 801 by ph10, Mon Dec 12 16:23:37 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2009 University of Cambridge             Copyright (c) 1997-2011 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When DEBUG is defined, we need the pcre_printint() function, which is also  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57  used by pcretest. DEBUG is not defined when building a production library. */  also used by pcretest. PCRE_DEBUG is not defined when building a production
58    library. */
59    
60  #ifdef DEBUG  #ifdef PCRE_DEBUG
61  #include "pcre_printint.src"  #include "pcre_printint.src"
62  #endif  #endif
63    
# Line 87  so this number is very generous. Line 88  so this number is very generous.
88  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
89  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
90  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
92    filled up by repetitions of forward references, for example patterns like
93    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
94    that the workspace is expanded using malloc() in this situation. The value
95    below is therefore a minimum, and we put a maximum on it for safety. The
96    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
97    kicks in at the same number of forward references in all cases. */
98    
99  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
100    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
101    
102    /* The overrun tests check for a slightly smaller size so that they detect the
103    overrun before it actually does run off the end of the data block. */
104    
105    #define WORK_SIZE_SAFETY_MARGIN (100)
106    
107    /* Private flags added to firstchar and reqchar. The 'l' suffix is the long-integer suffix (letter ell, not digit one). */
108    
109    #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
110    #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
111    
112    /* Repeated character flags. */
113    
114    #define UTF_LENGTH     0x10000000l      /* The char contains its length. NOTE(review): same value as REQ_CASELESS; presumably never combined on one variable - confirm */
115    
116  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
117  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 118  static const short int escapes[] = { Line 139  static const short int escapes[] = {
139       -ESC_H,                  0,       -ESC_H,                  0,
140       0,                       -ESC_K,       0,                       -ESC_K,
141       0,                       0,       0,                       0,
142       0,                       0,       -ESC_N,                  0,
143       -ESC_P,                  -ESC_Q,       -ESC_P,                  -ESC_Q,
144       -ESC_R,                  -ESC_S,       -ESC_R,                  -ESC_S,
145       0,                       0,       0,                       0,
# Line 165  static const short int escapes[] = { Line 186  static const short int escapes[] = {
186  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
187  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
188  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
189  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
190  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
191  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
192  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
# Line 182  string is built from string macros so th Line 203  string is built from string macros so th
203  platforms. */  platforms. */
204    
205  typedef struct verbitem {  typedef struct verbitem {
206    int   len;    int   len;                 /* Length of verb name */
207    int   op;    int   op;                  /* Op when no arg, or -1 if arg mandatory */
208      int   op_arg;              /* Op when arg present, or -1 if not allowed */
209  } verbitem;  } verbitem;
210    
211  static const char verbnames[] =  static const char verbnames[] =
212      "\0"                       /* Empty name is a shorthand for MARK */
213      STRING_MARK0
214    STRING_ACCEPT0    STRING_ACCEPT0
215    STRING_COMMIT0    STRING_COMMIT0
216    STRING_F0    STRING_F0
# Line 196  static const char verbnames[] = Line 220  static const char verbnames[] =
220    STRING_THEN;    STRING_THEN;
221    
222  static const verbitem verbs[] = {  static const verbitem verbs[] = {
223    { 6, OP_ACCEPT },    { 0, -1,        OP_MARK },
224    { 6, OP_COMMIT },    { 4, -1,        OP_MARK },
225    { 1, OP_FAIL },    { 6, OP_ACCEPT, -1 },
226    { 4, OP_FAIL },    { 6, OP_COMMIT, -1 },
227    { 5, OP_PRUNE },    { 1, OP_FAIL,   -1 },
228    { 4, OP_SKIP  },    { 4, OP_FAIL,   -1 },
229    { 4, OP_THEN  }    { 5, OP_PRUNE,  OP_PRUNE_ARG },
230      { 4, OP_SKIP,   OP_SKIP_ARG  },
231      { 4, OP_THEN,   OP_THEN_ARG  }
232  };  };
233    
234  static const int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
# Line 220  static const char posix_names[] = Line 246  static const char posix_names[] =
246    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
247    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
248    
249  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
250    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
251    
252  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 250  static const int posix_class_maps[] = { Line 276  static const int posix_class_maps[] = {
276    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
277  };  };
278    
279    /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
280    substitutes must be in the order of the names, defined above, and there are
281    both positive and negative cases. NULL means no substitute. */
282    
283    #ifdef SUPPORT_UCP
284    static const pcre_uchar string_PNd[]  = {      /* spells \P{Nd} */
285      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
286      CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
287    static const pcre_uchar string_pNd[]  = {      /* spells \p{Nd} */
288      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
289      CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
290    static const pcre_uchar string_PXsp[] = {      /* spells \P{Xsp} */
291      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
292      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
293    static const pcre_uchar string_pXsp[] = {      /* spells \p{Xsp} */
294      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
295      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
296    static const pcre_uchar string_PXwd[] = {      /* spells \P{Xwd} */
297      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
298      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
299    static const pcre_uchar string_pXwd[] = {      /* spells \p{Xwd} */
300      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
301      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
302    
303    static const pcre_uchar *substitutes[] = {     /* listed in pairs: negated escape first, then the positive one */
304      string_PNd,           /* \D */
305      string_pNd,           /* \d */
306      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
307      string_pXsp,          /* \s */
308      string_PXwd,          /* \W */
309      string_pXwd           /* \w */
310    };
311    
312    static const pcre_uchar string_pL[] =   {      /* spells \p{L} */
313      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
314      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
315    static const pcre_uchar string_pLl[] =  {      /* spells \p{Ll} */
316      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
317      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
318    static const pcre_uchar string_pLu[] =  {      /* spells \p{Lu} */
319      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
320      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
321    static const pcre_uchar string_pXan[] = {      /* spells \p{Xan} */
322      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
323      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
324    static const pcre_uchar string_h[] =    {      /* spells \h */
325      CHAR_BACKSLASH, CHAR_h, '\0' };
326    static const pcre_uchar string_pXps[] = {      /* spells \p{Xps} */
327      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
328      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
329    static const pcre_uchar string_PL[] =   {      /* spells \P{L} */
330      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
331      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
332    static const pcre_uchar string_PLl[] =  {      /* spells \P{Ll} */
333      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
334      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
335    static const pcre_uchar string_PLu[] =  {      /* spells \P{Lu} */
336      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
337      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
338    static const pcre_uchar string_PXan[] = {      /* spells \P{Xan} */
339      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
340      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
341    static const pcre_uchar string_H[] =    {      /* spells \H */
342      CHAR_BACKSLASH, CHAR_H, '\0' };
343    static const pcre_uchar string_PXps[] = {      /* spells \P{Xps} */
344      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
345      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
346    
347    static const pcre_uchar *posix_substitutes[] = {   /* 14 positive entries, then the 14 negated ones in the same order */
348      string_pL,            /* alpha */
349      string_pLl,           /* lower */
350      string_pLu,           /* upper */
351      string_pXan,          /* alnum */
352      NULL,                 /* ascii */
353      string_h,             /* blank */
354      NULL,                 /* cntrl */
355      string_pNd,           /* digit */
356      NULL,                 /* graph */
357      NULL,                 /* print */
358      NULL,                 /* punct */
359      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
360      string_pXwd,          /* word */
361      NULL,                 /* xdigit */
362      /* Negated cases */
363      string_PL,            /* ^alpha */
364      string_PLl,           /* ^lower */
365      string_PLu,           /* ^upper */
366      string_PXan,          /* ^alnum */
367      NULL,                 /* ^ascii */
368      string_H,             /* ^blank */
369      NULL,                 /* ^cntrl */
370      string_PNd,           /* ^digit */
371      NULL,                 /* ^graph */
372      NULL,                 /* ^print */
373      NULL,                 /* ^punct */
374      string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
375      string_PXwd,          /* ^word */
376      NULL                  /* ^xdigit */
377    };
378    #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))   /* number of entries, positive and negated together */
379    #endif
380    
381  #define STRING(a)  # a  #define STRING(a)  # a
382  #define XSTRING(s) STRING(s)  #define XSTRING(s) STRING(s)
# Line 262  the number of relocations needed when a Line 389  the number of relocations needed when a
389  it is now one long string. We cannot use a table of offsets, because the  it is now one long string. We cannot use a table of offsets, because the
390  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
391  simply count through to the one we want - this isn't a performance issue  simply count through to the one we want - this isn't a performance issue
392  because these strings are used only when there is a compilation error. */  because these strings are used only when there is a compilation error.
393    
394    Each substring ends with \0 to insert a null character. This includes the final
395    substring, so that the whole string ends with \0\0, which can be detected when
396    counting through. */
397    
398  static const char error_texts[] =  static const char error_texts[] =
399    "no error\0"    "no error\0"
# Line 309  static const char error_texts[] = Line 440  static const char error_texts[] =
440    /* 35 */    /* 35 */
441    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
442    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
443    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"    "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
444    "number after (?C is > 255\0"    "number after (?C is > 255\0"
445    "closing ) for (?C expected\0"    "closing ) for (?C expected\0"
446    /* 40 */    /* 40 */
# Line 331  static const char error_texts[] = Line 462  static const char error_texts[] =
462    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
463    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
464    /* 55 */    /* 55 */
465    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
466    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
467    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
468    "a numbered reference must not be zero\0"    "a numbered reference must not be zero\0"
469    "(*VERB) with an argument is not supported\0"    "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
470    /* 60 */    /* 60 */
471    "(*VERB) not recognized\0"    "(*VERB) not recognized\0"
472    "number is too big\0"    "number is too big\0"
# Line 343  static const char error_texts[] = Line 474  static const char error_texts[] =
474    "digit expected after (?+\0"    "digit expected after (?+\0"
475    "] is an invalid data character in JavaScript compatibility mode\0"    "] is an invalid data character in JavaScript compatibility mode\0"
476    /* 65 */    /* 65 */
477    "different names for subpatterns of the same number are not allowed";    "different names for subpatterns of the same number are not allowed\0"
478      "(*MARK) must have an argument\0"
479      "this version of PCRE is not compiled with PCRE_UCP support\0"
480      "\\c must be followed by an ASCII character\0"
481      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
482      /* 70 */
483      "internal error: unknown opcode in find_fixedlength()\0"
484      "\\N is not supported in a class\0"
485      "too many forward references\0"
486      "disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff)\0"
487      ;
488    
489  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
490  patterns. Note that the tables in chartables are dependent on the locale, and  patterns. Note that the tables in chartables are dependent on the locale, and
# Line 362  For convenience, we use the same bit def Line 502  For convenience, we use the same bit def
502    
503  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
504    
505    /* Using a simple comparison for decimal numbers rather than a memory read
506    is much faster, and the resulting code is simpler (the compiler turns it
507    into a subtraction and unsigned comparison). */
508    
509    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)   /* x is expanded twice: pass only side-effect-free expressions */
510    
511  #ifndef EBCDIC  #ifndef EBCDIC
512    
513  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
514  UTF-8 mode. */  UTF-8 mode. */
515    
516  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
517    {    {
518    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
519    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 406  static const unsigned char digitab[] = Line 552  static const unsigned char digitab[] =
552    
553  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
554    
555  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
556    {    {
557    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
558    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 441  static const unsigned char digitab[] = Line 587  static const unsigned char digitab[] =
587    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
588    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
589    
590  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
591    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
592    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
593    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 480  static const unsigned char ebcdic_charta Line 626  static const unsigned char ebcdic_charta
626  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
627    
628  static BOOL  static BOOL
629    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
630      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
631    
632    
# Line 502  static const char * Line 648  static const char *
648  find_error_text(int n)  find_error_text(int n)
649  {  {
650  const char *s = error_texts;  const char *s = error_texts;
651  for (; n > 0; n--) while (*s++ != 0) {};  for (; n > 0; n--)
652      {
653      while (*s++ != 0) {};                    /* step past one message and its terminating zero */
654      if (*s == 0) return "Error text not found (please report)";   /* an empty string here is the final double zero: n is beyond the last message */
655      }
656  return s;  return s;
657  }  }
658    
659    
660  /*************************************************  /*************************************************
661    *           Expand the workspace                 *
662    *************************************************/
663    
664    /* This function is called during the second compiling phase, if the number of
665    forward references fills the existing workspace, which is originally a block on
666    the stack. A larger block is obtained from malloc() unless the ultimate limit
667    has been reached or the increase will be rather small.
668    
669    Argument: pointer to the compile data block (start_workspace, workspace_size and hwm are updated on success)
670    Returns:  0 if all went well, else an error number (ERR72 if no useful growth is possible, ERR21 if malloc fails)
671    */
672    
673    static int
674    expand_workspace(compile_data *cd)
675    {
676    pcre_uchar *newspace;
677    int newsize = cd->workspace_size * 2;      /* aim to double the current size */
678    
679    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;   /* clamp to the hard ceiling */
680    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
681        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
682     return ERR72;                              /* already at the ceiling, or the gain is below the safety margin */
683    
684    newspace = (pcre_malloc)(newsize);
685    if (newspace == NULL) return ERR21;         /* allocation failed; the old workspace is left untouched */
686    
687    memcpy(newspace, cd->start_workspace, cd->workspace_size);   /* carry over the existing contents */
688    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);   /* keep hwm at the same offset in the new block */
689    if (cd->workspace_size > COMPILE_WORK_SIZE)
690      (pcre_free)((void *)cd->start_workspace);  /* freed only if larger than the initial size: the initial block is on the stack (see header) */
691    cd->start_workspace = newspace;
692    cd->workspace_size = newsize;
693    return 0;
694    }
695    
696    
697    
698    /*************************************************
699    *            Check for counted repeat            *
700    *************************************************/
701    
702    /* This function is called when a '{' is encountered in a place where it might
703    start a quantifier. It looks ahead to see if it really is a quantifier or not.
704    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
705    where the ddds are digits.
706    
707    Arguments:
708      p         pointer to the first char after '{'
709    
710    Returns:    TRUE or FALSE
711    */
712    
713    static BOOL
714    is_counted_repeat(const pcre_uchar *p)
715    {
716    if (!IS_DIGIT(*p)) return FALSE;                   /* at least one digit must follow the '{' */
717    p++;
718    while (IS_DIGIT(*p)) p++;                          /* skip the rest of the first number */
719    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;   /* form {ddd} */
720    
721    if (*p++ != CHAR_COMMA) return FALSE;              /* only a comma may follow the first number */
722    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;   /* form {ddd,} */
723    
724    if (!IS_DIGIT(*p)) return FALSE;                   /* after the comma, a second number is required */
725    p++;
726    while (IS_DIGIT(*p)) p++;                          /* skip the rest of the second number */
727    
728    return (*p == CHAR_RIGHT_CURLY_BRACKET);           /* form {ddd,ddd} */
729    }
730    
731    
732    
733    /*************************************************
734  *            Handle escapes                      *  *            Handle escapes                      *
735  *************************************************/  *************************************************/
736    
# Line 532  Returns:         zero or positive => a d Line 755  Returns:         zero or positive => a d
755  */  */
756    
757  static int  static int
758  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
759    int options, BOOL isclass)    int options, BOOL isclass)
760  {  {
761  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
762  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
763  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
764    pcre_int32 c;
765    int i;
766    
767  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
768  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
# Line 551  in a table. A non-zero result is somethi Line 776  in a table. A non-zero result is somethi
776  Otherwise further processing may be required. */  Otherwise further processing may be required. */
777    
778  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
779  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
780    else if (c < CHAR_0 || c > CHAR_z) {}
781  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
782    
783  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
784  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
785    else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
786  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
787  #endif  #endif
788    
# Line 563  else if ((i = escapes[c - 0x48]) != 0) Line 790  else if ((i = escapes[c - 0x48]) != 0)
790    
791  else  else
792    {    {
793    const uschar *oldptr;    const pcre_uchar *oldptr;
794    BOOL braced, negated;    BOOL braced, negated;
795    
796    switch (c)    switch (c)
# Line 573  else Line 800  else
800    
801      case CHAR_l:      case CHAR_l:
802      case CHAR_L:      case CHAR_L:
803      case CHAR_N:      *errorcodeptr = ERR37;
804        break;
805    
806      case CHAR_u:      case CHAR_u:
807        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
808          {
809          /* In JavaScript, \u must be followed by four hexadecimal digits.
810          Otherwise it is treated as a literal lowercase u. */
811          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
812            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
813            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
814            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
815            {
816            c = 0;
817            for (i = 0; i < 4; ++i)
818              {
819              register int cc = *(++ptr);
820    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
821              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
822              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
823    #else           /* EBCDIC coding */
824              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
825              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
826    #endif
827              }
828            }
829          }
830        else
831          *errorcodeptr = ERR37;
832        break;
833    
834      case CHAR_U:      case CHAR_U:
835      *errorcodeptr = ERR37;      /* In JavaScript, \U is an uppercase U letter. */
836        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
837      break;      break;
838    
839      /* \g must be followed by one of a number of specific things:      /* In a character class, \g is just a literal "g". Outside a character
840        class, \g must be followed by one of a number of specific things:
841    
842      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
843      backreference. If negative, it is a relative backreference. This is a Perl      backreference. If negative, it is a relative backreference. This is a Perl
# Line 596  else Line 854  else
854      the -ESC_g code (cf \k). */      the -ESC_g code (cf \k). */
855    
856      case CHAR_g:      case CHAR_g:
857        if (isclass) break;
858      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
859        {        {
860        c = -ESC_g;        c = -ESC_g;
# Line 606  else Line 865  else
865    
866      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
867        {        {
868        const uschar *p;        const pcre_uchar *p;
869        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
870          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
871        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
872          {          {
873          c = -ESC_k;          c = -ESC_k;
# Line 626  else Line 885  else
885        }        }
886      else negated = FALSE;      else negated = FALSE;
887    
888        /* The integer range is limited by the machine's int representation. */
889      c = 0;      c = 0;
890      while ((digitab[ptr[1]] & ctype_digit) != 0)      while (IS_DIGIT(ptr[1]))
891          {
892          if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
893            {
894            c = -1;
895            break;
896            }
897        c = c * 10 + *(++ptr) - CHAR_0;        c = c * 10 + *(++ptr) - CHAR_0;
898          }
899      if (c < 0)   /* Integer overflow */      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
900        {        {
901          while (IS_DIGIT(ptr[1]))
902            ptr++;
903        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
904        break;        break;
905        }        }
# Line 679  else Line 947  else
947      if (!isclass)      if (!isclass)
948        {        {
949        oldptr = ptr;        oldptr = ptr;
950          /* The integer range is limited by the machine's int representation. */
951        c -= CHAR_0;        c -= CHAR_0;
952        while ((digitab[ptr[1]] & ctype_digit) != 0)        while (IS_DIGIT(ptr[1]))
953            {
954            if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
955              {
956              c = -1;
957              break;
958              }
959          c = c * 10 + *(++ptr) - CHAR_0;          c = c * 10 + *(++ptr) - CHAR_0;
960        if (c < 0)    /* Integer overflow */          }
961          if (((unsigned int)c) > INT_MAX) /* Integer overflow */
962          {          {
963            while (IS_DIGIT(ptr[1]))
964              ptr++;
965          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
966          break;          break;
967          }          }
# Line 716  else Line 994  else
994      c -= CHAR_0;      c -= CHAR_0;
995      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
996          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
997      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf && c > 0xff) *errorcodeptr = ERR51;
998      break;      break;
999    
1000      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
1001      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1002      treated as a data character. */      If not, { is treated as a data character. */
1003    
1004      case CHAR_x:      case CHAR_x:
1005        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1006          {
1007          /* In JavaScript, \x must be followed by two hexadecimal digits.
1008          Otherwise it is a lowercase x letter. */
1009          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1010            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1011            {
1012            c = 0;
1013            for (i = 0; i < 2; ++i)
1014              {
1015              register int cc = *(++ptr);
1016    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1017              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1018              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1019    #else           /* EBCDIC coding */
1020              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1021              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1022    #endif
1023              }
1024            }
1025          break;
1026          }
1027    
1028      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1029        {        {
1030        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
       int count = 0;  
1031    
1032        c = 0;        c = 0;
1033        while ((digitab[*pt] & ctype_xdigit) != 0)        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1034          {          {
1035          register int cc = *pt++;          register int cc = *pt++;
1036          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
         count++;  
1037    
1038  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1039          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
# Line 743  else Line 1042  else
1042          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1043          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1044  #endif  #endif
1045    
1046    #ifdef COMPILE_PCRE8
1047            if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1048    #else
1049    #ifdef COMPILE_PCRE16
1050            if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1051    #endif
1052    #endif
1053            }
1054    
1055          if (c < 0)
1056            {
1057            while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1058            *errorcodeptr = ERR34;
1059          }          }
1060    
1061        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1062          {          {
1063          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1064          ptr = pt;          ptr = pt;
1065          break;          break;
1066          }          }
# Line 759  else Line 1072  else
1072      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1073    
1074      c = 0;      c = 0;
1075      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1076        {        {
1077        int cc;                                  /* Some compilers don't like */        int cc;                                  /* Some compilers don't like */
1078        cc = *(++ptr);                           /* ++ in initializers */        cc = *(++ptr);                           /* ++ in initializers */
# Line 774  else Line 1087  else
1087      break;      break;
1088    
1089      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1090      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
1091        coding is ASCII-specific, but then the whole concept of \cx is
1092      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1093    
1094      case CHAR_c:      case CHAR_c:
# Line 784  else Line 1098  else
1098        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
1099        break;        break;
1100        }        }
1101    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1102  #ifndef EBCDIC  /* ASCII/UTF-8 coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
1103          {
1104          *errorcodeptr = ERR68;
1105          break;
1106          }
1107      if (c >= CHAR_a && c <= CHAR_z) c -= 32;      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1108      c ^= 0x40;      c ^= 0x40;
1109  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
1110      if (c >= CHAR_a && c <= CHAR_z) c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1111      c ^= 0xC0;      c ^= 0xC0;
1112  #endif  #endif
# Line 811  else Line 1129  else
1129      }      }
1130    }    }
1131    
1132    /* Perl supports \N{name} for character names, as well as plain \N for "not
1133    newline". PCRE does not support \N{name}. However, it does support
1134    quantification such as \N{2,3}. */
1135    
1136    if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1137         !is_counted_repeat(ptr+2))
1138      *errorcodeptr = ERR37;
1139    
1140    /* If PCRE_UCP is set, we change the values for \d etc. */
1141    
1142    if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1143      c -= (ESC_DU - ESC_D);
1144    
1145    /* Set the pointer to the final character before returning. */
1146    
1147  *ptrptr = ptr;  *ptrptr = ptr;
1148  return c;  return c;
1149  }  }
# Line 837  Returns:         type value from ucp_typ Line 1170  Returns:         type value from ucp_typ
1170  */  */
1171    
1172  static int  static int
1173  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1174  {  {
1175  int c, i, bot, top;  int c, i, bot, top;
1176  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1177  char name[32];  pcre_uchar name[32];
1178    
1179  c = *(++ptr);  c = *(++ptr);
1180  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 882  else Line 1215  else
1215  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1216    
1217  bot = 0;  bot = 0;
1218  top = _pcre_utt_size;  top = PRIV(utt_size);
1219    
1220  while (bot < top)  while (bot < top)
1221    {    {
1222    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1223    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1224    if (c == 0)    if (c == 0)
1225      {      {
1226      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1227      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1228      }      }
1229    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
1230    }    }
# Line 911  return -1; Line 1244  return -1;
1244    
1245    
1246  /*************************************************  /*************************************************
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if (*p++ != CHAR_COMMA) return FALSE;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
   
 return (*p == CHAR_RIGHT_CURLY_BRACKET);  
 }  
   
   
   
 /*************************************************  
1247  *         Read repeat counts                     *  *         Read repeat counts                     *
1248  *************************************************/  *************************************************/
1249    
# Line 962  Returns:         pointer to '}' on succe Line 1262  Returns:         pointer to '}' on succe
1262                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1263  */  */
1264    
1265  static const uschar *  static const pcre_uchar *
1266  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1267  {  {
1268  int min = 0;  int min = 0;
1269  int max = -1;  int max = -1;
# Line 971  int max = -1; Line 1271  int max = -1;
1271  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1272  an integer overflow. */  an integer overflow. */
1273    
1274  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1275  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1276    {    {
1277    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 986  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1286  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1286    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1287      {      {
1288      max = 0;      max = 0;
1289      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1290      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1291        {        {
1292        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 1019  top-level call starts at the beginning o Line 1319  top-level call starts at the beginning o
1319  start at a parenthesis. It scans along a pattern's text looking for capturing  start at a parenthesis. It scans along a pattern's text looking for capturing
1320  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1321  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1322  returns when it reaches a given numbered subpattern. We know that if (?P< is  returns when it reaches a given numbered subpattern. Recursion is used to keep
1323  encountered, the name will be terminated by '>' because that is checked in the  track of subpatterns that reset the capturing group numbers - the (?| feature.
1324  first pass. Recursion is used to keep track of subpatterns that reset the  
1325  capturing group numbers - the (?| feature.  This function was originally called only from the second pass, in which we know
1326    that if (?< or (?' or (?P< is encountered, the name will be correctly
1327    terminated because that is checked in the first pass. There is now one call to
1328    this function in the first pass, to check for a recursive back reference by
1329    name (so that we can make the whole group atomic). In this case, we need check
1330    only up to the current position in the pattern, and that is still OK because
1331    any previous occurrences will have been checked. To make this work, the test
1332    for "end of pattern" is a check against cd->end_pattern in the main loop,
1333    instead of looking for a binary zero. This means that the special first-pass
1334    call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1335    processing items within the loop are OK, because afterwards the main loop will
1336    terminate.)
1337    
1338  Arguments:  Arguments:
1339    ptrptr       address of the current character pointer (updated)    ptrptr       address of the current character pointer (updated)
# Line 1030  Arguments: Line 1341  Arguments:
1341    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1342    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1343    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1344      utf          TRUE if we are in UTF-8 / UTF-16 mode
1345    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1346    
1347  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1348  */  */
1349    
1350  static int  static int
1351  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1352    BOOL xmode, int *count)    BOOL xmode, BOOL utf, int *count)
1353  {  {
1354  uschar *ptr = *ptrptr;  pcre_uchar *ptr = *ptrptr;
1355  int start_count = *count;  int start_count = *count;
1356  int hwm_count = start_count;  int hwm_count = start_count;
1357  BOOL dup_parens = FALSE;  BOOL dup_parens = FALSE;
# Line 1049  dealing with. The very first call may no Line 1361  dealing with. The very first call may no
1361    
1362  if (ptr[0] == CHAR_LEFT_PARENTHESIS)  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1363    {    {
1364    if (ptr[1] == CHAR_QUESTION_MARK &&    /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1365        ptr[2] == CHAR_VERTICAL_LINE)  
1366      if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1367    
1368      /* Handle a normal, unnamed capturing parenthesis. */
1369    
1370      else if (ptr[1] != CHAR_QUESTION_MARK)
1371        {
1372        *count += 1;
1373        if (name == NULL && *count == lorn) return *count;
1374        ptr++;
1375        }
1376    
1377      /* All cases now have (? at the start. Remember when we are in a group
1378      where the parenthesis numbers are duplicated. */
1379    
1380      else if (ptr[2] == CHAR_VERTICAL_LINE)
1381      {      {
1382      ptr += 3;      ptr += 3;
1383      dup_parens = TRUE;      dup_parens = TRUE;
1384      }      }
1385    
1386    /* Handle a normal, unnamed capturing parenthesis */    /* Handle comments; all characters are allowed until a ket is reached. */
1387    
1388    else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)    else if (ptr[2] == CHAR_NUMBER_SIGN)
1389      {      {
1390      *count += 1;      for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1391      if (name == NULL && *count == lorn) return *count;      goto FAIL_EXIT;
     ptr++;  
1392      }      }
1393    
1394    /* Handle a condition. If it is an assertion, just carry on so that it    /* Handle a condition. If it is an assertion, just carry on so that it
1395    is processed as normal. If not, skip to the closing parenthesis of the    is processed as normal. If not, skip to the closing parenthesis of the
1396    condition (there can't be any nested parens. */    condition (there can't be any nested parens). */
1397    
1398    else if (ptr[2] == CHAR_LEFT_PARENTHESIS)    else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1399      {      {
# Line 1079  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1405  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1405        }        }
1406      }      }
1407    
1408    /* We have either (? or (* and not a condition */    /* Start with (? but not a condition. */
1409    
1410    else    else
1411      {      {
# Line 1092  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1418  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1418          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1419        {        {
1420        int term;        int term;
1421        const uschar *thisname;        const pcre_uchar *thisname;
1422        *count += 1;        *count += 1;
1423        if (name == NULL && *count == lorn) return *count;        if (name == NULL && *count == lorn) return *count;
1424        term = *ptr++;        term = *ptr++;
# Line 1100  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1426  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1426        thisname = ptr;        thisname = ptr;
1427        while (*ptr != term) ptr++;        while (*ptr != term) ptr++;
1428        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == ptr - thisname &&
1429            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            STRNCMP_UC_UC(name, thisname, lorn) == 0)
1430          return *count;          return *count;
1431        term++;        term++;
1432        }        }
1433      }      }
1434    }    }
1435    
1436  /* Past any initial parenthesis handling, scan for parentheses or vertical  /* Past any initial parenthesis handling, scan for parentheses or vertical
1437  bars. */  bars. Stop if we get to cd->end_pattern. Note that this is important for the
1438    first-pass call when this value is temporarily adjusted to stop at the current
1439    position. So DO NOT change this to a test for binary zero. */
1440    
1441  for (; *ptr != 0; ptr++)  for (; ptr < cd->end_pattern; ptr++)
1442    {    {
1443    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1444    
# Line 1141  for (; *ptr != 0; ptr++) Line 1469  for (; *ptr != 0; ptr++)
1469          {          {
1470          if (ptr[2] == CHAR_E)          if (ptr[2] == CHAR_E)
1471            ptr+= 2;            ptr+= 2;
1472          else if (strncmp((const char *)ptr+2,          else if (STRNCMP_UC_C8(ptr + 2,
1473                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1474            ptr += 4;            ptr += 4;
1475          else          else
1476            break;            break;
1477          }          }
1478        else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)        else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1479          {          {
1480          negate_class = TRUE;          negate_class = TRUE;
1481          ptr++;          ptr++;
1482          }          }
1483        else break;        else break;
1484        }        }
1485    
# Line 1184  for (; *ptr != 0; ptr++) Line 1512  for (; *ptr != 0; ptr++)
1512    
1513    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1514      {      {
1515      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      ptr++;
1516        while (*ptr != 0)
1517          {
1518          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1519          ptr++;
1520    #ifdef SUPPORT_UTF
1521          if (utf) FORWARDCHAR(ptr);
1522    #endif
1523          }
1524      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1525      continue;      continue;
1526      }      }
# Line 1193  for (; *ptr != 0; ptr++) Line 1529  for (; *ptr != 0; ptr++)
1529    
1530    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1531      {      {
1532      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1533      if (rc > 0) return rc;      if (rc > 0) return rc;
1534      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1535      }      }
# Line 1201  for (; *ptr != 0; ptr++) Line 1537  for (; *ptr != 0; ptr++)
1537    else if (*ptr == CHAR_RIGHT_PARENTHESIS)    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1538      {      {
1539      if (dup_parens && *count < hwm_count) *count = hwm_count;      if (dup_parens && *count < hwm_count) *count = hwm_count;
1540      *ptrptr = ptr;      goto FAIL_EXIT;
     return -1;  
1541      }      }
1542    
1543    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
# Line 1240  Arguments: Line 1575  Arguments:
1575    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1576    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1577    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1578      utf          TRUE if we are in UTF-8 / UTF-16 mode
1579    
1580  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1581  */  */
1582    
1583  static int  static int
1584  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)  find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1585      BOOL utf)
1586  {  {
1587  uschar *ptr = (uschar *)cd->start_pattern;  pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1588  int count = 0;  int count = 0;
1589  int rc;  int rc;
1590    
# Line 1258  matching closing parens. That is why we Line 1595  matching closing parens. That is why we
1595    
1596  for (;;)  for (;;)
1597    {    {
1598    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1599    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1600    }    }
1601    
# Line 1274  return rc; Line 1611  return rc;
1611    
1612  /* This is called by several functions that scan a compiled expression looking  /* This is called by several functions that scan a compiled expression looking
1613  for a fixed first character, or an anchoring op code etc. It skips over things  for a fixed first character, or an anchoring op code etc. It skips over things
1614  that do not influence this. For some calls, a change of option is important.  that do not influence this. For some calls, it makes sense to skip negative
1615  For some calls, it makes sense to skip negative forward and all backward  forward and all backward assertions, and also the \b assertion; for others it
1616  assertions, and also the \b assertion; for others it does not.  does not.
1617    
1618  Arguments:  Arguments:
1619    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1620    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1621    
1622  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1623  */  */
1624    
1625  static const uschar*  static const pcre_uchar*
1626  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const pcre_uchar *code, BOOL skipassert)
   BOOL skipassert)  
1627  {  {
1628  for (;;)  for (;;)
1629    {    {
1630    switch ((int)*code)    switch ((int)*code)
1631      {      {
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
       *options = (int)code[1];  
     code += 2;  
     break;  
   
1632      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1633      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1634      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1635      if (!skipassert) return code;      if (!skipassert) return code;
1636      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1637      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1638      break;      break;
1639    
1640      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1321  for (;;) Line 1648  for (;;)
1648      case OP_RREF:      case OP_RREF:
1649      case OP_NRREF:      case OP_NRREF:
1650      case OP_DEF:      case OP_DEF:
1651      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1652      break;      break;
1653    
1654      default:      default:
# Line 1340  for (;;) Line 1667  for (;;)
1667    
1668  /* Scan a branch and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1669  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1670  In UTF8 mode, the result is in characters rather than bytes. The branch is  In UTF8 mode, the result is in characters rather than bytes. The branch is
1671  temporarily terminated with OP_END when this function is called.  temporarily terminated with OP_END when this function is called.
1672    
1673  This function is called when a backward assertion is encountered, so that if it  This function is called when a backward assertion is encountered, so that if it
1674  fails, the error message can point to the correct place in the pattern.  fails, the error message can point to the correct place in the pattern.
1675  However, we cannot do this when the assertion contains subroutine calls,  However, we cannot do this when the assertion contains subroutine calls,
1676  because they can be forward references. We solve this by remembering this case  because they can be forward references. We solve this by remembering this case
1677  and doing the check at the end; a flag specifies which mode we are running in.  and doing the check at the end; a flag specifies which mode we are running in.
1678    
1679  Arguments:  Arguments:
1680    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1681    options  the compiling options    utf      TRUE in UTF-8 / UTF-16 mode
1682    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1683    cd       the "compile data" structure    cd       the "compile data" structure
1684    
1685  Returns:   the fixed length,  Returns:   the fixed length,
1686               or -1 if there is no fixed length,               or -1 if there is no fixed length,
1687               or -2 if \C was encountered               or -2 if \C was encountered (in UTF-8 mode only)
1688               or -3 if an OP_RECURSE item was encountered and atend is FALSE               or -3 if an OP_RECURSE item was encountered and atend is FALSE
1689                 or -4 if an unknown opcode was encountered (internal error)
1690  */  */
1691    
1692  static int  static int
1693  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1694  {  {
1695  int length = -1;  int length = -1;
1696    
1697  register int branchlength = 0;  register int branchlength = 0;
1698  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1699    
1700  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1701  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1375  branch, check the length against that of Line 1703  branch, check the length against that of
1703  for (;;)  for (;;)
1704    {    {
1705    int d;    int d;
1706    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1707    register int op = *cc;    register int op = *cc;
1708    switch (op)    switch (op)
1709      {      {
1710        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1711        OP_BRA (normal non-capturing bracket) because the other variants of these
1712        opcodes are all concerned with unlimited repeated groups, which of course
1713        are not of fixed length. */
1714    
1715      case OP_CBRA:      case OP_CBRA:
1716      case OP_BRA:      case OP_BRA:
1717      case OP_ONCE:      case OP_ONCE:
1718        case OP_ONCE_NC:
1719      case OP_COND:      case OP_COND:
1720      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1721      if (d < 0) return d;      if (d < 0) return d;
1722      branchlength += d;      branchlength += d;
1723      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1724      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1725      break;      break;
1726    
1727      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1728      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1729      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1730        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1731        because they all imply an unlimited repeat. */
1732    
1733      case OP_ALT:      case OP_ALT:
1734      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1735      case OP_END:      case OP_END:
1736        case OP_ACCEPT:
1737        case OP_ASSERT_ACCEPT:
1738      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1739        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1740      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
1741      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1742      branchlength = 0;      branchlength = 0;
1743      break;      break;
1744    
1745      /* A true recursion implies not fixed length, but a subroutine call may      /* A true recursion implies not fixed length, but a subroutine call may
1746      be OK. If the subroutine is a forward reference, we can't deal with      be OK. If the subroutine is a forward reference, we can't deal with
1747      it until the end of the pattern, so return -3. */      it until the end of the pattern, so return -3. */
1748    
1749      case OP_RECURSE:      case OP_RECURSE:
1750      if (!atend) return -3;      if (!atend) return -3;
1751      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1752      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1753      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1754      d = find_fixedlength(cs + 2, options, atend, cd);      d = find_fixedlength(cs + 2, utf, atend, cd);
1755      if (d < 0) return d;      if (d < 0) return d;
1756      branchlength += d;      branchlength += d;
1757      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1758      break;      break;
1759    
1760      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1761    
# Line 1432  for (;;) Line 1768  for (;;)
1768    
1769      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1770    
1771      case OP_REVERSE:      case OP_MARK:
1772        case OP_PRUNE_ARG:
1773        case OP_SKIP_ARG:
1774        case OP_THEN_ARG:
1775        cc += cc[1] + PRIV(OP_lengths)[*cc];
1776        break;
1777    
1778        case OP_CALLOUT:
1779        case OP_CIRC:
1780        case OP_CIRCM:
1781        case OP_CLOSE:
1782        case OP_COMMIT:
1783      case OP_CREF:      case OP_CREF:
     case OP_NCREF:  
     case OP_RREF:  
     case OP_NRREF:  
1784      case OP_DEF:      case OP_DEF:
1785      case OP_OPT:      case OP_DOLL:
1786      case OP_CALLOUT:      case OP_DOLLM:
     case OP_SOD:  
     case OP_SOM:  
1787      case OP_EOD:      case OP_EOD:
1788      case OP_EODN:      case OP_EODN:
1789      case OP_CIRC:      case OP_FAIL:
1790      case OP_DOLL:      case OP_NCREF:
1791        case OP_NRREF:
1792      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1793        case OP_PRUNE:
1794        case OP_REVERSE:
1795        case OP_RREF:
1796        case OP_SET_SOM:
1797        case OP_SKIP:
1798        case OP_SOD:
1799        case OP_SOM:
1800        case OP_THEN:
1801      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1802      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1803      break;      break;
1804    
1805      /* Handle literal characters */      /* Handle literal characters */
1806    
1807      case OP_CHAR:      case OP_CHAR:
1808      case OP_CHARNC:      case OP_CHARI:
1809      case OP_NOT:      case OP_NOT:
1810        case OP_NOTI:
1811      branchlength++;      branchlength++;
1812      cc += 2;      cc += 2;
1813  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1814      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1815  #endif  #endif
1816      break;      break;
1817    
# Line 1468  for (;;) Line 1819  for (;;)
1819      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1820    
1821      case OP_EXACT:      case OP_EXACT:
1822        case OP_EXACTI:
1823        case OP_NOTEXACT:
1824        case OP_NOTEXACTI:
1825      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1826      cc += 4;      cc += 2 + IMM2_SIZE;
1827  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1828      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1829  #endif  #endif
1830      break;      break;
1831    
1832      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1833      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1834      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1835      cc += 4;      cc += 1 + IMM2_SIZE + 1;
1836      break;      break;
1837    
1838      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1489  for (;;) Line 1842  for (;;)
1842      cc += 2;      cc += 2;
1843      /* Fall through */      /* Fall through */
1844    
1845        case OP_HSPACE:
1846        case OP_VSPACE:
1847        case OP_NOT_HSPACE:
1848        case OP_NOT_VSPACE:
1849      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1850      case OP_DIGIT:      case OP_DIGIT:
1851      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1501  for (;;) Line 1858  for (;;)
1858      cc++;      cc++;
1859      break;      break;
1860    
1861      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1862        otherwise \C is coded as OP_ALLANY. */
1863    
1864      case OP_ANYBYTE:      case OP_ANYBYTE:
1865      return -2;      return -2;
1866    
1867      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1868    
1869  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1870      case OP_XCLASS:      case OP_XCLASS:
1871      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1872      /* Fall through */      /* Fall through */
1873  #endif  #endif
1874    
1875      case OP_CLASS:      case OP_CLASS:
1876      case OP_NCLASS:      case OP_NCLASS:
1877      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1878    
1879      switch (*cc)      switch (*cc)
1880        {        {
1881          case OP_CRPLUS:
1882          case OP_CRMINPLUS:
1883        case OP_CRSTAR:        case OP_CRSTAR:
1884        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1885        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1528  for (;;) Line 1888  for (;;)
1888    
1889        case OP_CRRANGE:        case OP_CRRANGE:
1890        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1891        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1892        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
1893        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1894        break;        break;
1895    
1896        default:        default:
# Line 1540  for (;;) Line 1900  for (;;)
1900    
1901      /* Anything else is variable length */      /* Anything else is variable length */
1902    
1903      default:      case OP_ANYNL:
1904        case OP_BRAMINZERO:
1905        case OP_BRAPOS:
1906        case OP_BRAPOSZERO:
1907        case OP_BRAZERO:
1908        case OP_CBRAPOS:
1909        case OP_EXTUNI:
1910        case OP_KETRMAX:
1911        case OP_KETRMIN:
1912        case OP_KETRPOS:
1913        case OP_MINPLUS:
1914        case OP_MINPLUSI:
1915        case OP_MINQUERY:
1916        case OP_MINQUERYI:
1917        case OP_MINSTAR:
1918        case OP_MINSTARI:
1919        case OP_MINUPTO:
1920        case OP_MINUPTOI:
1921        case OP_NOTMINPLUS:
1922        case OP_NOTMINPLUSI:
1923        case OP_NOTMINQUERY:
1924        case OP_NOTMINQUERYI:
1925        case OP_NOTMINSTAR:
1926        case OP_NOTMINSTARI:
1927        case OP_NOTMINUPTO:
1928        case OP_NOTMINUPTOI:
1929        case OP_NOTPLUS:
1930        case OP_NOTPLUSI:
1931        case OP_NOTPOSPLUS:
1932        case OP_NOTPOSPLUSI:
1933        case OP_NOTPOSQUERY:
1934        case OP_NOTPOSQUERYI:
1935        case OP_NOTPOSSTAR:
1936        case OP_NOTPOSSTARI:
1937        case OP_NOTPOSUPTO:
1938        case OP_NOTPOSUPTOI:
1939        case OP_NOTQUERY:
1940        case OP_NOTQUERYI:
1941        case OP_NOTSTAR:
1942        case OP_NOTSTARI:
1943        case OP_NOTUPTO:
1944        case OP_NOTUPTOI:
1945        case OP_PLUS:
1946        case OP_PLUSI:
1947        case OP_POSPLUS:
1948        case OP_POSPLUSI:
1949        case OP_POSQUERY:
1950        case OP_POSQUERYI:
1951        case OP_POSSTAR:
1952        case OP_POSSTARI:
1953        case OP_POSUPTO:
1954        case OP_POSUPTOI:
1955        case OP_QUERY:
1956        case OP_QUERYI:
1957        case OP_REF:
1958        case OP_REFI:
1959        case OP_SBRA:
1960        case OP_SBRAPOS:
1961        case OP_SCBRA:
1962        case OP_SCBRAPOS:
1963        case OP_SCOND:
1964        case OP_SKIPZERO:
1965        case OP_STAR:
1966        case OP_STARI:
1967        case OP_TYPEMINPLUS:
1968        case OP_TYPEMINQUERY:
1969        case OP_TYPEMINSTAR:
1970        case OP_TYPEMINUPTO:
1971        case OP_TYPEPLUS:
1972        case OP_TYPEPOSPLUS:
1973        case OP_TYPEPOSQUERY:
1974        case OP_TYPEPOSSTAR:
1975        case OP_TYPEPOSUPTO:
1976        case OP_TYPEQUERY:
1977        case OP_TYPESTAR:
1978        case OP_TYPEUPTO:
1979        case OP_UPTO:
1980        case OP_UPTOI:
1981      return -1;      return -1;
1982    
1983        /* Catch unrecognized opcodes so that when new ones are added they
1984        are not forgotten, as has happened in the past. */
1985    
1986        default:
1987        return -4;
1988      }      }
1989    }    }
1990  /* Control never gets here */  /* Control never gets here */
# Line 1556  for (;;) Line 1999  for (;;)
1999    
2000  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
2001  capturing bracket with the given number, or, if the number is negative, an  capturing bracket with the given number, or, if the number is negative, an
2002  instance of OP_REVERSE for a lookbehind. The function is global in the C sense  instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2003  so that it can be called from pcre_study() when finding the minimum matching  so that it can be called from pcre_study() when finding the minimum matching
2004  length.  length.
2005    
2006  Arguments:  Arguments:
2007    code        points to start of expression    code        points to start of expression
2008    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2009    number      the required bracket number or negative to find a lookbehind    number      the required bracket number or negative to find a lookbehind
2010    
2011  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2012  */  */
2013    
2014  const uschar *  const pcre_uchar *
2015  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2016  {  {
2017  for (;;)  for (;;)
2018    {    {
2019    register int c = *code;    register int c = *code;
2020    
2021    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
2022    
2023    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1581  for (;;) Line 2025  for (;;)
2025    the table is zero; the actual length is stored in the compiled code. */    the table is zero; the actual length is stored in the compiled code. */
2026    
2027    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
2028    
2029    /* Handle lookbehind (OP_REVERSE) */    /* Handle lookbehind (OP_REVERSE) */
2030    
2031    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
2032      {      {
2033      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
2034      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2035      }      }
2036    
2037    /* Handle capturing bracket */    /* Handle capturing bracket */
2038    
2039    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
2040               c == OP_CBRAPOS || c == OP_SCBRAPOS)
2041      {      {
2042      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
2043      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2044      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2045      }      }
2046    
2047    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
2048    repeated character types, we have to test for \p and \P, which have an extra    repeated character types, we have to test for \p and \P, which have an extra
2049    two bytes of parameters. */    two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2050      must add in its length. */
2051    
2052    else    else
2053      {      {
# Line 1623  for (;;) Line 2069  for (;;)
2069        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2070        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2071        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2072        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2073            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2074          break;
2075    
2076          case OP_MARK:
2077          case OP_PRUNE_ARG:
2078          case OP_SKIP_ARG:
2079          code += code[1];
2080          break;
2081    
2082          case OP_THEN_ARG:
2083          code += code[1];
2084        break;        break;
2085        }        }
2086    
2087      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2088    
2089      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2090    
2091    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2092    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2093    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2094    
2095  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2096      if (utf8) switch(c)      if (utf) switch(c)
2097        {        {
2098        case OP_CHAR:        case OP_CHAR:
2099        case OP_CHARNC:        case OP_CHARI:
2100        case OP_EXACT:        case OP_EXACT:
2101          case OP_EXACTI:
2102        case OP_UPTO:        case OP_UPTO:
2103          case OP_UPTOI:
2104        case OP_MINUPTO:        case OP_MINUPTO:
2105          case OP_MINUPTOI:
2106        case OP_POSUPTO:        case OP_POSUPTO:
2107          case OP_POSUPTOI:
2108        case OP_STAR:        case OP_STAR:
2109          case OP_STARI:
2110        case OP_MINSTAR:        case OP_MINSTAR:
2111          case OP_MINSTARI:
2112        case OP_POSSTAR:        case OP_POSSTAR:
2113          case OP_POSSTARI:
2114        case OP_PLUS:        case OP_PLUS:
2115          case OP_PLUSI:
2116        case OP_MINPLUS:        case OP_MINPLUS:
2117          case OP_MINPLUSI:
2118        case OP_POSPLUS:        case OP_POSPLUS:
2119          case OP_POSPLUSI:
2120        case OP_QUERY:        case OP_QUERY:
2121          case OP_QUERYI:
2122        case OP_MINQUERY:        case OP_MINQUERY:
2123          case OP_MINQUERYI:
2124        case OP_POSQUERY:        case OP_POSQUERY:
2125        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2126          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2127        break;        break;
2128        }        }
2129  #else  #else
2130      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2131  #endif  #endif
2132      }      }
2133    }    }
# Line 1674  instance of OP_RECURSE. Line 2144  instance of OP_RECURSE.
2144    
2145  Arguments:  Arguments:
2146    code        points to start of expression    code        points to start of expression
2147    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2148    
2149  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2150  */  */
2151    
2152  static const uschar *  static const pcre_uchar *
2153  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2154  {  {
2155  for (;;)  for (;;)
2156    {    {
# Line 1696  for (;;) Line 2166  for (;;)
2166    
2167    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
2168    repeated character types, we have to test for \p and \P, which have an extra    repeated character types, we have to test for \p and \P, which have an extra
2169    two bytes of parameters. */    two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2170      must add in its length. */
2171    
2172    else    else
2173      {      {
# Line 1718  for (;;) Line 2189  for (;;)
2189        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2190        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2191        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2192        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2193            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2194          break;
2195    
2196          case OP_MARK:
2197          case OP_PRUNE_ARG:
2198          case OP_SKIP_ARG:
2199          code += code[1];
2200          break;
2201    
2202          case OP_THEN_ARG:
2203          code += code[1];
2204        break;        break;
2205        }        }
2206    
2207      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2208    
2209      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2210    
2211      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2212      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
2213      to arrange to skip the extra bytes. */      to arrange to skip the extra bytes. */
2214    
2215  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2216      if (utf8) switch(c)      if (utf) switch(c)
2217        {        {
2218        case OP_CHAR:        case OP_CHAR:
2219        case OP_CHARNC:        case OP_CHARI:
2220        case OP_EXACT:        case OP_EXACT:
2221          case OP_EXACTI:
2222        case OP_UPTO:        case OP_UPTO:
2223          case OP_UPTOI:
2224        case OP_MINUPTO:        case OP_MINUPTO:
2225          case OP_MINUPTOI:
2226        case OP_POSUPTO:        case OP_POSUPTO:
2227          case OP_POSUPTOI:
2228        case OP_STAR:        case OP_STAR:
2229          case OP_STARI:
2230        case OP_MINSTAR:        case OP_MINSTAR:
2231          case OP_MINSTARI:
2232        case OP_POSSTAR:        case OP_POSSTAR:
2233          case OP_POSSTARI:
2234        case OP_PLUS:        case OP_PLUS:
2235          case OP_PLUSI:
2236        case OP_MINPLUS:        case OP_MINPLUS:
2237          case OP_MINPLUSI:
2238        case OP_POSPLUS:        case OP_POSPLUS:
2239          case OP_POSPLUSI:
2240        case OP_QUERY:        case OP_QUERY:
2241          case OP_QUERYI:
2242        case OP_MINQUERY:        case OP_MINQUERY:
2243          case OP_MINQUERYI:
2244        case OP_POSQUERY:        case OP_POSQUERY:
2245        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2246          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2247        break;        break;
2248        }        }
2249  #else  #else
2250      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2251  #endif  #endif
2252      }      }
2253    }    }
# Line 1775  bracket whose current branch will alread Line 2270  bracket whose current branch will alread
2270  Arguments:  Arguments:
2271    code        points to start of search    code        points to start of search
2272    endcode     points to where to stop    endcode     points to where to stop
2273    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2274      cd          contains pointers to tables etc.
2275    
2276  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2277  */  */
2278    
2279  static BOOL  static BOOL
2280  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2281      BOOL utf, compile_data *cd)
2282  {  {
2283  register int c;  register int c;
2284  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2285       code < endcode;       code < endcode;
2286       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2287    {    {
2288    const uschar *ccode;    const pcre_uchar *ccode;
2289    
2290    c = *code;    c = *code;
2291    
# Line 1802  for (code = first_significant_code(code Line 2299  for (code = first_significant_code(code
2299      continue;      continue;
2300      }      }
2301    
2302      /* For a recursion/subroutine call, if its end has been reached, which
2303      implies a backward reference subroutine call, we can scan it. If it's a
2304      forward reference subroutine call, we can't. To detect forward reference
2305      we have to scan up the list that is kept in the workspace. This function is
2306      called only when doing the real compile, not during the pre-compile that
2307      measures the size of the compiled pattern. */
2308    
2309      if (c == OP_RECURSE)
2310        {
2311        const pcre_uchar *scode;
2312        BOOL empty_branch;
2313    
2314        /* Test for forward reference */
2315    
2316        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2317          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2318    
2319        /* Not a forward reference, test for completed backward reference */
2320    
2321        empty_branch = FALSE;
2322        scode = cd->start_code + GET(code, 1);
2323        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2324    
2325        /* Completed backwards reference */
2326    
2327        do
2328          {
2329          if (could_be_empty_branch(scode, endcode, utf, cd))
2330            {
2331            empty_branch = TRUE;
2332            break;
2333            }
2334          scode += GET(scode, 1);
2335          }
2336        while (*scode == OP_ALT);
2337    
2338        if (!empty_branch) return FALSE;  /* All branches are non-empty */
2339        continue;
2340        }
2341    
2342    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
2343    
2344    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2345          c == OP_BRAPOSZERO)
2346        {
2347        code += PRIV(OP_lengths)[c];
2348        do code += GET(code, 1); while (*code == OP_ALT);
2349        c = *code;
2350        continue;
2351        }
2352    
2353      /* A nested group that is already marked as "could be empty" can just be
2354      skipped. */
2355    
2356      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2357          c == OP_SCBRA || c == OP_SCBRAPOS)
2358      {      {
     code += _pcre_OP_lengths[c];  
2359      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
2360      c = *code;      c = *code;
2361      continue;      continue;
# Line 1814  for (code = first_significant_code(code Line 2363  for (code = first_significant_code(code
2363    
2364    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2365    
2366    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2367          c == OP_CBRA || c == OP_CBRAPOS ||
2368          c == OP_ONCE || c == OP_ONCE_NC ||
2369          c == OP_COND)
2370      {      {
2371      BOOL empty_branch;      BOOL empty_branch;
2372      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1830  for (code = first_significant_code(code Line 2382  for (code = first_significant_code(code
2382        empty_branch = FALSE;        empty_branch = FALSE;
2383        do        do
2384          {          {
2385          if (!empty_branch && could_be_empty_branch(code, endcode, utf8))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2386            empty_branch = TRUE;            empty_branch = TRUE;
2387          code += GET(code, 1);          code += GET(code, 1);
2388          }          }
# Line 1848  for (code = first_significant_code(code Line 2400  for (code = first_significant_code(code
2400      {      {
2401      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2402      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2403      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2404      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2405      here. */      here. */
2406    
2407  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2408      case OP_XCLASS:      case OP_XCLASS:
2409      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2410      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 1860  for (code = first_significant_code(code Line 2412  for (code = first_significant_code(code
2412    
2413      case OP_CLASS:      case OP_CLASS:
2414      case OP_NCLASS:      case OP_NCLASS:
2415      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2416    
2417  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2418      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2419  #endif  #endif
2420    
# Line 1901  for (code = first_significant_code(code Line 2453  for (code = first_significant_code(code
2453      case OP_ALLANY:      case OP_ALLANY:
2454      case OP_ANYBYTE:      case OP_ANYBYTE:
2455      case OP_CHAR:      case OP_CHAR:
2456      case OP_CHARNC:      case OP_CHARI:
2457      case OP_NOT:      case OP_NOT:
2458        case OP_NOTI:
2459      case OP_PLUS:      case OP_PLUS:
2460      case OP_MINPLUS:      case OP_MINPLUS:
2461      case OP_POSPLUS:      case OP_POSPLUS:
# Line 1934  for (code = first_significant_code(code Line 2487  for (code = first_significant_code(code
2487      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2488      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2489      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2490      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP
2491          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2492      break;      break;
2493    
2494      /* End of branch */      /* End of branch */
# Line 1942  for (code = first_significant_code(code Line 2496  for (code = first_significant_code(code
2496      case OP_KET:      case OP_KET:
2497      case OP_KETRMAX:      case OP_KETRMAX:
2498      case OP_KETRMIN:      case OP_KETRMIN:
2499        case OP_KETRPOS:
2500      case OP_ALT:      case OP_ALT:
2501      return TRUE;      return TRUE;
2502    
2503      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2504      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
2505    
2506  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2507      case OP_STAR:      case OP_STAR:
2508        case OP_STARI:
2509      case OP_MINSTAR:      case OP_MINSTAR:
2510        case OP_MINSTARI:
2511      case OP_POSSTAR:      case OP_POSSTAR:
2512        case OP_POSSTARI:
2513      case OP_QUERY:      case OP_QUERY:
2514        case OP_QUERYI:
2515      case OP_MINQUERY:      case OP_MINQUERY:
2516        case OP_MINQUERYI:
2517      case OP_POSQUERY:      case OP_POSQUERY:
2518      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      case OP_POSQUERYI:
2519        if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2520      break;      break;
2521    
2522      case OP_UPTO:      case OP_UPTO:
2523        case OP_UPTOI:
2524      case OP_MINUPTO:      case OP_MINUPTO:
2525        case OP_MINUPTOI:
2526      case OP_POSUPTO:      case OP_POSUPTO:
2527      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      case OP_POSUPTOI:
2528        if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2529      break;      break;
2530  #endif  #endif
2531    
2532        /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2533        string. */
2534    
2535        case OP_MARK:
2536        case OP_PRUNE_ARG:
2537        case OP_SKIP_ARG:
2538        code += code[1];
2539        break;
2540    
2541        case OP_THEN_ARG:
2542        code += code[1];
2543        break;
2544    
2545        /* None of the remaining opcodes are required to match a character. */
2546    
2547        default:
2548        break;
2549      }      }
2550    }    }
2551    
# Line 1980  return TRUE; Line 2562  return TRUE;
2562  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2563  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2564  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2565    This function is called only during the real compile, not during the
2566    pre-compile.
2567    
2568  Arguments:  Arguments:
2569    code        points to start of the recursion    code        points to start of the recursion
2570    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2571    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2572    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2573      cd          pointers to tables etc
2574    
2575  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2576  */  */
2577    
2578  static BOOL  static BOOL
2579  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2580    BOOL utf8)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2581  {  {
2582  while (bcptr != NULL && bcptr->current >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2583    {    {
2584    if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2585        return FALSE;
2586    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2587    }    }
2588  return TRUE;  return TRUE;
# Line 2028  where Perl recognizes it as the POSIX cl Line 2614  where Perl recognizes it as the POSIX cl
2614  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2615  I think.  I think.
2616    
2617    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2618    It seems that the appearance of a nested POSIX class supersedes an apparent
2619    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2620    a digit.
2621    
2622    In Perl, unescaped square brackets may also appear as part of class names. For
2623    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2624    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2625    seem right at all. PCRE does not allow closing square brackets in POSIX class
2626    names.
2627    
2628  Arguments:  Arguments:
2629    ptr      pointer to the initial [    ptr      pointer to the initial [
2630    endptr   where to return the end pointer    endptr   where to return the end pointer
# Line 2036  Returns:   TRUE or FALSE Line 2633  Returns:   TRUE or FALSE
2633  */  */
2634    
2635  static BOOL  static BOOL
2636  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2637  {  {
2638  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2639  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2640  for (++ptr; *ptr != 0; ptr++)  for (++ptr; *ptr != 0; ptr++)
2641    {    {
2642    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2643        ptr++;
2644      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2645      else
2646      {      {
     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;  
2647      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2648        {        {
2649        *endptr = ptr;        *endptr = ptr;
2650        return TRUE;        return TRUE;
2651        }        }
2652        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2653             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2654              ptr[1] == CHAR_EQUALS_SIGN) &&
2655            check_posix_syntax(ptr, endptr))
2656          return FALSE;
2657      }      }
2658    }    }
2659  return FALSE;  return FALSE;
# Line 2073  Returns:     a value representing the na Line 2677  Returns:     a value representing the na
2677  */  */
2678    
2679  static int  static int
2680  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2681  {  {
2682  const char *pn = posix_names;  const char *pn = posix_names;
2683  register int yield = 0;  register int yield = 0;
2684  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2685    {    {
2686    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2687      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2688    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2689    yield++;    yield++;
2690    }    }
# Line 2112  value in the reference (which is a group Line 2716  value in the reference (which is a group
2716  Arguments:  Arguments:
2717    group      points to the start of the group    group      points to the start of the group
2718    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
2719    utf8       TRUE in UTF-8 mode    utf        TRUE in UTF-8 / UTF-16 mode
2720    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
2721    save_hwm   the hwm forward reference pointer at the start of the group    save_hwm   the hwm forward reference pointer at the start of the group
2722    
# Line 2120  Returns:     nothing Line 2724  Returns:     nothing
2724  */  */
2725    
2726  static void  static void
2727  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2728    uschar *save_hwm)    pcre_uchar *save_hwm)
2729  {  {
2730  uschar *ptr = group;  pcre_uchar *ptr = group;
2731    
2732  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2733    {    {
2734    int offset;    int offset;
2735    uschar *hc;    pcre_uchar *hc;
2736    
2737    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2738    reference. */    reference. */
# Line 2173  Arguments: Line 2777  Arguments:
2777  Returns:         new code pointer  Returns:         new code pointer
2778  */  */
2779    
2780  static uschar *  static pcre_uchar *
2781  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2782  {  {
2783  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2784  *code++ = 255;  *code++ = 255;
2785  PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2786  PUT(code, LINK_SIZE, 0);                /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2787  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2788  }  }
2789    
2790    
# Line 2202  Returns:             nothing Line 2806  Returns:             nothing
2806  */  */
2807    
2808  static void  static void
2809  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2810  {  {
2811  int length = ptr - cd->start_pattern - GET(previous_callout, 2);  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2812  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
2813  }  }
2814    
# Line 2254  for (++c; c <= d; c++) Line 2858  for (++c; c <= d; c++)
2858    
2859  return TRUE;  return TRUE;
2860  }  }
2861    
2862    
2863    
2864    /*************************************************
2865    *        Check a character and a property        *
2866    *************************************************/
2867    
2868    /* This function is called by check_auto_possessive() when a property item
2869    is adjacent to a fixed character.
2870    
2871    Arguments:
2872      c            the character
2873      ptype        the property type
2874      pdata        the data for the type
2875      negated      TRUE if it's a negated property (\P or \p{^)
2876    
2877    Returns:       TRUE if auto-possessifying is OK
2878    */
2879    
2880    static BOOL
2881    check_char_prop(int c, int ptype, int pdata, BOOL negated)
2882    {
2883    const ucd_record *prop = GET_UCD(c);
2884    switch(ptype)
2885      {
2886      case PT_LAMP:
2887      return (prop->chartype == ucp_Lu ||
2888              prop->chartype == ucp_Ll ||
2889              prop->chartype == ucp_Lt) == negated;
2890    
2891      case PT_GC:
2892      return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2893    
2894      case PT_PC:
2895      return (pdata == prop->chartype) == negated;
2896    
2897      case PT_SC:
2898      return (pdata == prop->script) == negated;
2899    
2900      /* These are specials */
2901    
2902      case PT_ALNUM:
2903      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2904              PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2905    
2906      case PT_SPACE:    /* Perl space */
2907      return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2908              c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2909              == negated;
2910    
2911      case PT_PXSPACE:  /* POSIX space */
2912      return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2913              c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2914              c == CHAR_FF || c == CHAR_CR)
2915              == negated;
2916    
2917      case PT_WORD:
2918      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2919              PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2920              c == CHAR_UNDERSCORE) == negated;
2921      }
2922    return FALSE;
2923    }
2924  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2925    
2926    
# Line 2267  whether the next thing could possibly ma Line 2934  whether the next thing could possibly ma
2934  sense to automatically possessify the repeated item.  sense to automatically possessify the repeated item.
2935    
2936  Arguments:  Arguments:
2937    op_code       the repeated op code    previous      pointer to the repeated opcode
2938    this          data for this item, depends on the opcode    utf           TRUE in UTF-8 / UTF-16 mode
   utf8          TRUE in UTF-8 mode  
   utf8_char     used for utf8 character bytes, NULL if not relevant  
2939    ptr           next character in pattern    ptr           next character in pattern
2940    options       options bits    options       options bits
2941    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 2279  Returns:        TRUE if possessifying is Line 2944  Returns:        TRUE if possessifying is
2944  */  */
2945    
2946  static BOOL  static BOOL
2947  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2948    const uschar *ptr, int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2949  {  {
2950  int next;  pcre_int32 c, next;
2951    int op_code = *previous++;
2952    
2953  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
2954    
# Line 2293  if ((options & PCRE_EXTENDED) != 0) Line 2959  if ((options & PCRE_EXTENDED) != 0)
2959      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2960      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2961        {        {
2962        while (*(++ptr) != 0)        ptr++;
2963          while (*ptr != 0)
2964            {
2965          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2966            ptr++;
2967    #ifdef SUPPORT_UTF
2968            if (utf) FORWARDCHAR(ptr);
2969    #endif
2970            }
2971        }        }
2972      else break;      else break;
2973      }      }
# Line 2310  if (*ptr == CHAR_BACKSLASH) Line 2983  if (*ptr == CHAR_BACKSLASH)
2983    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
2984    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
2985    }    }
2986    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
2987    {    {
2988  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2989    if (utf8) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
2990  #endif  #endif
2991    next = *ptr++;    next = *ptr++;
2992    }    }
   
2993  else return FALSE;  else return FALSE;
2994    
2995  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2330  if ((options & PCRE_EXTENDED) != 0) Line 3001  if ((options & PCRE_EXTENDED) != 0)
3001      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3002      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3003        {        {
3004        while (*(++ptr) != 0)        ptr++;
3005          while (*ptr != 0)
3006            {
3007          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3008            ptr++;
3009    #ifdef SUPPORT_UTF
3010            if (utf) FORWARDCHAR(ptr);
3011    #endif
3012            }
3013        }        }
3014      else break;      else break;
3015      }      }
# Line 2340  if ((options & PCRE_EXTENDED) != 0) Line 3018  if ((options & PCRE_EXTENDED) != 0)
3018  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
3019    
3020  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3021    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3022      return FALSE;      return FALSE;
3023    
3024  /* Now compare the next item with the previous opcode. If the previous is a  /* Now compare the next item with the previous opcode. First, handle cases when
3025  positive single character match, "item" either contains the character or, if  the next item is a character. */
 "item" is greater than 127 in utf8 mode, the character's bytes are in  
 utf8_char. */  
   
   
 /* Handle cases when the next item is a character. */  
3026    
3027  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3028    {    {
3029    case OP_CHAR:    case OP_CHAR:
3030  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3031    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
3032  #else  #else
3033    (void)(utf8_char);  /* Keep compiler happy by referencing function argument */    c = *previous;
3034  #endif  #endif
3035    return item != next;    return c != next;
3036    
3037    /* For CHARNC (caseless character) we must check the other case. If we have    /* For CHARI (caseless character) we must check the other case. If we have
3038    Unicode property support, we can use it to test the other case of    Unicode property support, we can use it to test the other case of
3039    high-valued characters. */    high-valued characters. */
3040    
3041    case OP_CHARNC:    case OP_CHARI:
3042  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3043    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
3044  #endif  #else
3045    if (item == next) return FALSE;    c = *previous;
3046  #ifdef SUPPORT_UTF8  #endif
3047    if (utf8)    if (c == next) return FALSE;
3048    #ifdef SUPPORT_UTF
3049      if (utf)
3050      {      {
3051      unsigned int othercase;      unsigned int othercase;
3052      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2380  if (next >= 0) switch(op_code) Line 3055  if (next >= 0) switch(op_code)
3055  #else  #else
3056      othercase = NOTACHAR;      othercase = NOTACHAR;
3057  #endif  #endif
3058      return (unsigned int)item != othercase;      return (unsigned int)c != othercase;
3059      }      }
3060    else    else
3061  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3062    return (item != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
3063    
3064    /* For OP_NOT, "item" must be a single-byte character. */    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
3065      opcodes are not used for multi-byte characters, because they are coded using
3066      an XCLASS instead. */
3067    
3068    case OP_NOT:    case OP_NOT:
3069    if (item == next) return TRUE;    return (c = *previous) == next;
3070    if ((options & PCRE_CASELESS) == 0) return FALSE;  
3071  #ifdef SUPPORT_UTF8    case OP_NOTI:
3072    if (utf8)    if ((c = *previous) == next) return TRUE;
3073    #ifdef SUPPORT_UTF
3074      if (utf)
3075      {      {
3076      unsigned int othercase;      unsigned int othercase;
3077      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2401  if (next >= 0) switch(op_code) Line 3080  if (next >= 0) switch(op_code)
3080  #else  #else
3081      othercase = NOTACHAR;      othercase = NOTACHAR;
3082  #endif  #endif
3083      return (unsigned int)item == othercase;      return (unsigned int)c == othercase;
3084      }      }
3085    else    else
3086  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3087    return (item == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
3088    
3089      /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3090      When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3091    
3092    case OP_DIGIT:    case OP_DIGIT:
3093    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
# Line 2448  if (next >= 0) switch(op_code) Line 3130  if (next >= 0) switch(op_code)
3130      case 0x202f:      case 0x202f:
3131      case 0x205f:      case 0x205f:
3132      case 0x3000:      case 0x3000:
3133      return op_code != OP_HSPACE;      return op_code == OP_NOT_HSPACE;
3134      default:      default:
3135      return op_code == OP_HSPACE;      return op_code != OP_NOT_HSPACE;
3136      }      }
3137    
3138      case OP_ANYNL:
3139    case OP_VSPACE:    case OP_VSPACE:
3140    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3141    switch(next)    switch(next)
# Line 2464  if (next >= 0) switch(op_code) Line 3147  if (next >= 0) switch(op_code)
3147      case 0x85:      case 0x85:
3148      case 0x2028:      case 0x2028:
3149      case 0x2029:      case 0x2029:
3150      return op_code != OP_VSPACE;      return op_code == OP_NOT_VSPACE;
3151      default:      default:
3152      return op_code == OP_VSPACE;      return op_code != OP_NOT_VSPACE;
3153      }      }
3154    
3155    #ifdef SUPPORT_UCP
3156      case OP_PROP:
3157      return check_char_prop(next, previous[0], previous[1], FALSE);
3158    
3159      case OP_NOTPROP:
3160      return check_char_prop(next, previous[0], previous[1], TRUE);
3161    #endif
3162    
3163    default:    default:
3164    return FALSE;    return FALSE;
3165    }    }
3166    
3167    
3168  /* Handle the case when the next item is \d, \s, etc. */  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3169    is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3170    generated only when PCRE_UCP is *not* set, that is, when only ASCII
3171    characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3172    replaced by OP_PROP codes when PCRE_UCP is set. */
3173    
3174  switch(op_code)  switch(op_code)
3175    {    {
3176    case OP_CHAR:    case OP_CHAR:
3177    case OP_CHARNC:    case OP_CHARI:
3178  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3179    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
3180    #else
3181      c = *previous;
3182  #endif  #endif
3183    switch(-next)    switch(-next)
3184      {      {
3185      case ESC_d:      case ESC_d:
3186      return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;      return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3187    
3188      case ESC_D:      case ESC_D:
3189      return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3190    
3191      case ESC_s:      case ESC_s:
3192      return item > 127 || (cd->ctypes[item] & ctype_space) == 0;      return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3193    
3194      case ESC_S:      case ESC_S:
3195      return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3196    
3197      case ESC_w:      case ESC_w:
3198      return item > 127 || (cd->ctypes[item] & ctype_word) == 0;      return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3199    
3200      case ESC_W:      case ESC_W:
3201      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3202    
3203      case ESC_h:      case ESC_h:
3204      case ESC_H:      case ESC_H:
3205      switch(item)      switch(c)
3206        {        {
3207        case 0x09:        case 0x09:
3208        case 0x20:        case 0x20:
# Line 2533  switch(op_code) Line 3230  switch(op_code)
3230    
3231      case ESC_v:      case ESC_v:
3232      case ESC_V:      case ESC_V:
3233      switch(item)      switch(c)
3234        {        {
3235        case 0x0a:        case 0x0a:
3236        case 0x0b:        case 0x0b:
# Line 2547  switch(op_code) Line 3244  switch(op_code)
3244        return -next == ESC_v;        return -next == ESC_v;
3245        }        }
3246    
3247        /* When PCRE_UCP is set, these values get generated for \d etc. Find
3248        their substitutions and process them. The result will always be either
3249        -ESC_p or -ESC_P. Then fall through to process those values. */
3250    
3251    #ifdef SUPPORT_UCP
3252        case ESC_du:
3253        case ESC_DU:
3254        case ESC_wu:
3255        case ESC_WU:
3256        case ESC_su:
3257        case ESC_SU:
3258          {
3259          int temperrorcode = 0;
3260          ptr = substitutes[-next - ESC_DU];
3261          next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3262          if (temperrorcode != 0) return FALSE;
3263          ptr++;    /* For compatibility */
3264          }
3265        /* Fall through */
3266    
3267        case ESC_p:
3268        case ESC_P:
3269          {
3270          int ptype, pdata, errorcodeptr;
3271          BOOL negated;
3272    
3273          ptr--;      /* Make ptr point at the p or P */
3274          ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3275          if (ptype < 0) return FALSE;
3276          ptr++;      /* Point past the final curly ket */
3277    
3278          /* If the property item is optional, we have to give up. (When generated
3279          from \d etc by PCRE_UCP, this test will have been applied much earlier,
3280          to the original \d etc. At this point, ptr will point to a zero byte.) */
3281    
3282          if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3283            STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3284              return FALSE;
3285    
3286          /* Do the property check. */
3287    
3288          return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3289          }
3290    #endif
3291    
3292      default:      default:
3293      return FALSE;      return FALSE;
3294      }      }
3295    
3296      /* In principle, support for Unicode properties should be integrated here as
3297      well. It means re-organizing the above code so as to get hold of the property
3298      values before switching on the op-code. However, I wonder how many patterns
3299      combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3300      these op-codes are never generated.) */
3301    
3302    case OP_DIGIT:    case OP_DIGIT:
3303    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3304           next == -ESC_h || next == -ESC_v;           next == -ESC_h || next == -ESC_v || next == -ESC_R;
3305    
3306    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3307    return next == -ESC_d;    return next == -ESC_d;
3308    
3309    case OP_WHITESPACE:    case OP_WHITESPACE:
3310    return next == -ESC_S || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3311    
3312    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3313    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3314    
3315    case OP_HSPACE:    case OP_HSPACE:
3316    return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3317             next == -ESC_w || next == -ESC_v || next == -ESC_R;
3318    
3319    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
3320    return next == -ESC_h;    return next == -ESC_h;
3321    
3322    /* Can't have \S in here because VT matches \S (Perl anomaly) */    /* Can't have \S in here because VT matches \S (Perl anomaly) */
3323      case OP_ANYNL:
3324    case OP_VSPACE:    case OP_VSPACE:
3325    return next == -ESC_V || next == -ESC_d || next == -ESC_w;    return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3326    
3327    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3328    return next == -ESC_v;    return next == -ESC_v || next == -ESC_R;
3329    
3330    case OP_WORDCHAR:    case OP_WORDCHAR:
3331    return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3332             next == -ESC_v || next == -ESC_R;
3333    
3334    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3335    return next == -ESC_w || next == -ESC_d;    return next == -ESC_w || next == -ESC_d;
# Line 2607  Arguments: Line 3358  Arguments:
3358    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3359    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3360    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3361    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3362    reqbyteptr     set to the last literal character required, else < 0    reqcharptr     set to the last literal character required, else < 0
3363    bcptr          points to current branch chain    bcptr          points to current branch chain
3364      cond_depth     conditional nesting depth
3365    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3366    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3367                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 2619  Returns:         TRUE on success Line 3371  Returns:         TRUE on success
3371  */  */
3372    
3373  static BOOL  static BOOL
3374  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3375    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3376      pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3377    compile_data *cd, int *lengthptr)    compile_data *cd, int *lengthptr)
3378  {  {
3379  int repeat_type, op_type;  int repeat_type, op_type;
3380  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3381  int bravalue = 0;  int bravalue = 0;
3382  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3383  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
3384  int zeroreqbyte, zerofirstbyte;  pcre_int32 zeroreqchar, zerofirstchar;
3385  int req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
3386  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3387  int after_manual_callout = 0;  int after_manual_callout = 0;
3388  int length_prevgroup = 0;  int length_prevgroup = 0;
3389  register int c;  register int c;
3390  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3391  uschar *last_code = code;  pcre_uchar *last_code = code;
3392  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3393  uschar *tempcode;  pcre_uchar *tempcode;
3394  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3395  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3396  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3397  const uschar *tempptr;  const pcre_uchar *tempptr;
3398  uschar *previous = NULL;  const pcre_uchar *nestptr = NULL;
3399  uschar *previous_callout = NULL;  pcre_uchar *previous = NULL;
3400  uschar *save_hwm = NULL;  pcre_uchar *previous_callout = NULL;
3401  uschar classbits[32];  pcre_uchar *save_hwm = NULL;
3402    pcre_uint8 classbits[32];
3403  #ifdef SUPPORT_UTF8  
3404  BOOL class_utf8;  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3405  BOOL utf8 = (options & PCRE_UTF8) != 0;  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3406  uschar *class_utf8data;  dynamically as we process the pattern. */
3407  uschar *class_utf8data_base;  
3408  uschar utf8_char[6];  #ifdef SUPPORT_UTF
3409    /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3410    BOOL utf = (options & PCRE_UTF8) != 0;
3411    pcre_uchar utf_chars[6];
3412  #else  #else
3413  BOOL utf8 = FALSE;  BOOL utf = FALSE;
3414  uschar *utf8_char = NULL;  #endif
3415    
3416    /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3417    
3418    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3419    BOOL xclass;
3420    pcre_uchar *class_uchardata;
3421    pcre_uchar *class_uchardata_base;
3422  #endif  #endif
3423    
3424  #ifdef DEBUG  #ifdef PCRE_DEBUG
3425  if (lengthptr != NULL) DPRINTF((">> start branch\n"));  if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3426  #endif  #endif
3427    
# Line 2669  greedy_non_default = greedy_default ^ 1; Line 3432  greedy_non_default = greedy_default ^ 1;
3432    
3433  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3434  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3435  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3436  find one.  find one.
3437    
3438  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3439  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3440  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3441  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3442    
3443  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3444    
3445  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3446  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3447  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS value leaves the lower 28 bits empty. It is added into the
3448  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3449    value. This is used only for ASCII characters. */
3450    
3451  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3452    
3453  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3454    
# Line 2696  for (;; ptr++) Line 3460  for (;; ptr++)
3460    BOOL is_quantifier;    BOOL is_quantifier;
3461    BOOL is_recurse;    BOOL is_recurse;
3462    BOOL reset_bracount;    BOOL reset_bracount;
3463    int class_charcount;    int class_has_8bitchar;
3464      int class_single_char;
3465    int class_lastchar;    int class_lastchar;
3466    int newoptions;    int newoptions;
3467    int recno;    int recno;
3468    int refsign;    int refsign;
3469    int skipbytes;    int skipbytes;
3470    int subreqbyte;    int subreqchar;
3471    int subfirstbyte;    int subfirstchar;
3472    int terminator;    int terminator;
3473    int mclength;    int mclength;
3474    uschar mcbuffer[8];    int tempbracount;
3475      pcre_uchar mcbuffer[8];
3476    
3477    /* Get next byte in the pattern */    /* Get next character in the pattern */
3478    
3479    c = *ptr;    c = *ptr;
3480    
3481      /* If we are at the end of a nested substitution, revert to the outer level
3482      string. Nesting only happens one level deep. */
3483    
3484      if (c == 0 && nestptr != NULL)
3485        {
3486        ptr = nestptr;
3487        nestptr = NULL;
3488        c = *ptr;
3489        }
3490    
3491    /* If we are in the pre-compile phase, accumulate the length used for the    /* If we are in the pre-compile phase, accumulate the length used for the
3492    previous cycle of this loop. */    previous cycle of this loop. */
3493    
3494    if (lengthptr != NULL)    if (lengthptr != NULL)
3495      {      {
3496  #ifdef DEBUG  #ifdef PCRE_DEBUG
3497      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3498  #endif  #endif
3499      if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3500            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3501        {        {
3502        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3503        goto FAILED;        goto FAILED;
# Line 2742  for (;; ptr++) Line 3519  for (;; ptr++)
3519        goto FAILED;        goto FAILED;
3520        }        }
3521    
3522      *lengthptr += code - last_code;      *lengthptr += (int)(code - last_code);
3523      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3524          (int)(code - last_code), c, c));
3525    
3526      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3527      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
3528      if "previous" is NULL, reset the current code pointer to the start. */      if "previous" is NULL, reset the current code pointer to the start. */
# Line 2753  for (;; ptr++) Line 3531  for (;; ptr++)
3531        {        {
3532        if (previous > orig_code)        if (previous > orig_code)
3533          {          {
3534          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3535          code -= previous - orig_code;          code -= previous - orig_code;
3536          previous = orig_code;          previous = orig_code;
3537          }          }
# Line 2769  for (;; ptr++) Line 3547  for (;; ptr++)
3547    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3548    reference list. */    reference list. */
3549    
3550    else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3551               WORK_SIZE_SAFETY_MARGIN)
3552      {      {
3553      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3554      goto FAILED;      goto FAILED;
# Line 2817  for (;; ptr++) Line 3596  for (;; ptr++)
3596      previous_callout = NULL;      previous_callout = NULL;
3597      }      }
3598    
3599    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3600    
3601    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3602      {      {
3603      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
3604      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3605        {        {
3606        while (*(++ptr) != 0)        ptr++;
3607          while (*ptr != 0)
3608          {          {
3609          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3610            ptr++;
3611    #ifdef SUPPORT_UTF
3612            if (utf) FORWARDCHAR(ptr);
3613    #endif
3614          }          }
3615        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3616    
# Line 2849  for (;; ptr++) Line 3633  for (;; ptr++)
3633      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3634      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3635      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
3636      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3637      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
3638      *codeptr = code;      *codeptr = code;
3639      *ptrptr = ptr;      *ptrptr = ptr;
3640      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 2860  for (;; ptr++) Line 3644  for (;; ptr++)
3644          *errorcodeptr = ERR20;          *errorcodeptr = ERR20;
3645          goto FAILED;          goto FAILED;
3646          }          }
3647        *lengthptr += code - last_code;   /* To include callout length */        *lengthptr += (int)(code - last_code);   /* To include callout length */
3648        DPRINTF((">> end branch\n"));        DPRINTF((">> end branch\n"));
3649        }        }
3650      return TRUE;      return TRUE;
# Line 2871  for (;; ptr++) Line 3655  for (;; ptr++)
3655      the setting of any following char as a first character. */      the setting of any following char as a first character. */
3656    
3657      case CHAR_CIRCUMFLEX_ACCENT:      case CHAR_CIRCUMFLEX_ACCENT:
3658        previous = NULL;
3659      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3660        {        {
3661        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3662          *code++ = OP_CIRCM;
3663        }        }
3664      previous = NULL;      else *code++ = OP_CIRC;
     *code++ = OP_CIRC;  
3665      break;      break;
3666    
3667      case CHAR_DOLLAR_SIGN:      case CHAR_DOLLAR_SIGN:
3668      previous = NULL;      previous = NULL;
3669      *code++ = OP_DOLL;      *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3670      break;      break;
3671    
3672      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3673      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3674    
3675      case CHAR_DOT:      case CHAR_DOT:
3676      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3677      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
3678      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
3679      previous = code;      previous = code;
3680      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3681      break;      break;
# Line 2945  for (;; ptr++) Line 3730  for (;; ptr++)
3730          {          {
3731          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
3732            ptr++;            ptr++;
3733          else if (strncmp((const char *)ptr+1,          else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
                           STR_Q STR_BACKSLASH STR_E, 3) == 0)  
3734            ptr += 3;            ptr += 3;
3735          else          else
3736            break;            break;
# Line 2965  for (;; ptr++) Line 3749  for (;; ptr++)
3749          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3750        {        {
3751        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3752        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3753        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
3754        break;        break;
3755        }        }
3756    
# Line 2976  for (;; ptr++) Line 3760  for (;; ptr++)
3760    
3761      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3762    
3763      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3764      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3765      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1, if the class only contains
3766        a single character. */
3767    
3768      class_charcount = 0;      class_has_8bitchar = 0;
3769        class_single_char = 0;
3770      class_lastchar = -1;      class_lastchar = -1;
3771    
3772      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
# Line 2988  for (;; ptr++) Line 3774  for (;; ptr++)
3774      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3775      */      */
3776    
3777      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3778    
3779  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3780      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;                           /* No chars >= 256 */
3781      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3782      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3783  #endif  #endif
3784    
3785      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3002  for (;; ptr++) Line 3788  for (;; ptr++)
3788    
3789      if (c != 0) do      if (c != 0) do
3790        {        {
3791        const uschar *oldptr;        const pcre_uchar *oldptr;
3792    
3793  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3794        if (utf8 && c > 127)        if (utf && HAS_EXTRALEN(c))
3795          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3796          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3797          }          }
3798    #endif
3799    
3800        /* In the pre-compile phase, accumulate the length of any UTF-8 extra  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3801          /* In the pre-compile phase, accumulate the length of any extra
3802        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
3803        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
3804        (which is on the stack). */        (which is on the stack). */
3805    
3806        if (lengthptr != NULL)        if (lengthptr != NULL)
3807          {          {
3808          *lengthptr += class_utf8data - class_utf8data_base;          *lengthptr += class_uchardata - class_uchardata_base;
3809          class_utf8data = class_utf8data_base;          class_uchardata = class_uchardata_base;
3810          }          }
   
3811  #endif  #endif
3812    
3813        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 3048  for (;; ptr++) Line 3835  for (;; ptr++)
3835          {          {
3836          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3837          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3838          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3839          uschar pbits[32];          pcre_uint8 pbits[32];
3840    
3841          if (ptr[1] != CHAR_COLON)          if (ptr[1] != CHAR_COLON)
3842            {            {
# Line 3065  for (;; ptr++) Line 3852  for (;; ptr++)
3852            ptr++;            ptr++;
3853            }            }
3854    
3855          posix_class = check_posix_name(ptr, tempptr - ptr);          posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3856          if (posix_class < 0)          if (posix_class < 0)
3857            {            {
3858            *errorcodeptr = ERR30;            *errorcodeptr = ERR30;
# Line 3079  for (;; ptr++) Line 3866  for (;; ptr++)
3866          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3867            posix_class = 0;            posix_class = 0;
3868    
3869          /* We build the bit map for the POSIX class in a chunk of local store          /* When PCRE_UCP is set, some of the POSIX classes are converted to
3870          because we may be adding and subtracting from it, and we don't want to          different escape sequences that use Unicode properties. */
3871          subtract bits that may be in the main map already. At the end we or the  
3872          result into the bit map that is being built. */  #ifdef SUPPORT_UCP
3873            if ((options & PCRE_UCP) != 0)
3874              {
3875              int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3876              if (posix_substitutes[pc] != NULL)
3877                {
3878                nestptr = tempptr + 1;
3879                ptr = posix_substitutes[pc] - 1;
3880                continue;
3881                }
3882              }
3883    #endif
3884            /* In the non-UCP case, we build the bit map for the POSIX class in a
3885            chunk of local store because we may be adding and subtracting from it,
3886            and we don't want to subtract bits that may be in the main map already.
3887            At the end we or the result into the bit map that is being built. */
3888    
3889          posix_class *= 3;          posix_class *= 3;
3890    
3891          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3892    
3893          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3894            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3895    
3896          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3897    
# Line 3120  for (;; ptr++) Line 3922  for (;; ptr++)
3922            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3923    
3924          ptr = tempptr + 1;          ptr = tempptr + 1;
3925          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3926            class_has_8bitchar = 1;
3927            /* Every class contains at least two characters. */
3928            class_single_char = 2;
3929          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3930          }          }
3931    
3932        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3933        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3934        case. Inside a class (and only there) it is treated as backspace.        case. Inside a class (and only there) it is treated as backspace. We
3935        Elsewhere it marks a word boundary. Other escapes have preset maps ready        assume that other escapes have more than one character in them, so
3936        to 'or' into the one we are building. We assume they have more than one        speculatively set both class_has_8bitchar and class_single_char bigger
3937        character in them, so set class_charcount bigger than one. */        than one. Unrecognized escapes fall through and are either treated
3938          as literal characters (by default), or are faulted if
3939          PCRE_EXTRA is set. */
3940    
3941        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
3942          {          {
3943          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3944          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3945    
3946          if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3947          else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */          else if (-c == ESC_N)            /* \N is not supported in a class */
3948          else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */            {
3949              *errorcodeptr = ERR71;
3950              goto FAILED;
3951              }
3952          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3953            {            {
3954            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3152  for (;; ptr++) Line 3962  for (;; ptr++)
3962    
3963          if (c < 0)          if (c < 0)
3964            {            {
3965            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3966            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
3967              class_has_8bitchar++;
3968            /* Save time by not doing this in the pre-compile phase. */            /* Every class contains at least two characters. */
3969              class_single_char += 2;
3970    
3971            if (lengthptr == NULL) switch (-c)            switch (-c)
3972              {              {
3973    #ifdef SUPPORT_UCP
3974                case ESC_du:     /* These are the values given for \d etc */
3975                case ESC_DU:     /* when PCRE_UCP is set. We replace the */
3976                case ESC_wu:     /* escape sequence with an appropriate \p */
3977                case ESC_WU:     /* or \P to test Unicode properties instead */
3978                case ESC_su:     /* of the default ASCII testing. */
3979                case ESC_SU:
3980                nestptr = ptr;
3981                ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3982                class_has_8bitchar--;                /* Undo! */
3983                continue;
3984    #endif
3985              case ESC_d:              case ESC_d:
3986              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3987              continue;              continue;
# Line 3177  for (;; ptr++) Line 4000  for (;; ptr++)
4000              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4001              continue;              continue;
4002    
4003                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4004                if it was previously set by something earlier in the character
4005                class. */
4006    
4007              case ESC_s:              case ESC_s:
4008              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
4009              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
4010                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4011              continue;              continue;
4012    
4013              case ESC_S:              case ESC_S:
# Line 3188  for (;; ptr++) Line 4016  for (;; ptr++)
4016              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
4017              continue;              continue;
4018    
4019              default:    /* Not recognized; fall through */              case ESC_h:
             break;      /* Need "default" setting to stop compiler warning. */  
             }  
   
           /* In the pre-compile phase, just do the recognition. */  
   
           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||  
                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;  
   
           /* We need to deal with \H, \h, \V, and \v in both phases because  
           they use extra memory. */  
   
           if (-c == ESC_h)  
             {  
4020              SETBIT(classbits, 0x09); /* HT */              SETBIT(classbits, 0x09); /* HT */
4021              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
4022              SETBIT(classbits, 0xa0); /* NBSP */              SETBIT(classbits, 0xa0); /* NBSP */
4023  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4024              if (utf8)              if (utf)
4025                {                {
4026                class_utf8 = TRUE;                xclass = TRUE;
4027                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4028                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4029                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4030                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4031                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4032                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4033                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200A, class_uchardata);
4034                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4035                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4036                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4037                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4038                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4039                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4040                }                }
4041  #endif  #endif
4042              continue;              continue;
             }  
4043    
4044            if (-c == ESC_H)              case ESC_H:
             {  
4045              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
4046                {                {
4047                int x = 0xff;                int x = 0xff;
# Line 3242  for (;; ptr++) Line 4055  for (;; ptr++)
4055                classbits[c] |= x;                classbits[c] |= x;
4056                }                }
4057    
4058  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4059              if (utf8)              if (utf)
4060                {                {
4061                class_utf8 = TRUE;                xclass = TRUE;
4062                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4063                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4064                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4065                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4066                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4067                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4068                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4069                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4070                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4071                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4072                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200B, class_uchardata);
4073                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4074                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4075                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4076                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4077                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4078                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4079                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4080                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4081                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4082                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4083                }                }
4084  #endif  #endif
4085              continue;              continue;
             }  
4086    
4087            if (-c == ESC_v)              case ESC_v:
             {  
4088              SETBIT(classbits, 0x0a); /* LF */              SETBIT(classbits, 0x0a); /* LF */
4089              SETBIT(classbits, 0x0b); /* VT */              SETBIT(classbits, 0x0b); /* VT */
4090              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, 0x0c); /* FF */
4091              SETBIT(classbits, 0x0d); /* CR */              SETBIT(classbits, 0x0d); /* CR */
4092              SETBIT(classbits, 0x85); /* NEL */              SETBIT(classbits, 0x85); /* NEL */
4093  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4094              if (utf8)              if (utf)
4095                {                {
4096                class_utf8 = TRUE;                xclass = TRUE;
4097                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4098                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4099                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4100                }                }
4101  #endif  #endif
4102              continue;              continue;
             }  
4103    
4104            if (-c == ESC_V)              case ESC_V:
             {  
4105              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
4106                {                {
4107                int x = 0xff;                int x = 0xff;
# Line 3309  for (;; ptr++) Line 4118  for (;; ptr++)
4118                classbits[c] |= x;                classbits[c] |= x;
4119                }                }
4120    
4121  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4122              if (utf8)              if (utf)
4123                {                {
4124                class_utf8 = TRUE;                xclass = TRUE;
4125                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4126                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4127                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4128                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4129                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4130                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4131                }                }
4132  #endif  #endif
4133              continue;              continue;
             }  
   
           /* We need to deal with \P and \p in both phases. */  
4134    
4135  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4136            if (-c == ESC_p || -c == ESC_P)              case ESC_p:
4137              {              case ESC_P:
4138              BOOL negated;                {
4139              int pdata;                BOOL negated;
4140              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int pdata;
4141              if (ptype < 0) goto FAILED;                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4142              class_utf8 = TRUE;                if (ptype < 0) goto FAILED;
4143              *class_utf8data++ = ((-c == ESC_p) != negated)?                xclass = TRUE;
4144                XCL_PROP : XCL_NOTPROP;                *class_uchardata++ = ((-c == ESC_p) != negated)?
4145              *class_utf8data++ = ptype;                  XCL_PROP : XCL_NOTPROP;
4146              *class_utf8data++ = pdata;                *class_uchardata++ = ptype;
4147              class_charcount -= 2;   /* Not a < 256 character */                *class_uchardata++ = pdata;
4148              continue;                class_has_8bitchar--;                /* Undo! */
4149              }                continue;
4150                  }
4151  #endif  #endif
4152            /* Unrecognized escapes are faulted if PCRE is running in its              /* Unrecognized escapes are faulted if PCRE is running in its
4153            strict mode. By default, for compatibility with Perl, they are              strict mode. By default, for compatibility with Perl, they are
4154            treated as literals. */              treated as literals. */
4155    
4156            if ((options & PCRE_EXTRA) != 0)              default:
4157              {              if ((options & PCRE_EXTRA) != 0)
4158              *errorcodeptr = ERR7;                {
4159              goto FAILED;                *errorcodeptr = ERR7;
4160                  goto FAILED;
4161                  }
4162                class_has_8bitchar--;    /* Undo the speculative increase. */
4163                class_single_char -= 2;  /* Undo the speculative increase. */
4164                c = *ptr;                /* Get the final character and fall through */
4165                break;
4166              }              }
   
           class_charcount -= 2;  /* Undo the default count from above */  
           c = *ptr;              /* Get the final character and fall through */  
4167            }            }
4168    
4169          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4170          greater than 256 in UTF-8 mode. */          greater than 256. */
4171    
4172          }   /* End of backslash handling */          }   /* End of backslash handling */
4173    
# Line 3405  for (;; ptr++) Line 4215  for (;; ptr++)
4215            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4216            }            }
4217    
4218  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4219          if (utf8)          if (utf)
4220            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4221            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4222            }            }
# Line 3423  for (;; ptr++) Line 4233  for (;; ptr++)
4233            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4234            if (*errorcodeptr != 0) goto FAILED;            if (*errorcodeptr != 0) goto FAILED;
4235    
4236            /* \b is backspace; \X is literal X; \R is literal R; any other            /* \b is backspace; any other special means the '-' was literal */
           special means the '-' was literal */  
4237    
4238            if (d < 0)            if (d < 0)
4239              {              {
4240              if (d == -ESC_b) d = CHAR_BS;              if (d == -ESC_b) d = CHAR_BS; else
             else if (d == -ESC_X) d = CHAR_X;  
             else if (d == -ESC_R) d = CHAR_R; else  
4241                {                {
4242                ptr = oldptr;                ptr = oldptr;
4243                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
# Line 3453  for (;; ptr++) Line 4260  for (;; ptr++)
4260    
4261          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4262    
4263            /* Since we found a character range, single character optimizations
4264            cannot be done anymore. */
4265            class_single_char = 2;
4266    
4267          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4268          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4269          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4270          available. */          available. */
4271    
4272  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4273          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4274    #elif defined  SUPPORT_UTF
4275            if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4276    #elif !(defined COMPILE_PCRE8)
4277            if (d > 255)
4278    #endif
4279    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4280            {            {
4281            class_utf8 = TRUE;            xclass = TRUE;
4282    
4283            /* With UCP support, we can find the other case equivalents of            /* With UCP support, we can find the other case equivalents of
4284            the relevant characters. There may be several ranges. Optimize how            the relevant characters. There may be several ranges. Optimize how
4285            they fit with the basic range. */            they fit with the basic range. */
4286    
4287  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4288    #ifndef COMPILE_PCRE8
4289              if (utf && (options & PCRE_CASELESS) != 0)
4290    #else
4291            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4292    #endif
4293              {              {
4294              unsigned int occ, ocd;              unsigned int occ, ocd;
4295              unsigned int cc = c;              unsigned int cc = c;
# Line 3494  for (;; ptr++) Line 4315  for (;; ptr++)
4315    
4316                if (occ == ocd)                if (occ == ocd)
4317                  {                  {
4318                  *class_utf8data++ = XCL_SINGLE;                  *class_uchardata++ = XCL_SINGLE;
4319                  }                  }
4320                else                else
4321                  {                  {
4322                  *class_utf8data++ = XCL_RANGE;                  *class_uchardata++ = XCL_RANGE;
4323                  class_utf8data += _pcre_ord2utf8(occ, class_utf8data);                  class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4324                  }                  }
4325                class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);                class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4326                }                }
4327              }              }
4328  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 3509  for (;; ptr++) Line 4330  for (;; ptr++)
4330            /* Now record the original range, possibly modified for UCP caseless            /* Now record the original range, possibly modified for UCP caseless
4331            overlapping ranges. */            overlapping ranges. */
4332    
4333            *class_utf8data++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4334            class_utf8data += _pcre_ord2utf8(c, class_utf8data);  #ifdef SUPPORT_UTF
4335            class_utf8data += _pcre_ord2utf8(d, class_utf8data);  #ifndef COMPILE_PCRE8
4336              if (utf)
4337                {
4338                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4339                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4340                }
4341              else
4342                {
4343                *class_uchardata++ = c;
4344                *class_uchardata++ = d;
4345                }
4346    #else
4347              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4348              class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4349    #endif
4350    #else /* SUPPORT_UTF */
4351              *class_uchardata++ = c;
4352              *class_uchardata++ = d;
4353    #endif /* SUPPORT_UTF */
4354    
4355            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4356            caseless matching for UTF-8 characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
4357            for the smaller ones. */            for the smaller ones. As for 16 bit characters without UTF, we
4358              can still use the bit map for characters in the range 0-255. */
4359    
4360  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4361            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4362  #else            if (utf)
4363            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  #endif
4364                continue;    /* With next character in the class */
4365    #endif  /* SUPPORT_UCP */
4366    
4367    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4368              if (utf)
4369                {
4370                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4371                /* Adjust upper limit and fall through to set up the map */
4372                d = 127;
4373                }
4374              else
4375                {
4376                if (c > 255) continue;
4377                /* Adjust upper limit and fall through to set up the map */
4378                d = 255;
4379                }
4380    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4381              if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4382            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
   
4383            d = 127;            d = 127;
4384    #else
4385  #endif  /* SUPPORT_UCP */            if (c > 255) continue;
4386              /* Adjust upper limit and fall through to set up the map */
4387              d = 255;
4388    #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4389            }            }
4390  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4391    
4392          /* We use the bit map for all cases when not in UTF-8 mode; else          /* We use the bit map for 8 bit mode, or when the characters fall
4393          ranges that lie entirely within 0-127 when there is UCP support; else          partially or entirely within the [0-255] ([0-127] for UCP) ranges. */
         for partial ranges without UCP support. */  
4394    
4395          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4396    
4397          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4398    
# Line 3558  for (;; ptr++) Line 4415  for (;; ptr++)
4415    
4416        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4417    
4418        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4419          if (class_single_char < 2) class_single_char++;
4420          class_lastchar = c;
4421    
4422  #ifdef SUPPORT_UTF8        /* Handle a character that cannot go in the bit map */
4423        if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4424          {        if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4425          class_utf8 = TRUE;  #elif defined SUPPORT_UTF
4426          *class_utf8data++ = XCL_SINGLE;        if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4427          class_utf8data += _pcre_ord2utf8(c, class_utf8data);  #elif !(defined COMPILE_PCRE8)
4428          if (c > 255)
4429    #endif
4430    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4431            {
4432            xclass = TRUE;
4433            *class_uchardata++ = XCL_SINGLE;
4434    #ifdef SUPPORT_UTF
4435    #ifndef COMPILE_PCRE8
4436            /* In non 8 bit mode, we can get here even
4437            if we are not in UTF mode. */
4438            if (!utf)
4439              *class_uchardata++ = c;
4440            else
4441    #endif
4442              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4443    #else /* SUPPORT_UTF */
4444            *class_uchardata++ = c;
4445    #endif /* SUPPORT_UTF */
4446    
4447  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4448    #ifdef COMPILE_PCRE8
4449          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4450    #else
4451            /* In non 8 bit mode, we can get here even
4452            if we are not in UTF mode. */
4453            if (utf && (options & PCRE_CASELESS) != 0)
4454    #endif
4455            {            {
4456            unsigned int othercase;            unsigned int othercase;
4457            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((othercase = UCD_OTHERCASE(c)) != c)
4458              {              {
4459              *class_utf8data++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4460              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4461    
4462                /* In the first pass, we must accumulate the space used here for
4463                the following reason: If this ends up as the only character in the
4464                class, it will later be optimized down to a single character.
4465                However, that uses less memory, and so if this happens to be at the
4466                end of the regex, there will not be enough memory in the real
4467                compile for this temporary storage. */
4468    
4469                if (lengthptr != NULL)
4470                  {
4471                  *lengthptr += class_uchardata - class_uchardata_base;
4472                  class_uchardata = class_uchardata_base;
4473                  }
4474              }              }
4475            }            }
4476  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
4477    
4478          }          }
4479        else        else
4480  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
   
4481        /* Handle a single-byte character */        /* Handle a single-byte character */
4482          {          {
4483            class_has_8bitchar = 1;
4484          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4485          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4486            {            {
4487            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c];   /* flip case */
4488            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4489            }            }
         class_charcount++;  
         class_lastchar = c;  
4490          }          }
       }  
   
     /* Loop until ']' reached. This "while" is the end of the "do" above. */  
4491    
     while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));  
   
     if (c == 0)                          /* Missing terminating ']' */  
       {  
       *errorcodeptr = ERR6;  
       goto FAILED;  
4492        }        }
4493    
4494        /* Loop until ']' reached. This "while" is the end of the "do" far above.
4495  /* This code has been disabled because it would mean that \s counts as      If we are at the end of an internal nested string, revert to the outer
4496  an explicit \r or \n reference, and that's not really what is wanted. Now      string. */
4497  we set the flag only if there is a literal "\r" or "\n" in the class. */  
4498        while (((c = *(++ptr)) != 0 ||
4499  #if 0             (nestptr != NULL &&
4500      /* Remember whether \r or \n are in this class */               (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
4501               (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4502      if (negate_class)  
4503        {      /* Check for missing terminating ']' */
4504        if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;  
4505        }      if (c == 0)
     else  
4506        {        {
4507        if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;        *errorcodeptr = ERR6;
4508          goto FAILED;
4509        }        }
 #endif  
4510    
4511        /* COMMENT NEEDS FIXING - no longer true.
4512      /* If class_charcount is 1, we saw precisely one character whose value is      If class_charcount is 1, we saw precisely one character whose value is
4513      less than 256. As long as there were no characters >= 128 and there was no      less than 256. As long as there were no characters >= 128 and there was no
4514      use of \p or \P, in other words, no use of any XCLASS features, we can      use of \p or \P, in other words, no use of any XCLASS features, we can
4515      optimize.      optimize.
4516    
4517      In UTF-8 mode, we can optimize the negative case only if there were no      In UTF-8 mode, we can optimize the negative case only if there were no
4518      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4519      operate on single-bytes only. This is an historical hangover. Maybe one day      operate on single-byte characters only. This is an historical hangover.
4520      we can tidy these opcodes to handle multi-byte characters.      Maybe one day we can tidy these opcodes to handle multi-byte characters.
4521    
4522      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
4523      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4524      that OP_NOT does not support multibyte characters. In the positive case, it      Note that OP_NOT[I] does not support multibyte characters. In the positive
4525      can cause firstbyte to be set. Otherwise, there can be no first char if      case, it can cause firstchar to be set. Otherwise, there can be no first
4526      this item is first, whatever repeat count may follow. In the case of      char if this item is first, whatever repeat count may follow. In the case
4527      reqbyte, save the previous value for reinstating. */      of reqchar, save the previous value for reinstating. */
4528    
4529  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4530      if (class_charcount == 1 && !class_utf8 &&      if (class_single_char == 1 && (!utf || !negate_class
4531        (!utf8 || !negate_class || class_lastchar < 128))        || class_lastchar < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4532  #else  #else
4533      if (class_charcount == 1)      if (class_single_char == 1)
4534  #endif  #endif
4535        {        {
4536        zeroreqbyte = reqbyte;        zeroreqchar = reqchar;
4537    
4538        /* The OP_NOT opcode works on one-byte characters only. */        /* The OP_NOT[I] opcodes work on single characters only. */
4539    
4540        if (negate_class)        if (negate_class)
4541          {          {
4542          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4543          zerofirstbyte = firstbyte;          zerofirstchar = firstchar;
4544          *code++ = OP_NOT;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4545          *code++ = class_lastchar;          *code++ = class_lastchar;
4546          break;          break;
4547          }          }
# Line 3665  we set the flag only if there is a liter Line 4549  we set the flag only if there is a liter
4549        /* For a single, positive character, get the value into mcbuffer, and        /* For a single, positive character, get the value into mcbuffer, and
4550        then we can handle this with the normal one-character code. */        then we can handle this with the normal one-character code. */
4551    
4552  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4553        if (utf8 && class_lastchar > 127)        if (utf && class_lastchar > MAX_VALUE_FOR_SINGLE_CHAR)
4554          mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);          mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);
4555        else        else
4556  #endif  #endif
4557          {          {
# Line 3679  we set the flag only if there is a liter Line 4563  we set the flag only if there is a liter
4563    
4564      /* The general case - not the one-char optimization. If this is the first      /* The general case - not the one-char optimization. If this is the first
4565      thing in the branch, there can be no first char setting, whatever the      thing in the branch, there can be no first char setting, whatever the
4566      repeat count. Any reqbyte setting must remain unchanged after any kind of      repeat count. Any reqchar setting must remain unchanged after any kind of
4567      repeat. */      repeat. */
4568    
4569      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;