/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 544 by ph10, Tue Jun 15 17:20:55 2010 UTC revision 1078 by chpe, Tue Oct 16 15:55:00 2012 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
69  /* Macro for setting individual bits in class bitmaps. */  /* Macro for setting individual bits in class bitmaps. */
70    
71  #define SETBIT(a,b) a[b/8] |= (1 << (b%8))  #define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))
72    
73  /* Maximum length value to check against when making sure that the integer that  /* Maximum length value to check against when making sure that the integer that
74  holds the compiled pattern length does not overflow. We make it a bit less than  holds the compiled pattern length does not overflow. We make it a bit less than
# Line 73  to check them every time. */ Line 77  to check them every time. */
77    
78  #define OFLOW_MAX (INT_MAX - 20)  #define OFLOW_MAX (INT_MAX - 20)
79    
80    /* Definitions to allow mutual recursion */
81    
82    static int
83      add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84        const pcre_uint32 *, unsigned int);
85    
86    static BOOL
87      compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88        pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89        compile_data *, int *);
90    
91    
92    
93  /*************************************************  /*************************************************
94  *      Code parameters and static tables         *  *      Code parameters and static tables         *
# Line 88  so this number is very generous. Line 104  so this number is very generous.
104  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
105  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
106  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
108    filled up by repetitions of forward references, for example patterns like
109    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110    that the workspace is expanded using malloc() in this situation. The value
111    below is therefore a minimum, and we put a maximum on it for safety. The
112    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113    kicks in at the same number of forward references in all cases. */
114    
115  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117    
118  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
119  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
120    
121  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
122    
123    /* Private flags added to firstchar and reqchar. */
124    
125    #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
126    #define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
127    /* Negative values for the firstchar and reqchar flags */
128    #define REQ_UNSET       (-2)
129    #define REQ_NONE        (-1)
130    
131    /* Repeated character flags. */
132    
133    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
134    
135  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
136  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 231  static const char posix_names[] = Line 265  static const char posix_names[] =
265    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
266    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
267    
268  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
269    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
270    
271  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 266  substitutes must be in the order of the Line 300  substitutes must be in the order of the
300  both positive and negative cases. NULL means no substitute. */  both positive and negative cases. NULL means no substitute. */
301    
302  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
303  static const uschar *substitutes[] = {  static const pcre_uchar string_PNd[]  = {
304    (uschar *)"\\P{Nd}",    /* \D */    CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
305    (uschar *)"\\p{Nd}",    /* \d */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */  static const pcre_uchar string_pNd[]  = {
307    (uschar *)"\\p{Xsp}",   /* \s */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
308    (uschar *)"\\P{Xwd}",   /* \W */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
309    (uschar *)"\\p{Xwd}"    /* \w */  static const pcre_uchar string_PXsp[] = {
310      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
311      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
312    static const pcre_uchar string_pXsp[] = {
313      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
314      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
315    static const pcre_uchar string_PXwd[] = {
316      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
317      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
318    static const pcre_uchar string_pXwd[] = {
319      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
320      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
321    
322    static const pcre_uchar *substitutes[] = {
323      string_PNd,           /* \D */
324      string_pNd,           /* \d */
325      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
326      string_pXsp,          /* \s */
327      string_PXwd,          /* \W */
328      string_pXwd           /* \w */
329  };  };
330    
331  static const uschar *posix_substitutes[] = {  static const pcre_uchar string_pL[] =   {
332    (uschar *)"\\p{L}",     /* alpha */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
333    (uschar *)"\\p{Ll}",    /* lower */    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
334    (uschar *)"\\p{Lu}",    /* upper */  static const pcre_uchar string_pLl[] =  {
335    (uschar *)"\\p{Xan}",   /* alnum */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
336    NULL,                   /* ascii */    CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
337    (uschar *)"\\h",        /* blank */  static const pcre_uchar string_pLu[] =  {
338    NULL,                   /* cntrl */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
339    (uschar *)"\\p{Nd}",    /* digit */    CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
340    NULL,                   /* graph */  static const pcre_uchar string_pXan[] = {
341    NULL,                   /* print */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
342    NULL,                   /* punct */    CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
343    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */  static const pcre_uchar string_h[] =    {
344    (uschar *)"\\p{Xwd}",   /* word */    CHAR_BACKSLASH, CHAR_h, '\0' };
345    NULL,                   /* xdigit */  static const pcre_uchar string_pXps[] = {
346      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
347      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
348    static const pcre_uchar string_PL[] =   {
349      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
350      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
351    static const pcre_uchar string_PLl[] =  {
352      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
353      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
354    static const pcre_uchar string_PLu[] =  {
355      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
356      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
357    static const pcre_uchar string_PXan[] = {
358      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
359      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
360    static const pcre_uchar string_H[] =    {
361      CHAR_BACKSLASH, CHAR_H, '\0' };
362    static const pcre_uchar string_PXps[] = {
363      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
364      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
365    
366    static const pcre_uchar *posix_substitutes[] = {
367      string_pL,            /* alpha */
368      string_pLl,           /* lower */
369      string_pLu,           /* upper */
370      string_pXan,          /* alnum */
371      NULL,                 /* ascii */
372      string_h,             /* blank */
373      NULL,                 /* cntrl */
374      string_pNd,           /* digit */
375      NULL,                 /* graph */
376      NULL,                 /* print */
377      NULL,                 /* punct */
378      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
379      string_pXwd,          /* word */
380      NULL,                 /* xdigit */
381    /* Negated cases */    /* Negated cases */
382    (uschar *)"\\P{L}",     /* ^alpha */    string_PL,            /* ^alpha */
383    (uschar *)"\\P{Ll}",    /* ^lower */    string_PLl,           /* ^lower */
384    (uschar *)"\\P{Lu}",    /* ^upper */    string_PLu,           /* ^upper */
385    (uschar *)"\\P{Xan}",   /* ^alnum */    string_PXan,          /* ^alnum */
386    NULL,                   /* ^ascii */    NULL,                 /* ^ascii */
387    (uschar *)"\\H",        /* ^blank */    string_H,             /* ^blank */
388    NULL,                   /* ^cntrl */    NULL,                 /* ^cntrl */
389    (uschar *)"\\P{Nd}",    /* ^digit */    string_PNd,           /* ^digit */
390    NULL,                   /* ^graph */    NULL,                 /* ^graph */
391    NULL,                   /* ^print */    NULL,                 /* ^print */
392    NULL,                   /* ^punct */    NULL,                 /* ^punct */
393    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
394    (uschar *)"\\P{Xwd}",   /* ^word */    string_PXwd,          /* ^word */
395    NULL                    /* ^xdigit */    NULL                  /* ^xdigit */
396  };  };
397  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
398  #endif  #endif
399    
400  #define STRING(a)  # a  #define STRING(a)  # a
# Line 365  static const char error_texts[] = Line 453  static const char error_texts[] =
453    /* 30 */    /* 30 */
454    "unknown POSIX class name\0"    "unknown POSIX class name\0"
455    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
456    "this version of PCRE is not compiled with PCRE_UTF8 support\0"    "this version of PCRE is compiled without UTF support\0"
457    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
458    "character value in \\x{...} sequence is too large\0"    "character value in \\x{...} sequence is too large\0"
459    /* 35 */    /* 35 */
# Line 388  static const char error_texts[] = Line 476  static const char error_texts[] =
476    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
477    /* 50 */    /* 50 */
478    "repeated subpattern is too long\0"    /** DEAD **/    "repeated subpattern is too long\0"    /** DEAD **/
479    "octal value is greater than \\377 (not in UTF-8 mode)\0"    "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
480    "internal error: overran compiling workspace\0"    "internal error: overran compiling workspace\0"
481    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
482    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
483    /* 55 */    /* 55 */
484    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
485    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
486    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
487    "a numbered reference must not be zero\0"    "a numbered reference must not be zero\0"
# Line 407  static const char error_texts[] = Line 495  static const char error_texts[] =
495    /* 65 */    /* 65 */
496    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
497    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
498    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with Unicode property support\0"
499      "\\c must be followed by an ASCII character\0"
500      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
501      /* 70 */
502      "internal error: unknown opcode in find_fixedlength()\0"
503      "\\N is not supported in a class\0"
504      "too many forward references\0"
505      "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
506      "invalid UTF-16 string\0"
507      /* 75 */
508      "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
509      "character value in \\u.... sequence is too large\0"
510      "invalid UTF-32 string\0"
511    ;    ;
512    
513  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 426  For convenience, we use the same bit def Line 526  For convenience, we use the same bit def
526    
527  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
528    
529    /* Using a simple comparison for decimal numbers rather than a memory read
530    is much faster, and the resulting code is simpler (the compiler turns it
531    into a subtraction and unsigned comparison). */
532    
533    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
534    
535  #ifndef EBCDIC  #ifndef EBCDIC
536    
537  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
538  UTF-8 mode. */  UTF-8 mode. */
539    
540  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
541    {    {
542    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
543    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 470  static const unsigned char digitab[] = Line 576  static const unsigned char digitab[] =
576    
577  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
578    
579  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
580    {    {
581    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
582    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 505  static const unsigned char digitab[] = Line 611  static const unsigned char digitab[] =
611    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
612    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
613    
614  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
615    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
616    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
617    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 541  static const unsigned char ebcdic_charta Line 647  static const unsigned char ebcdic_charta
647  #endif  #endif
648    
649    
 /* Definition to allow mutual recursion */  
   
 static BOOL  
   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,  
     int *, int *, branch_chain *, compile_data *, int *);  
   
   
650    
651  /*************************************************  /*************************************************
652  *            Find an error text                  *  *            Find an error text                  *
# Line 576  return s; Line 675  return s;
675    
676    
677  /*************************************************  /*************************************************
678    *           Expand the workspace                 *
679    *************************************************/
680    
681    /* This function is called during the second compiling phase, if the number of
682    forward references fills the existing workspace, which is originally a block on
683    the stack. A larger block is obtained from malloc() unless the ultimate limit
684    has been reached or the increase will be rather small.
685    
686    Argument: pointer to the compile data block
687    Returns:  0 if all went well, else an error number
688    */
689    
690    static int
691    expand_workspace(compile_data *cd)
692    {
693    pcre_uchar *newspace;
694    int newsize = cd->workspace_size * 2;
695    
696    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
697    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
698        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
699     return ERR72;
700    
701    newspace = (PUBL(malloc))(IN_UCHARS(newsize));
702    if (newspace == NULL) return ERR21;
703    memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
704    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
705    if (cd->workspace_size > COMPILE_WORK_SIZE)
706      (PUBL(free))((void *)cd->start_workspace);
707    cd->start_workspace = newspace;
708    cd->workspace_size = newsize;
709    return 0;
710    }
711    
712    
713    
714    /*************************************************
715    *            Check for counted repeat            *
716    *************************************************/
717    
718    /* This function is called when a '{' is encountered in a place where it might
719    start a quantifier. It looks ahead to see if it really is a quantifier or not.
720    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
721    where the ddds are digits.
722    
723    Arguments:
724      p         pointer to the first char after '{'
725    
726    Returns:    TRUE or FALSE
727    */
728    
729    static BOOL
730    is_counted_repeat(const pcre_uchar *p)
731    {
732    if (!IS_DIGIT(*p)) return FALSE;
733    p++;
734    while (IS_DIGIT(*p)) p++;
735    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
736    
737    if (*p++ != CHAR_COMMA) return FALSE;
738    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
739    
740    if (!IS_DIGIT(*p)) return FALSE;
741    p++;
742    while (IS_DIGIT(*p)) p++;
743    
744    return (*p == CHAR_RIGHT_CURLY_BRACKET);
745    }
746    
747    
748    
749    /*************************************************
750  *            Handle escapes                      *  *            Handle escapes                      *
751  *************************************************/  *************************************************/
752    
753  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
754  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or 0 for a data character
755  encodes one of the more complicated things such as \d. A backreference to group  which will be placed in chptr. A backreference to group n is returned as
756  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When  negative n. When UTF-8 is enabled, a positive value greater than 255 may
757  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,  be returned in chptr.
758  ptr is pointing at the \. On exit, it is on the final character of the escape  On entry,ptr is pointing at the \. On exit, it is on the final character of the
759  sequence.  escape sequence.
760    
761  Arguments:  Arguments:
762    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
763      chptr          points to the data character
764    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
765    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
766    options        the options bits    options        the options bits
767    isclass        TRUE if inside a character class    isclass        TRUE if inside a character class
768    
769  Returns:         zero or positive => a data character  Returns:         zero => a data character
770                   negative => a special escape sequence                   positive => a special escape sequence
771                     negative => a back reference
772                   on error, errorcodeptr is set                   on error, errorcodeptr is set
773  */  */
774    
775  static int  static int
776  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
777    int options, BOOL isclass)    int bracount, int options, BOOL isclass)
778  {  {
779  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
780  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
781  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
782    pcre_uint32 c;
783    int escape = 0;
784    int i;
785    
786  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
787  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
# Line 619  in a table. A non-zero result is somethi Line 795  in a table. A non-zero result is somethi
795  Otherwise further processing may be required. */  Otherwise further processing may be required. */
796    
797  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
798  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
799  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if (c < CHAR_0 || c > CHAR_z) {}
800    else if ((i = escapes[c - CHAR_0]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
801    
802  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
803  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
804  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
805    else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
806  #endif  #endif
807    
808  /* Escapes that need further processing, or are illegal. */  /* Escapes that need further processing, or are illegal. */
809    
810  else  else
811    {    {
812    const uschar *oldptr;    const pcre_uchar *oldptr;
813    BOOL braced, negated;    BOOL braced, negated, overflow;
814      int s;
815    
816    switch (c)    switch (c)
817      {      {
# Line 641  else Line 820  else
820    
821      case CHAR_l:      case CHAR_l:
822      case CHAR_L:      case CHAR_L:
823        *errorcodeptr = ERR37;
824        break;
825    
826      case CHAR_u:      case CHAR_u:
827        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
828          {
829          /* In JavaScript, \u must be followed by four hexadecimal numbers.
830          Otherwise it is a lowercase u letter. */
831          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
832            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
833            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
834            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
835            {
836            c = 0;
837            for (i = 0; i < 4; ++i)
838              {
839              register pcre_uint32 cc = *(++ptr);
840    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
841              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
842              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
843    #else           /* EBCDIC coding */
844              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
845              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
846    #endif
847              }
848    
849    #if defined COMPILE_PCRE8
850            if (c > (utf ? 0x10ffff : 0xff))
851    #elif defined COMPILE_PCRE16
852            if (c > (utf ? 0x10ffff : 0xffff))
853    #elif defined COMPILE_PCRE32
854            if (utf && c > 0x10ffff)
855    #endif
856              {
857              *errorcodeptr = ERR76;
858              }
859            else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
860            }
861          }
862        else
863          *errorcodeptr = ERR37;
864        break;
865    
866      case CHAR_U:      case CHAR_U:
867      *errorcodeptr = ERR37;      /* In JavaScript, \U is an uppercase U letter. */
868        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
869      break;      break;
870    
871      /* \g must be followed by one of a number of specific things:      /* In a character class, \g is just a literal "g". Outside a character
872        class, \g must be followed by one of a number of specific things:
873    
874      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
875      backreference. If negative, it is a relative backreference. This is a Perl      backreference. If negative, it is a relative backreference. This is a Perl
# Line 660  else Line 883  else
883      (3) For Oniguruma compatibility we also support \g followed by a name or a      (3) For Oniguruma compatibility we also support \g followed by a name or a
884      number either in angle brackets or in single quotes. However, these are      number either in angle brackets or in single quotes. However, these are
885      (possibly recursive) subroutine calls, _not_ backreferences. Just return      (possibly recursive) subroutine calls, _not_ backreferences. Just return
886      the -ESC_g code (cf \k). */      the ESC_g code (cf \k). */
887    
888      case CHAR_g:      case CHAR_g:
889        if (isclass) break;
890      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
891        {        {
892        c = -ESC_g;        escape = ESC_g;
893        break;        break;
894        }        }
895    
# Line 673  else Line 897  else
897    
898      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
899        {        {
900        const uschar *p;        const pcre_uchar *p;
901        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
902          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
903        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
904          {          {
905          c = -ESC_k;          escape = ESC_k;
906          break;          break;
907          }          }
908        braced = TRUE;        braced = TRUE;
# Line 693  else Line 917  else
917        }        }
918      else negated = FALSE;      else negated = FALSE;
919    
920      c = 0;      /* The integer range is limited by the machine's int representation. */
921      while ((digitab[ptr[1]] & ctype_digit) != 0)      s = 0;
922        c = c * 10 + *(++ptr) - CHAR_0;      overflow = FALSE;
923        while (IS_DIGIT(ptr[1]))
924      if (c < 0)   /* Integer overflow */        {
925          if (s > INT_MAX / 10 - 1) /* Integer overflow */
926            {
927            overflow = TRUE;
928            break;
929            }
930          s = s * 10 + (int)(*(++ptr) - CHAR_0);
931          }
932        if (overflow) /* Integer overflow */
933        {        {
934          while (IS_DIGIT(ptr[1]))
935            ptr++;
936        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
937        break;        break;
938        }        }
# Line 709  else Line 943  else
943        break;        break;
944        }        }
945    
946      if (c == 0)      if (s == 0)
947        {        {
948        *errorcodeptr = ERR58;        *errorcodeptr = ERR58;
949        break;        break;
# Line 717  else Line 951  else
951    
952      if (negated)      if (negated)
953        {        {
954        if (c > bracount)        if (s > bracount)
955          {          {
956          *errorcodeptr = ERR15;          *errorcodeptr = ERR15;
957          break;          break;
958          }          }
959        c = bracount - (c - 1);        s = bracount - (s - 1);
960        }        }
961    
962      c = -(ESC_REF + c);      escape = -s;
963      break;      break;
964    
965      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
# Line 746  else Line 980  else
980      if (!isclass)      if (!isclass)
981        {        {
982        oldptr = ptr;        oldptr = ptr;
983        c -= CHAR_0;        /* The integer range is limited by the machine's int representation. */
984        while ((digitab[ptr[1]] & ctype_digit) != 0)        s = (int)(c -CHAR_0);
985          c = c * 10 + *(++ptr) - CHAR_0;        overflow = FALSE;
986        if (c < 0)    /* Integer overflow */        while (IS_DIGIT(ptr[1]))
987            {
988            if (s > INT_MAX / 10 - 1) /* Integer overflow */
989              {
990              overflow = TRUE;
991              break;
992              }
993            s = s * 10 + (int)(*(++ptr) - CHAR_0);
994            }
995          if (overflow) /* Integer overflow */
996          {          {
997            while (IS_DIGIT(ptr[1]))
998              ptr++;
999          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1000          break;          break;
1001          }          }
1002        if (c < 10 || c <= bracount)        if (s < 10 || s <= bracount)
1003          {          {
1004          c = -(ESC_REF + c);          escape = -s;
1005          break;          break;
1006          }          }
1007        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
# Line 776  else Line 1021  else
1021      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1022      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
1023      significant 8 bits of octal numbers (I think this is what early Perls used      significant 8 bits of octal numbers (I think this is what early Perls used
1024      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1025      than 3 octal digits. */      but no more than 3 octal digits. */
1026    
1027      case CHAR_0:      case CHAR_0:
1028      c -= CHAR_0;      c -= CHAR_0;
1029      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1030          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
1031      if (!utf8 && c > 255) *errorcodeptr = ERR51;  #ifdef COMPILE_PCRE8
1032        if (!utf && c > 0xff) *errorcodeptr = ERR51;
1033    #endif
1034      break;      break;
1035    
1036      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
1037      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1038      treated as a data character. */      If not, { is treated as a data character. */
1039    
1040      case CHAR_x:      case CHAR_x:
1041        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1042          {
1043          /* In JavaScript, \x must be followed by two hexadecimal digits.
1044          Otherwise it is a lowercase x letter. */
1045          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1046            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1047            {
1048            c = 0;
1049            for (i = 0; i < 2; ++i)
1050              {
1051              register pcre_uint32 cc = *(++ptr);
1052    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1053              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1054              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1055    #else           /* EBCDIC coding */
1056              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1057              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1058    #endif
1059              }
1060            }
1061          break;
1062          }
1063    
1064      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1065        {        {
1066        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
       int count = 0;  
1067    
1068        c = 0;        c = 0;
1069        while ((digitab[*pt] & ctype_xdigit) != 0)        overflow = FALSE;
1070          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1071          {          {
1072          register int cc = *pt++;          register pcre_uint32 cc = *pt++;
1073          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1074          count++;  
1075    #ifdef COMPILE_PCRE32
1076            if (c >= 0x10000000l) { overflow = TRUE; break; }
1077    #endif
1078    
1079  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1080          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
# Line 810  else Line 1083  else
1083          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1084          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1085  #endif  #endif
1086    
1087    #if defined COMPILE_PCRE8
1088            if (c > (utf ? 0x10ffff : 0xff)) { overflow = TRUE; break; }
1089    #elif defined COMPILE_PCRE16
1090            if (c > (utf ? 0x10ffff : 0xffff)) { overflow = TRUE; break; }
1091    #elif defined COMPILE_PCRE32
1092            if (utf && c > 0x10ffff) { overflow = TRUE; break; }
1093    #endif
1094            }
1095    
1096          if (overflow)
1097            {
1098            while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1099            *errorcodeptr = ERR34;
1100          }          }
1101    
1102        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1103          {          {
1104          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1105          ptr = pt;          ptr = pt;
1106          break;          break;
1107          }          }
# Line 826  else Line 1113  else
1113      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1114    
1115      c = 0;      c = 0;
1116      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1117        {        {
1118        int cc;                                  /* Some compilers don't like */        pcre_uint32 cc;                          /* Some compilers don't like */
1119        cc = *(++ptr);                           /* ++ in initializers */        cc = *(++ptr);                           /* ++ in initializers */
1120  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1121        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
# Line 841  else Line 1128  else
1128      break;      break;
1129    
1130      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1131      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
1132        coding is ASCII-specific, but then the whole concept of \cx is
1133      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1134    
1135      case CHAR_c:      case CHAR_c:
# Line 851  else Line 1139  else
1139        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
1140        break;        break;
1141        }        }
1142    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1143  #ifndef EBCDIC  /* ASCII/UTF-8 coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
1144          {
1145          *errorcodeptr = ERR68;
1146          break;
1147          }
1148      if (c >= CHAR_a && c <= CHAR_z) c -= 32;      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1149      c ^= 0x40;      c ^= 0x40;
1150  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
1151      if (c >= CHAR_a && c <= CHAR_z) c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1152      c ^= 0xC0;      c ^= 0xC0;
1153  #endif  #endif
# Line 879  else Line 1171  else
1171    }    }
1172    
1173  /* Perl supports \N{name} for character names, as well as plain \N for "not  /* Perl supports \N{name} for character names, as well as plain \N for "not
1174  newline". PCRE does not support \N{name}. */  newline". PCRE does not support \N{name}. However, it does support
1175    quantification such as \N{2,3}. */
1176    
1177  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)  if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1178         !is_counted_repeat(ptr+2))
1179    *errorcodeptr = ERR37;    *errorcodeptr = ERR37;
1180    
1181  /* If PCRE_UCP is set, we change the values for \d etc. */  /* If PCRE_UCP is set, we change the values for \d etc. */
1182    
1183  if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)  if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1184    c -= (ESC_DU - ESC_D);    escape += (ESC_DU - ESC_D);
1185    
1186  /* Set the pointer to the final character before returning. */  /* Set the pointer to the final character before returning. */
1187    
1188  *ptrptr = ptr;  *ptrptr = ptr;
1189  return c;  *chptr = c;
1190    return escape;
1191  }  }
1192    
   
   
1193  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1194  /*************************************************  /*************************************************
1195  *               Handle \P and \p                 *  *               Handle \P and \p                 *
# Line 917  Returns:         type value from ucp_typ Line 1210  Returns:         type value from ucp_typ
1210  */  */
1211    
1212  static int  static int
1213  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1214  {  {
1215  int c, i, bot, top;  pcre_uchar c;
1216  const uschar *ptr = *ptrptr;  int i, bot, top;
1217  char name[32];  const pcre_uchar *ptr = *ptrptr;
1218    pcre_uchar name[32];
1219    
1220  c = *(++ptr);  c = *(++ptr);
1221  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 938  if (c == CHAR_LEFT_CURLY_BRACKET) Line 1232  if (c == CHAR_LEFT_CURLY_BRACKET)
1232      *negptr = TRUE;      *negptr = TRUE;
1233      ptr++;      ptr++;
1234      }      }
1235    for (i = 0; i < (int)sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1236      {      {
1237      c = *(++ptr);      c = *(++ptr);
1238      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 962  else Line 1256  else
1256  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1257    
1258  bot = 0;  bot = 0;
1259  top = _pcre_utt_size;  top = PRIV(utt_size);
1260    
1261  while (bot < top)  while (bot < top)
1262    {    {
1263      int r;
1264    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1265    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1266    if (c == 0)    if (r == 0)
1267      {      {
1268      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1269      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1270      }      }
1271    if (c > 0) bot = i + 1; else top = i;    if (r > 0) bot = i + 1; else top = i;
1272    }    }
1273    
1274  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
# Line 991  return -1; Line 1286  return -1;
1286    
1287    
1288  /*************************************************  /*************************************************
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if (*p++ != CHAR_COMMA) return FALSE;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
   
 return (*p == CHAR_RIGHT_CURLY_BRACKET);  
 }  
   
   
   
 /*************************************************  
1289  *         Read repeat counts                     *  *         Read repeat counts                     *
1290  *************************************************/  *************************************************/
1291    
# Line 1042  Returns:         pointer to '}' on succe Line 1304  Returns:         pointer to '}' on succe
1304                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1305  */  */
1306    
1307  static const uschar *  static const pcre_uchar *
1308  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1309  {  {
1310  int min = 0;  int min = 0;
1311  int max = -1;  int max = -1;
# Line 1051  int max = -1; Line 1313  int max = -1;
1313  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1314  an integer overflow. */  an integer overflow. */
1315    
1316  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1317  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1318    {    {
1319    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 1066  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1328  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1328    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1329      {      {
1330      max = 0;      max = 0;
1331      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1332      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1333        {        {
1334        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 1099  top-level call starts at the beginning o Line 1361  top-level call starts at the beginning o
1361  start at a parenthesis. It scans along a pattern's text looking for capturing  start at a parenthesis. It scans along a pattern's text looking for capturing
1362  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1363  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1364  returns when it reaches a given numbered subpattern. We know that if (?P< is  returns when it reaches a given numbered subpattern. Recursion is used to keep
1365  encountered, the name will be terminated by '>' because that is checked in the  track of subpatterns that reset the capturing group numbers - the (?| feature.
1366  first pass. Recursion is used to keep track of subpatterns that reset the  
1367  capturing group numbers - the (?| feature.  This function was originally called only from the second pass, in which we know
1368    that if (?< or (?' or (?P< is encountered, the name will be correctly
1369    terminated because that is checked in the first pass. There is now one call to
1370    this function in the first pass, to check for a recursive back reference by
1371    name (so that we can make the whole group atomic). In this case, we need check
1372    only up to the current position in the pattern, and that is still OK because
1373    any previous occurrences will have been checked. To make this work, the test
1374    for "end of pattern" is a check against cd->end_pattern in the main loop,
1375    instead of looking for a binary zero. This means that the special first-pass
1376    call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1377    processing items within the loop are OK, because afterwards the main loop will
1378    terminate.)
1379    
1380  Arguments:  Arguments:
1381    ptrptr       address of the current character pointer (updated)    ptrptr       address of the current character pointer (updated)
# Line 1110  Arguments: Line 1383  Arguments:
1383    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1384    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1385    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1386      utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode
1387    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1388    
1389  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1390  */  */
1391    
1392  static int  static int
1393  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1394    BOOL xmode, int *count)    BOOL xmode, BOOL utf, int *count)
1395  {  {
1396  uschar *ptr = *ptrptr;  pcre_uchar *ptr = *ptrptr;
1397  int start_count = *count;  int start_count = *count;
1398  int hwm_count = start_count;  int hwm_count = start_count;
1399  BOOL dup_parens = FALSE;  BOOL dup_parens = FALSE;
# Line 1130  dealing with. The very first call may no Line 1404  dealing with. The very first call may no
1404  if (ptr[0] == CHAR_LEFT_PARENTHESIS)  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1405    {    {
1406    /* Handle specials such as (*SKIP) or (*UTF8) etc. */    /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1407    
1408    if (ptr[1] == CHAR_ASTERISK) ptr += 2;    if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1409    
1410    /* Handle a normal, unnamed capturing parenthesis. */    /* Handle a normal, unnamed capturing parenthesis. */
1411    
1412    else if (ptr[1] != CHAR_QUESTION_MARK)    else if (ptr[1] != CHAR_QUESTION_MARK)
# Line 1150  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1424  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1424      ptr += 3;      ptr += 3;
1425      dup_parens = TRUE;      dup_parens = TRUE;
1426      }      }
1427    
1428    /* Handle comments; all characters are allowed until a ket is reached. */    /* Handle comments; all characters are allowed until a ket is reached. */
1429    
1430    else if (ptr[2] == CHAR_NUMBER_SIGN)    else if (ptr[2] == CHAR_NUMBER_SIGN)
1431      {      {
1432      for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;      for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1433      goto FAIL_EXIT;      goto FAIL_EXIT;
1434      }      }
1435    
1436    /* Handle a condition. If it is an assertion, just carry on so that it    /* Handle a condition. If it is an assertion, just carry on so that it
1437    is processed as normal. If not, skip to the closing parenthesis of the    is processed as normal. If not, skip to the closing parenthesis of the
# Line 1185  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1459  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1459      if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&      if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1460          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1461        {        {
1462        int term;        pcre_uchar term;
1463        const uschar *thisname;        const pcre_uchar *thisname;
1464        *count += 1;        *count += 1;
1465        if (name == NULL && *count == lorn) return *count;        if (name == NULL && *count == lorn) return *count;
1466        term = *ptr++;        term = *ptr++;
1467        if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;        if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1468        thisname = ptr;        thisname = ptr;
1469        while (*ptr != term) ptr++;        while (*ptr != term) ptr++;
1470        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == (int)(ptr - thisname) &&
1471            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            STRNCMP_UC_UC(name, thisname, (unsigned int)lorn) == 0)
1472          return *count;          return *count;
1473        term++;        term++;
1474        }        }
# Line 1202  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1476  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1476    }    }
1477    
1478  /* Past any initial parenthesis handling, scan for parentheses or vertical  /* Past any initial parenthesis handling, scan for parentheses or vertical
1479  bars. */  bars. Stop if we get to cd->end_pattern. Note that this is important for the
1480    first-pass call when this value is temporarily adjusted to stop at the current
1481    position. So DO NOT change this to a test for binary zero. */
1482    
1483  for (; *ptr != 0; ptr++)  for (; ptr < cd->end_pattern; ptr++)
1484    {    {
1485    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1486    
# Line 1235  for (; *ptr != 0; ptr++) Line 1511  for (; *ptr != 0; ptr++)
1511          {          {
1512          if (ptr[2] == CHAR_E)          if (ptr[2] == CHAR_E)
1513            ptr+= 2;            ptr+= 2;
1514          else if (strncmp((const char *)ptr+2,          else if (STRNCMP_UC_C8(ptr + 2,
1515                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1516            ptr += 4;            ptr += 4;
1517          else          else
# Line 1278  for (; *ptr != 0; ptr++) Line 1554  for (; *ptr != 0; ptr++)
1554    
1555    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1556      {      {
1557      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      ptr++;
1558        while (*ptr != 0)
1559          {
1560          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1561          ptr++;
1562    #ifdef SUPPORT_UTF
1563          if (utf) FORWARDCHAR(ptr);
1564    #endif
1565          }
1566      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1567      continue;      continue;
1568      }      }
# Line 1287  for (; *ptr != 0; ptr++) Line 1571  for (; *ptr != 0; ptr++)
1571    
1572    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1573      {      {
1574      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1575      if (rc > 0) return rc;      if (rc > 0) return rc;
1576      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1577      }      }
# Line 1295  for (; *ptr != 0; ptr++) Line 1579  for (; *ptr != 0; ptr++)
1579    else if (*ptr == CHAR_RIGHT_PARENTHESIS)    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1580      {      {
1581      if (dup_parens && *count < hwm_count) *count = hwm_count;      if (dup_parens && *count < hwm_count) *count = hwm_count;
1582      goto FAIL_EXIT;      goto FAIL_EXIT;
1583      }      }
1584    
1585    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
# Line 1333  Arguments: Line 1617  Arguments:
1617    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1618    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1619    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1620      utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode
1621    
1622  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1623  */  */
1624    
1625  static int  static int
1626  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)  find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1627      BOOL utf)
1628  {  {
1629  uschar *ptr = (uschar *)cd->start_pattern;  pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1630  int count = 0;  int count = 0;
1631  int rc;  int rc;
1632    
# Line 1351  matching closing parens. That is why we Line 1637  matching closing parens. That is why we
1637    
1638  for (;;)  for (;;)
1639    {    {
1640    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1641    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1642    }    }
1643    
# Line 1367  return rc; Line 1653  return rc;
1653    
1654  /* This is called by several functions that scan a compiled expression looking  /* This is called by several functions that scan a compiled expression looking
1655  for a fixed first character, or an anchoring op code etc. It skips over things  for a fixed first character, or an anchoring op code etc. It skips over things
1656  that do not influence this. For some calls, a change of option is important.  that do not influence this. For some calls, it makes sense to skip negative
1657  For some calls, it makes sense to skip negative forward and all backward  forward and all backward assertions, and also the \b assertion; for others it
1658  assertions, and also the \b assertion; for others it does not.  does not.
1659    
1660  Arguments:  Arguments:
1661    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1662    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1663    
1664  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1665  */  */
1666    
1667  static const uschar*  static const pcre_uchar*
1668  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const pcre_uchar *code, BOOL skipassert)
   BOOL skipassert)  
1669  {  {
1670  for (;;)  for (;;)
1671    {    {
1672    switch ((int)*code)    switch ((int)*code)
1673      {      {
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
       *options = (int)code[1];  
     code += 2;  
     break;  
   
1674      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1675      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1676      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1677      if (!skipassert) return code;      if (!skipassert) return code;
1678      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1679      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1680      break;      break;
1681    
1682      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1414  for (;;) Line 1690  for (;;)
1690      case OP_RREF:      case OP_RREF:
1691      case OP_NRREF:      case OP_NRREF:
1692      case OP_DEF:      case OP_DEF:
1693      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1694      break;      break;
1695    
1696      default:      default:
# Line 1444  and doing the check at the end; a flag s Line 1720  and doing the check at the end; a flag s
1720    
1721  Arguments:  Arguments:
1722    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1723    options  the compiling options    utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1724    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1725    cd       the "compile data" structure    cd       the "compile data" structure
1726    
1727  Returns:   the fixed length,  Returns:   the fixed length,
1728               or -1 if there is no fixed length,               or -1 if there is no fixed length,
1729               or -2 if \C was encountered               or -2 if \C was encountered (in UTF-8 mode only)
1730               or -3 if an OP_RECURSE item was encountered and atend is FALSE               or -3 if an OP_RECURSE item was encountered and atend is FALSE
1731                 or -4 if an unknown opcode was encountered (internal error)
1732  */  */
1733    
1734  static int  static int
1735  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1736  {  {
1737  int length = -1;  int length = -1;
1738    
1739  register int branchlength = 0;  register int branchlength = 0;
1740  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1741    
1742  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1743  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1468  branch, check the length against that of Line 1745  branch, check the length against that of
1745  for (;;)  for (;;)
1746    {    {
1747    int d;    int d;
1748    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1749    register int op = *cc;    register pcre_uchar op = *cc;
1750    
1751    switch (op)    switch (op)
1752      {      {
1753        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1754        OP_BRA (normal non-capturing bracket) because the other variants of these
1755        opcodes are all concerned with unlimited repeated groups, which of course
1756        are not of fixed length. */
1757    
1758      case OP_CBRA:      case OP_CBRA:
1759      case OP_BRA:      case OP_BRA:
1760      case OP_ONCE:      case OP_ONCE:
1761        case OP_ONCE_NC:
1762      case OP_COND:      case OP_COND:
1763      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1764      if (d < 0) return d;      if (d < 0) return d;
1765      branchlength += d;      branchlength += d;
1766      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1767      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1768      break;      break;
1769    
1770      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1771      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1772      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1773        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1774        because they all imply an unlimited repeat. */
1775    
1776      case OP_ALT:      case OP_ALT:
1777      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1778      case OP_END:      case OP_END:
1779        case OP_ACCEPT:
1780        case OP_ASSERT_ACCEPT:
1781      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1782        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1783      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1505  for (;;) Line 1791  for (;;)
1791    
1792      case OP_RECURSE:      case OP_RECURSE:
1793      if (!atend) return -3;      if (!atend) return -3;
1794      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1795      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1796      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1797      d = find_fixedlength(cs + 2, options, atend, cd);      d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1798      if (d < 0) return d;      if (d < 0) return d;
1799      branchlength += d;      branchlength += d;
1800      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1521  for (;;) Line 1807  for (;;)
1807      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1808      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1809      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1810      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1811        break;
1812    
1813      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1814    
1815      case OP_REVERSE:      case OP_MARK:
1816        case OP_PRUNE_ARG:
1817        case OP_SKIP_ARG:
1818        case OP_THEN_ARG:
1819        cc += cc[1] + PRIV(OP_lengths)[*cc];
1820        break;
1821    
1822        case OP_CALLOUT:
1823        case OP_CIRC:
1824        case OP_CIRCM:
1825        case OP_CLOSE:
1826        case OP_COMMIT:
1827      case OP_CREF:      case OP_CREF:
     case OP_NCREF:  
     case OP_RREF:  
     case OP_NRREF:  
1828      case OP_DEF:      case OP_DEF:
1829      case OP_OPT:      case OP_DOLL:
1830      case OP_CALLOUT:      case OP_DOLLM:
     case OP_SOD:  
     case OP_SOM:  
     case OP_SET_SOM:  
1831      case OP_EOD:      case OP_EOD:
1832      case OP_EODN:      case OP_EODN:
1833      case OP_CIRC:      case OP_FAIL:
1834      case OP_DOLL:      case OP_NCREF:
1835        case OP_NRREF:
1836      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1837        case OP_PRUNE:
1838        case OP_REVERSE:
1839        case OP_RREF:
1840        case OP_SET_SOM:
1841        case OP_SKIP:
1842        case OP_SOD:
1843        case OP_SOM:
1844        case OP_THEN:
1845      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1846      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1847      break;      break;
1848    
1849      /* Handle literal characters */      /* Handle literal characters */
1850    
1851      case OP_CHAR:      case OP_CHAR:
1852      case OP_CHARNC:      case OP_CHARI:
1853      case OP_NOT:      case OP_NOT:
1854        case OP_NOTI:
1855      branchlength++;      branchlength++;
1856      cc += 2;      cc += 2;
1857  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1858      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1859  #endif  #endif
1860      break;      break;
1861    
# Line 1562  for (;;) Line 1863  for (;;)
1863      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1864    
1865      case OP_EXACT:      case OP_EXACT:
1866      branchlength += GET2(cc,1);      case OP_EXACTI:
1867      cc += 4;      case OP_NOTEXACT:
1868  #ifdef SUPPORT_UTF8      case OP_NOTEXACTI:
1869      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      branchlength += (int)GET2(cc,1);
1870        cc += _pcre_utf8_table4[cc[-1] & 0x3f];      cc += 2 + IMM2_SIZE;
1871    #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1872        if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1873  #endif  #endif
1874      break;      break;
1875    
1876      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1877      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1878      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1879      cc += 4;        cc += 2;
1880        cc += 1 + IMM2_SIZE + 1;
1881      break;      break;
1882    
1883      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1583  for (;;) Line 1887  for (;;)
1887      cc += 2;      cc += 2;
1888      /* Fall through */      /* Fall through */
1889    
1890        case OP_HSPACE:
1891        case OP_VSPACE:
1892        case OP_NOT_HSPACE:
1893        case OP_NOT_VSPACE:
1894      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1895      case OP_DIGIT:      case OP_DIGIT:
1896      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1595  for (;;) Line 1903  for (;;)
1903      cc++;      cc++;
1904      break;      break;
1905    
1906      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1907        otherwise \C is coded as OP_ALLANY. */
1908    
1909      case OP_ANYBYTE:      case OP_ANYBYTE:
1910      return -2;      return -2;
1911    
1912      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1913    
1914  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1915      case OP_XCLASS:      case OP_XCLASS:
1916      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1917      /* Fall through */      /* Fall through */
1918  #endif  #endif
1919    
1920      case OP_CLASS:      case OP_CLASS:
1921      case OP_NCLASS:      case OP_NCLASS:
1922      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1923    
1924      switch (*cc)      switch (*cc)
1925        {        {
1926          case OP_CRPLUS:
1927          case OP_CRMINPLUS:
1928        case OP_CRSTAR:        case OP_CRSTAR:
1929        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1930        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1622  for (;;) Line 1933  for (;;)
1933    
1934        case OP_CRRANGE:        case OP_CRRANGE:
1935        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1936        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1937        branchlength += GET2(cc,1);        branchlength += (int)GET2(cc,1);
1938        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1939        break;        break;
1940    
1941        default:        default:
# Line 1634  for (;;) Line 1945  for (;;)
1945    
1946      /* Anything else is variable length */      /* Anything else is variable length */
1947    
1948      default:      case OP_ANYNL:
1949        case OP_BRAMINZERO:
1950        case OP_BRAPOS:
1951        case OP_BRAPOSZERO:
1952        case OP_BRAZERO:
1953        case OP_CBRAPOS:
1954        case OP_EXTUNI:
1955        case OP_KETRMAX:
1956        case OP_KETRMIN:
1957        case OP_KETRPOS:
1958        case OP_MINPLUS:
1959        case OP_MINPLUSI:
1960        case OP_MINQUERY:
1961        case OP_MINQUERYI:
1962        case OP_MINSTAR:
1963        case OP_MINSTARI:
1964        case OP_MINUPTO:
1965        case OP_MINUPTOI:
1966        case OP_NOTMINPLUS:
1967        case OP_NOTMINPLUSI:
1968        case OP_NOTMINQUERY:
1969        case OP_NOTMINQUERYI:
1970        case OP_NOTMINSTAR:
1971        case OP_NOTMINSTARI:
1972        case OP_NOTMINUPTO:
1973        case OP_NOTMINUPTOI:
1974        case OP_NOTPLUS:
1975        case OP_NOTPLUSI:
1976        case OP_NOTPOSPLUS:
1977        case OP_NOTPOSPLUSI:
1978        case OP_NOTPOSQUERY:
1979        case OP_NOTPOSQUERYI:
1980        case OP_NOTPOSSTAR:
1981        case OP_NOTPOSSTARI:
1982        case OP_NOTPOSUPTO:
1983        case OP_NOTPOSUPTOI:
1984        case OP_NOTQUERY:
1985        case OP_NOTQUERYI:
1986        case OP_NOTSTAR:
1987        case OP_NOTSTARI:
1988        case OP_NOTUPTO:
1989        case OP_NOTUPTOI:
1990        case OP_PLUS:
1991        case OP_PLUSI:
1992        case OP_POSPLUS:
1993        case OP_POSPLUSI:
1994        case OP_POSQUERY:
1995        case OP_POSQUERYI:
1996        case OP_POSSTAR:
1997        case OP_POSSTARI:
1998        case OP_POSUPTO:
1999        case OP_POSUPTOI:
2000        case OP_QUERY:
2001        case OP_QUERYI:
2002        case OP_REF:
2003        case OP_REFI:
2004        case OP_SBRA:
2005        case OP_SBRAPOS:
2006        case OP_SCBRA:
2007        case OP_SCBRAPOS:
2008        case OP_SCOND:
2009        case OP_SKIPZERO:
2010        case OP_STAR:
2011        case OP_STARI:
2012        case OP_TYPEMINPLUS:
2013        case OP_TYPEMINQUERY:
2014        case OP_TYPEMINSTAR:
2015        case OP_TYPEMINUPTO:
2016        case OP_TYPEPLUS:
2017        case OP_TYPEPOSPLUS:
2018        case OP_TYPEPOSQUERY:
2019        case OP_TYPEPOSSTAR:
2020        case OP_TYPEPOSUPTO:
2021        case OP_TYPEQUERY:
2022        case OP_TYPESTAR:
2023        case OP_TYPEUPTO:
2024        case OP_UPTO:
2025        case OP_UPTOI:
2026      return -1;      return -1;
2027    
2028        /* Catch unrecognized opcodes so that when new ones are added they
2029        are not forgotten, as has happened in the past. */
2030    
2031        default:
2032        return -4;
2033      }      }
2034    }    }
2035  /* Control never gets here */  /* Control never gets here */
# Line 1656  length. Line 2050  length.
2050    
2051  Arguments:  Arguments:
2052    code        points to start of expression    code        points to start of expression
2053    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2054    number      the required bracket number or negative to find a lookbehind    number      the required bracket number or negative to find a lookbehind
2055    
2056  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2057  */  */
2058    
2059  const uschar *  const pcre_uchar *
2060  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2061  {  {
2062  for (;;)  for (;;)
2063    {    {
2064    register int c = *code;    register pcre_uchar c = *code;
2065    
2066    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
2067    
2068    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1680  for (;;) Line 2075  for (;;)
2075    
2076    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
2077      {      {
2078      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
2079      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2080      }      }
2081    
2082    /* Handle capturing bracket */    /* Handle capturing bracket */
2083    
2084    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
2085               c == OP_CBRAPOS || c == OP_SCBRAPOS)
2086      {      {
2087      int n = GET2(code, 1+LINK_SIZE);      int n = (int)GET2(code, 1+LINK_SIZE);
2088      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2089      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2090      }      }
2091    
2092    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
# Line 1718  for (;;) Line 2114  for (;;)
2114        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2115        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2116        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2117        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2118            code += 2;
2119        break;        break;
2120    
2121        case OP_MARK:        case OP_MARK:
2122        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
2123        case OP_SKIP_ARG:        case OP_SKIP_ARG:
2124          code += code[1];
2125          break;
2126    
2127        case OP_THEN_ARG:        case OP_THEN_ARG:
2128        code += code[1];        code += code[1];
2129        break;        break;
# Line 1731  for (;;) Line 2131  for (;;)
2131    
2132      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2133    
2134      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2135    
2136    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2137    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2138    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2139    
2140  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2141      if (utf8) switch(c)      if (utf) switch(c)
2142        {        {
2143        case OP_CHAR:        case OP_CHAR:
2144        case OP_CHARNC:        case OP_CHARI:
2145        case OP_EXACT:        case OP_EXACT:
2146          case OP_EXACTI:
2147        case OP_UPTO:        case OP_UPTO:
2148          case OP_UPTOI:
2149        case OP_MINUPTO:        case OP_MINUPTO:
2150          case OP_MINUPTOI:
2151        case OP_POSUPTO:        case OP_POSUPTO:
2152          case OP_POSUPTOI:
2153        case OP_STAR:        case OP_STAR:
2154          case OP_STARI:
2155        case OP_MINSTAR:        case OP_MINSTAR:
2156          case OP_MINSTARI:
2157        case OP_POSSTAR:        case OP_POSSTAR:
2158          case OP_POSSTARI:
2159        case OP_PLUS:        case OP_PLUS:
2160          case OP_PLUSI:
2161        case OP_MINPLUS:        case OP_MINPLUS:
2162          case OP_MINPLUSI:
2163        case OP_POSPLUS:        case OP_POSPLUS:
2164          case OP_POSPLUSI:
2165        case OP_QUERY:        case OP_QUERY:
2166          case OP_QUERYI:
2167        case OP_MINQUERY:        case OP_MINQUERY:
2168          case OP_MINQUERYI:
2169        case OP_POSQUERY:        case OP_POSQUERY:
2170        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2171          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2172        break;        break;
2173        }        }
2174  #else  #else
2175      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2176  #endif  #endif
2177      }      }
2178    }    }
# Line 1776  instance of OP_RECURSE. Line 2189  instance of OP_RECURSE.
2189    
2190  Arguments:  Arguments:
2191    code        points to start of expression    code        points to start of expression
2192    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2193    
2194  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2195  */  */
2196    
2197  static const uschar *  static const pcre_uchar *
2198  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2199  {  {
2200  for (;;)  for (;;)
2201    {    {
2202    register int c = *code;    register pcre_uchar c = *code;
2203    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
2204    if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
2205    
# Line 1821  for (;;) Line 2234  for (;;)
2234        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2235        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2236        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2237        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2238            code += 2;
2239        break;        break;
2240    
2241        case OP_MARK:        case OP_MARK:
2242        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
2243        case OP_SKIP_ARG:        case OP_SKIP_ARG:
2244          code += code[1];
2245          break;
2246    
2247        case OP_THEN_ARG:        case OP_THEN_ARG:
2248        code += code[1];        code += code[1];
2249        break;        break;
# Line 1834  for (;;) Line 2251  for (;;)
2251    
2252      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2253    
2254      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2255    
2256      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2257      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
2258      to arrange to skip the extra bytes. */      to arrange to skip the extra bytes. */
2259    
2260  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2261      if (utf8) switch(c)      if (utf) switch(c)
2262        {        {
2263        case OP_CHAR:        case OP_CHAR:
2264        case OP_CHARNC:        case OP_CHARI:
2265          case OP_NOT:
2266          case OP_NOTI:
2267        case OP_EXACT:        case OP_EXACT:
2268          case OP_EXACTI:
2269          case OP_NOTEXACT:
2270          case OP_NOTEXACTI:
2271        case OP_UPTO:        case OP_UPTO:
2272          case OP_UPTOI:
2273          case OP_NOTUPTO:
2274          case OP_NOTUPTOI:
2275        case OP_MINUPTO:        case OP_MINUPTO:
2276          case OP_MINUPTOI:
2277          case OP_NOTMINUPTO:
2278          case OP_NOTMINUPTOI:
2279        case OP_POSUPTO:        case OP_POSUPTO:
2280          case OP_POSUPTOI:
2281          case OP_NOTPOSUPTO:
2282          case OP_NOTPOSUPTOI:
2283        case OP_STAR:        case OP_STAR:
2284          case OP_STARI:
2285          case OP_NOTSTAR:
2286          case OP_NOTSTARI:
2287        case OP_MINSTAR:        case OP_MINSTAR:
2288          case OP_MINSTARI:
2289          case OP_NOTMINSTAR:
2290          case OP_NOTMINSTARI:
2291        case OP_POSSTAR:        case OP_POSSTAR:
2292          case OP_POSSTARI:
2293          case OP_NOTPOSSTAR:
2294          case OP_NOTPOSSTARI:
2295        case OP_PLUS:        case OP_PLUS:
2296          case OP_PLUSI:
2297          case OP_NOTPLUS:
2298          case OP_NOTPLUSI:
2299        case OP_MINPLUS:        case OP_MINPLUS:
2300          case OP_MINPLUSI:
2301          case OP_NOTMINPLUS:
2302          case OP_NOTMINPLUSI:
2303        case OP_POSPLUS:        case OP_POSPLUS:
2304          case OP_POSPLUSI:
2305          case OP_NOTPOSPLUS:
2306          case OP_NOTPOSPLUSI:
2307        case OP_QUERY:        case OP_QUERY:
2308          case OP_QUERYI:
2309          case OP_NOTQUERY:
2310          case OP_NOTQUERYI:
2311        case OP_MINQUERY:        case OP_MINQUERY:
2312          case OP_MINQUERYI:
2313          case OP_NOTMINQUERY:
2314          case OP_NOTMINQUERYI:
2315        case OP_POSQUERY:        case OP_POSQUERY:
2316        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2317          case OP_NOTPOSQUERY:
2318          case OP_NOTPOSQUERYI:
2319          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2320        break;        break;
2321        }        }
2322  #else  #else
2323      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2324  #endif  #endif
2325      }      }
2326    }    }
# Line 1885  bracket whose current branch will alread Line 2343  bracket whose current branch will alread
2343  Arguments:  Arguments:
2344    code        points to start of search    code        points to start of search
2345    endcode     points to where to stop    endcode     points to where to stop
2346    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2347    cd          contains pointers to tables etc.    cd          contains pointers to tables etc.
2348    
2349  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2350  */  */
2351    
2352  static BOOL  static BOOL
2353  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2354    compile_data *cd)    BOOL utf, compile_data *cd)
2355  {  {
2356  register int c;  register pcre_uchar c;
2357  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2358       code < endcode;       code < endcode;
2359       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2360    {    {
2361    const uschar *ccode;    const pcre_uchar *ccode;
2362    
2363    c = *code;    c = *code;
2364    
# Line 1914  for (code = first_significant_code(code Line 2372  for (code = first_significant_code(code
2372      continue;      continue;
2373      }      }
2374    
   /* Groups with zero repeats can of course be empty; skip them. */  
   
   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)  
     {  
     code += _pcre_OP_lengths[c];  
     do code += GET(code, 1); while (*code == OP_ALT);  
     c = *code;  
     continue;  
     }  
   
2375    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
2376    implies a subroutine call, we can scan it. */    implies a backward reference subroutine call, we can scan it. If it's a
2377      forward reference subroutine call, we can't. To detect forward reference
2378      we have to scan up the list that is kept in the workspace. This function is
2379      called only when doing the real compile, not during the pre-compile that
2380      measures the size of the compiled pattern. */
2381    
2382    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2383      {      {
2384      BOOL empty_branch = FALSE;      const pcre_uchar *scode;
2385      const uschar *scode = cd->start_code + GET(code, 1);      BOOL empty_branch;
2386    
2387        /* Test for forward reference */
2388    
2389        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2390          if ((int)GET(scode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2391    
2392        /* Not a forward reference, test for completed backward reference */
2393    
2394        empty_branch = FALSE;
2395        scode = cd->start_code + GET(code, 1);
2396      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2397    
2398        /* Completed backwards reference */
2399    
2400      do      do
2401        {        {
2402        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf, cd))
2403          {          {
2404          empty_branch = TRUE;          empty_branch = TRUE;
2405          break;          break;
# Line 1942  for (code = first_significant_code(code Line 2407  for (code = first_significant_code(code
2407        scode += GET(scode, 1);        scode += GET(scode, 1);
2408        }        }
2409      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2410    
2411      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2412      continue;      continue;
2413      }      }
2414    
2415      /* Groups with zero repeats can of course be empty; skip them. */
2416    
2417      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2418          c == OP_BRAPOSZERO)
2419        {
2420        code += PRIV(OP_lengths)[c];
2421        do code += GET(code, 1); while (*code == OP_ALT);
2422        c = *code;
2423        continue;
2424        }
2425    
2426      /* A nested group that is already marked as "could be empty" can just be
2427      skipped. */
2428    
2429      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2430          c == OP_SCBRA || c == OP_SCBRAPOS)
2431        {
2432        do code += GET(code, 1); while (*code == OP_ALT);
2433        c = *code;
2434        continue;
2435        }
2436    
2437    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2438    
2439    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2440          c == OP_CBRA || c == OP_CBRAPOS ||
2441          c == OP_ONCE || c == OP_ONCE_NC ||
2442          c == OP_COND)
2443      {      {
2444      BOOL empty_branch;      BOOL empty_branch;
2445      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1964  for (code = first_significant_code(code Line 2455  for (code = first_significant_code(code
2455        empty_branch = FALSE;        empty_branch = FALSE;
2456        do        do
2457          {          {
2458          if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2459            empty_branch = TRUE;            empty_branch = TRUE;
2460          code += GET(code, 1);          code += GET(code, 1);
2461          }          }
# Line 1982  for (code = first_significant_code(code Line 2473  for (code = first_significant_code(code
2473      {      {
2474      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2475      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2476      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2477      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2478      here. */      here. */
2479    
2480  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2481      case OP_XCLASS:      case OP_XCLASS:
2482      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2483      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 1994  for (code = first_significant_code(code Line 2485  for (code = first_significant_code(code
2485    
2486      case OP_CLASS:      case OP_CLASS:
2487      case OP_NCLASS:      case OP_NCLASS:
2488      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2489    
2490  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2491      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2492  #endif  #endif
2493    
# Line 2035  for (code = first_significant_code(code Line 2526  for (code = first_significant_code(code
2526      case OP_ALLANY:      case OP_ALLANY:
2527      case OP_ANYBYTE:      case OP_ANYBYTE:
2528      case OP_CHAR:      case OP_CHAR:
2529      case OP_CHARNC:      case OP_CHARI:
2530      case OP_NOT:      case OP_NOT:
2531        case OP_NOTI:
2532      case OP_PLUS:      case OP_PLUS:
2533      case OP_MINPLUS:      case OP_MINPLUS:
2534      case OP_POSPLUS:      case OP_POSPLUS:
# Line 2068  for (code = first_significant_code(code Line 2560  for (code = first_significant_code(code
2560      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2561      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2562      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2563      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2564          code += 2;
2565      break;      break;
2566    
2567      /* End of branch */      /* End of branch */
# Line 2076  for (code = first_significant_code(code Line 2569  for (code = first_significant_code(code
2569      case OP_KET:      case OP_KET:
2570      case OP_KETRMAX:      case OP_KETRMAX:
2571      case OP_KETRMIN:      case OP_KETRMIN:
2572        case OP_KETRPOS:
2573      case OP_ALT:      case OP_ALT:
2574      return TRUE;      return TRUE;
2575    
2576      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2577      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
2578    
2579  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2580      case OP_STAR:      case OP_STAR:
2581        case OP_STARI:
2582      case OP_MINSTAR:      case OP_MINSTAR:
2583        case OP_MINSTARI:
2584      case OP_POSSTAR:      case OP_POSSTAR:
2585        case OP_POSSTARI:
2586      case OP_QUERY:      case OP_QUERY:
2587        case OP_QUERYI:
2588      case OP_MINQUERY:      case OP_MINQUERY:
2589        case OP_MINQUERYI:
2590      case OP_POSQUERY:      case OP_POSQUERY:
2591      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      case OP_POSQUERYI:
2592        if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2593      break;      break;
2594    
2595      case OP_UPTO:      case OP_UPTO:
2596        case OP_UPTOI:
2597      case OP_MINUPTO:      case OP_MINUPTO:
2598        case OP_MINUPTOI:
2599      case OP_POSUPTO:      case OP_POSUPTO:
2600      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      case OP_POSUPTOI:
2601        if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2602      break;      break;
2603  #endif  #endif
2604    
# Line 2105  for (code = first_significant_code(code Line 2608  for (code = first_significant_code(code
2608      case OP_MARK:      case OP_MARK:
2609      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
2610      case OP_SKIP_ARG:      case OP_SKIP_ARG:
2611        code += code[1];
2612        break;
2613    
2614      case OP_THEN_ARG:      case OP_THEN_ARG:
2615      code += code[1];      code += code[1];
2616      break;      break;
# Line 2129  return TRUE; Line 2635  return TRUE;
2635  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2636  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2637  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2638    This function is called only during the real compile, not during the
2639    pre-compile.
2640    
2641  Arguments:  Arguments:
2642    code        points to start of the recursion    code        points to start of the recursion
2643    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2644    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2645    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2646    cd          pointers to tables etc    cd          pointers to tables etc
2647    
2648  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2649  */  */
2650    
2651  static BOOL  static BOOL
2652  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2653    BOOL utf8, compile_data *cd)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2654  {  {
2655  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2656    {    {
2657    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2658      return FALSE;      return FALSE;
2659    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2660    }    }
# Line 2179  where Perl recognizes it as the POSIX cl Line 2687  where Perl recognizes it as the POSIX cl
2687  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2688  I think.  I think.
2689    
2690    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2691    It seems that the appearance of a nested POSIX class supersedes an apparent
2692    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2693    a digit.
2694    
2695    In Perl, unescaped square brackets may also appear as part of class names. For
2696    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2697    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2698    seem right at all. PCRE does not allow closing square brackets in POSIX class
2699    names.
2700    
2701  Arguments:  Arguments:
2702    ptr      pointer to the initial [    ptr      pointer to the initial [
2703    endptr   where to return the end pointer    endptr   where to return the end pointer
# Line 2187  Returns:   TRUE or FALSE Line 2706  Returns:   TRUE or FALSE
2706  */  */
2707    
2708  static BOOL  static BOOL
2709  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2710  {  {
2711  int terminator;          /* Don't combine these lines; the Solaris cc */  pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
2712  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2713  for (++ptr; *ptr != 0; ptr++)  for (++ptr; *ptr != 0; ptr++)
2714    {    {
2715    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2716        ptr++;
2717      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2718      else
2719      {      {
     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;  
2720      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2721        {        {
2722        *endptr = ptr;        *endptr = ptr;
2723        return TRUE;        return TRUE;
2724        }        }
2725        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2726             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2727              ptr[1] == CHAR_EQUALS_SIGN) &&
2728            check_posix_syntax(ptr, endptr))
2729          return FALSE;
2730      }      }
2731    }    }
2732  return FALSE;  return FALSE;
# Line 2224  Returns:     a value representing the na Line 2750  Returns:     a value representing the na
2750  */  */
2751    
2752  static int  static int
2753  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2754  {  {
2755  const char *pn = posix_names;  const char *pn = posix_names;
2756  register int yield = 0;  register int yield = 0;
2757  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2758    {    {
2759    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2760      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
2761    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2762    yield++;    yield++;
2763    }    }
# Line 2263  value in the reference (which is a group Line 2789  value in the reference (which is a group
2789  Arguments:  Arguments:
2790    group      points to the start of the group    group      points to the start of the group
2791    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
2792    utf8       TRUE in UTF-8 mode    utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
2793    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
2794    save_hwm   the hwm forward reference pointer at the start of the group    save_hwm   the hwm forward reference pointer at the start of the group
2795    
# Line 2271  Returns:     nothing Line 2797  Returns:     nothing
2797  */  */
2798    
2799  static void  static void
2800  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2801    uschar *save_hwm)    pcre_uchar *save_hwm)
2802  {  {
2803  uschar *ptr = group;  pcre_uchar *ptr = group;
2804    
2805  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2806    {    {
2807    int offset;    int offset;
2808    uschar *hc;    pcre_uchar *hc;
2809    
2810    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2811    reference. */    reference. */
2812    
2813    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2814      {      {
2815      offset = GET(hc, 0);      offset = (int)GET(hc, 0);
2816      if (cd->start_code + offset == ptr + 1)      if (cd->start_code + offset == ptr + 1)
2817        {        {
2818        PUT(hc, 0, offset + adjust);        PUT(hc, 0, offset + adjust);
# Line 2299  while ((ptr = (uschar *)find_recurse(ptr Line 2825  while ((ptr = (uschar *)find_recurse(ptr
2825    
2826    if (hc >= cd->hwm)    if (hc >= cd->hwm)
2827      {      {
2828      offset = GET(ptr, 1);      offset = (int)GET(ptr, 1);
2829      if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);      if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2830      }      }
2831    
# Line 2324  Arguments: Line 2850  Arguments:
2850  Returns:         new code pointer  Returns:         new code pointer
2851  */  */
2852    
2853  static uschar *  static pcre_uchar *
2854  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2855  {  {
2856  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2857  *code++ = 255;  *code++ = 255;
2858  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2859  PUT(code, LINK_SIZE, 0);                       /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2860  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2861  }  }
2862    
2863    
# Line 2353  Returns:             nothing Line 2879  Returns:             nothing
2879  */  */
2880    
2881  static void  static void
2882  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2883  {  {
2884  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2885  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
# Line 2367  PUT(previous_callout, 2 + LINK_SIZE, len Line 2893  PUT(previous_callout, 2 + LINK_SIZE, len
2893  *************************************************/  *************************************************/
2894    
2895  /* This function is passed the start and end of a class range, in UTF-8 mode  /* This function is passed the start and end of a class range, in UTF-8 mode
2896  with UCP support. It searches up the characters, looking for internal ranges of  with UCP support. It searches up the characters, looking for ranges of
2897  characters in the "other" case. Each call returns the next one, updating the  characters in the "other" case. Each call returns the next one, updating the
2898  start address.  start address. A character with multiple other cases is returned on its own
2899    with a special return value.
2900    
2901  Arguments:  Arguments:
2902    cptr        points to starting character value; updated    cptr        points to starting character value; updated
# Line 2377  Arguments: Line 2904  Arguments:
2904    ocptr       where to put start of othercase range    ocptr       where to put start of othercase range
2905    odptr       where to put end of othercase range    odptr       where to put end of othercase range
2906    
2907  Yield:        TRUE when range returned; FALSE when no more  Yield:        -1 when no more
2908                   0 when a range is returned
2909                  >0 the CASESET offset for char with multiple other cases
2910                    in this case, ocptr contains the original
2911  */  */
2912    
2913  static BOOL  static int
2914  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,  get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
2915    unsigned int *odptr)    pcre_uint32 *odptr)
2916  {  {
2917  unsigned int c, othercase, next;  pcre_uint32 c, othercase, next;
2918    int co;
2919    
2920    /* Find the first character that has an other case. If it has multiple other
2921    cases, return its case offset value. */
2922    
2923  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2924    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }    {
2925      if ((co = UCD_CASESET(c)) != 0)
2926        {
2927        *ocptr = c++;   /* Character that has the set */
2928        *cptr = c;      /* Rest of input range */
2929        return co;
2930        }
2931      if ((othercase = UCD_OTHERCASE(c)) != c) break;
2932      }
2933    
2934  if (c > d) return FALSE;  if (c > d) return -1;  /* Reached end of range */
2935    
2936  *ocptr = othercase;  *ocptr = othercase;
2937  next = othercase + 1;  next = othercase + 1;
# Line 2400  for (++c; c <= d; c++) Line 2942  for (++c; c <= d; c++)
2942    next++;    next++;
2943    }    }
2944    
2945  *odptr = next - 1;  *odptr = next - 1;     /* End of othercase range */
2946  *cptr = c;  *cptr = c;             /* Rest of input range */
2947    return 0;
 return TRUE;  
2948  }  }
2949    
2950    
# Line 2425  Returns:       TRUE if auto-possessifyin Line 2966  Returns:       TRUE if auto-possessifyin
2966  */  */
2967    
2968  static BOOL  static BOOL
2969  check_char_prop(int c, int ptype, int pdata, BOOL negated)  check_char_prop(pcre_uint32 c, int ptype, int pdata, BOOL negated)
2970  {  {
2971    #ifdef SUPPORT_UCP
2972    const pcre_uint32 *p;
2973    #endif
2974    
2975  const ucd_record *prop = GET_UCD(c);  const ucd_record *prop = GET_UCD(c);
2976    
2977  switch(ptype)  switch(ptype)
2978    {    {
2979    case PT_LAMP:    case PT_LAMP:
# Line 2436  switch(ptype) Line 2982  switch(ptype)
2982            prop->chartype == ucp_Lt) == negated;            prop->chartype == ucp_Lt) == negated;
2983    
2984    case PT_GC:    case PT_GC:
2985    return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2986    
2987    case PT_PC:    case PT_PC:
2988    return (pdata == prop->chartype) == negated;    return (pdata == prop->chartype) == negated;
# Line 2447  switch(ptype) Line 2993  switch(ptype)
2993    /* These are specials */    /* These are specials */
2994    
2995    case PT_ALNUM:    case PT_ALNUM:
2996    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2997            _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2998    
2999    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
3000    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
3001            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
3002            == negated;            == negated;
3003    
3004    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
3005    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
3006            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
3007            c == CHAR_FF || c == CHAR_CR)            c == CHAR_FF || c == CHAR_CR)
3008            == negated;            == negated;
3009    
3010    case PT_WORD:    case PT_WORD:
3011    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
3012            _pcre_ucp_gentype[prop->chartype] == ucp_N ||            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
3013            c == CHAR_UNDERSCORE) == negated;            c == CHAR_UNDERSCORE) == negated;
3014    
3015    #ifdef SUPPORT_UCP
3016      case PT_CLIST:
3017      p = PRIV(ucd_caseless_sets) + prop->caseset;
3018      for (;;)
3019        {
3020        if ((unsigned int)c < *p) return !negated;
3021        if ((unsigned int)c == *p++) return negated;
3022        }
3023      break;  /* Control never reaches here */
3024    #endif
3025    }    }
3026    
3027  return FALSE;  return FALSE;
3028  }  }
3029  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 2482  sense to automatically possessify the re Line 3040  sense to automatically possessify the re
3040    
3041  Arguments:  Arguments:
3042    previous      pointer to the repeated opcode    previous      pointer to the repeated opcode
3043    utf8          TRUE in UTF-8 mode    utf           TRUE in UTF-8 / UTF-16 / UTF-32 mode
3044    ptr           next character in pattern    ptr           next character in pattern
3045    options       options bits    options       options bits
3046    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 2491  Returns:        TRUE if possessifying is Line 3049  Returns:        TRUE if possessifying is
3049  */  */
3050    
3051  static BOOL  static BOOL
3052  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
3053    int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
3054  {  {
3055  int c, next;  pcre_uint32 c = NOTACHAR;
3056  int op_code = *previous++;  pcre_uint32 next;
3057    int escape;
3058    pcre_uchar op_code = *previous++;
3059    
3060  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
3061    
# Line 2503  if ((options & PCRE_EXTENDED) != 0) Line 3063  if ((options & PCRE_EXTENDED) != 0)
3063    {    {
3064    for (;;)    for (;;)
3065      {      {
3066      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3067      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3068        {        {
3069        while (*(++ptr) != 0)        ptr++;
3070          while (*ptr != 0)
3071            {
3072          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3073            ptr++;
3074    #ifdef SUPPORT_UTF
3075            if (utf) FORWARDCHAR(ptr);
3076    #endif
3077            }
3078        }        }
3079      else break;      else break;
3080      }      }
# Line 2519  value is a character, a negative value i Line 3086  value is a character, a negative value i
3086  if (*ptr == CHAR_BACKSLASH)  if (*ptr == CHAR_BACKSLASH)
3087    {    {
3088    int temperrorcode = 0;    int temperrorcode = 0;
3089    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);    escape = check_escape(&ptr, &next, &temperrorcode, cd->bracount, options, FALSE);
3090    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
3091    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
3092    }    }
3093    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
3094    {    {
3095  #ifdef SUPPORT_UTF8    escape = 0;
3096    if (utf8) { GETCHARINC(next, ptr); } else  #ifdef SUPPORT_UTF
3097      if (utf) { GETCHARINC(next, ptr); } else
3098  #endif  #endif
3099    next = *ptr++;    next = *ptr++;
3100    }    }
   
3101  else return FALSE;  else return FALSE;
3102    
3103  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2540  if ((options & PCRE_EXTENDED) != 0) Line 3106  if ((options & PCRE_EXTENDED) != 0)
3106    {    {
3107    for (;;)    for (;;)
3108      {      {
3109      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3110      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3111        {        {
3112        while (*(++ptr) != 0)        ptr++;
3113          while (*ptr != 0)
3114            {
3115          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3116            ptr++;
3117    #ifdef SUPPORT_UTF
3118            if (utf) FORWARDCHAR(ptr);
3119    #endif
3120            }
3121        }        }
3122      else break;      else break;
3123      }      }
# Line 2553  if ((options & PCRE_EXTENDED) != 0) Line 3126  if ((options & PCRE_EXTENDED) != 0)
3126  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
3127    
3128  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3129    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3130      return FALSE;      return FALSE;
3131    
3132  /* Now compare the next item with the previous opcode. First, handle cases when  /* If the previous item is a character, get its value. */
 the next item is a character. */  
3133    
3134  if (next >= 0) switch(op_code)  if (op_code == OP_CHAR || op_code == OP_CHARI ||
3135        op_code == OP_NOT || op_code == OP_NOTI)
3136      //if (escape == 0) switch(op_code)
3137    {    {
3138    case OP_CHAR:  #ifdef SUPPORT_UTF
 #ifdef SUPPORT_UTF8  
3139    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3140  #else  #else
3141    c = *previous;    c = *previous;
3142  #endif  #endif
3143    return c != next;    }
3144    
3145    /* For CHARNC (caseless character) we must check the other case. If we have  /* Now compare the next item with the previous opcode. First, handle cases when
3146    Unicode property support, we can use it to test the other case of  the next item is a character. */
   high-valued characters. */  
3147    
3148    case OP_CHARNC:  if (escape == 0)
3149  #ifdef SUPPORT_UTF8    {
3150    GETCHARTEST(c, previous);    /* For a caseless UTF match, the next character may have more than one other
3151  #else    case, which maps to the special PT_CLIST property. Check this first. */
3152    c = *previous;  
3153    #ifdef SUPPORT_UCP
3154      if (utf && c != NOTACHAR && (options & PCRE_CASELESS) != 0)
3155        {
3156        int ocs = UCD_CASESET(next);
3157        if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, op_code >= OP_NOT);
3158        }
3159  #endif  #endif
3160    if (c == next) return FALSE;  
3161  #ifdef SUPPORT_UTF8    switch(op_code)
   if (utf8)  
3162      {      {
3163      unsigned int othercase;      case OP_CHAR:
3164      if (next < 128) othercase = cd->fcc[next]; else      return c != next;
3165    
3166        /* For CHARI (caseless character) we must check the other case. If we have
3167        Unicode property support, we can use it to test the other case of
3168        high-valued characters. We know that next can have only one other case,
3169        because multi-other-case characters are dealt with above. */
3170    
3171        case OP_CHARI:
3172        if (c == next) return FALSE;
3173    #ifdef SUPPORT_UTF
3174        if (utf)
3175          {
3176          pcre_uint32 othercase;
3177          if (next < 128) othercase = cd->fcc[next]; else
3178  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3179      othercase = UCD_OTHERCASE((unsigned int)next);        othercase = UCD_OTHERCASE(next);
3180  #else  #else
3181      othercase = NOTACHAR;        othercase = NOTACHAR;
3182  #endif  #endif
3183      return (unsigned int)c != othercase;        return c != othercase;
3184      }        }
3185    else      else
3186  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3187    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */      return (c != TABLE_GET(next, cd->fcc, next));  /* Not UTF */
3188    
3189    /* For OP_NOT, its data is always a single-byte character. */      case OP_NOT:
3190        return c == next;
3191    case OP_NOT:  
3192    if ((c = *previous) == next) return TRUE;      case OP_NOTI:
3193    if ((options & PCRE_CASELESS) == 0) return FALSE;      if (c == next) return TRUE;
3194  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3195    if (utf8)      if (utf)
3196      {        {
3197      unsigned int othercase;        pcre_uint32 othercase;
3198      if (next < 128) othercase = cd->fcc[next]; else        if (next < 128) othercase = cd->fcc[next]; else
3199  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3200      othercase = UCD_OTHERCASE(next);        othercase = UCD_OTHERCASE(next);
3201  #else  #else
3202      othercase = NOTACHAR;        othercase = NOTACHAR;
3203  #endif  #endif
3204      return (unsigned int)c == othercase;        return c == othercase;
3205      }        }
3206    else      else
3207  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3208    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */      return (c == TABLE_GET(next, cd->fcc, next));  /* Not UTF */
3209    
3210    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.      /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3211    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */      When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3212    
3213    case OP_DIGIT:      case OP_DIGIT:
3214    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;      return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3215    
3216    case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
3217    return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;      return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3218    
3219    case OP_WHITESPACE:      case OP_WHITESPACE:
3220    return next > 127 || (cd->ctypes[next] & ctype_space) == 0;      return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3221    
3222    case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
3223    return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;      return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3224    
3225    case OP_WORDCHAR:      case OP_WORDCHAR:
3226    return next > 127 || (cd->ctypes[next] & ctype_word) == 0;      return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3227    
3228    case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
3229    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;      return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3230    
3231    case OP_HSPACE:      case OP_HSPACE:
3232    case OP_NOT_HSPACE:      case OP_NOT_HSPACE:
3233    switch(next)      switch(next)
3234      {        {
3235      case 0x09:        HSPACE_CASES:
3236      case 0x20:        return op_code == OP_NOT_HSPACE;
     case 0xa0:  
     case 0x1680:  
     case 0x180e:  
     case 0x2000:  
     case 0x2001:  
     case 0x2002:  
     case 0x2003:  
     case 0x2004:  
     case 0x2005:  
     case 0x2006:  
     case 0x2007:  
     case 0x2008:  
     case 0x2009:  
     case 0x200A:  
     case 0x202f:  
     case 0x205f:  
     case 0x3000:  
     return op_code == OP_NOT_HSPACE;  
     default:  
     return op_code != OP_NOT_HSPACE;  
     }  
3237    
3238    case OP_ANYNL:        default:
3239    case OP_VSPACE:        return op_code != OP_NOT_HSPACE;
3240    case OP_NOT_VSPACE:        }
3241    switch(next)  
3242      {      case OP_ANYNL:
3243      case 0x0a:      case OP_VSPACE:
3244      case 0x0b:      case OP_NOT_VSPACE:
3245      case 0x0c:      switch(next)
3246      case 0x0d:        {
3247      case 0x85:        VSPACE_CASES:
3248      case 0x2028:        return op_code == OP_NOT_VSPACE;
3249      case 0x2029:  
3250      return op_code == OP_NOT_VSPACE;        default:
3251      default:        return op_code != OP_NOT_VSPACE;
3252      return op_code != OP_NOT_VSPACE;        }
     }  
3253    
3254  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3255    case OP_PROP:      case OP_PROP:
3256    return check_char_prop(next, previous[0], previous[1], FALSE);      return check_char_prop(next, (int)previous[0], (int)previous[1], FALSE);
3257    
3258    case OP_NOTPROP:      case OP_NOTPROP:
3259    return check_char_prop(next, previous[0], previous[1], TRUE);      return check_char_prop(next, (int)previous[0], (int)previous[1], TRUE);
3260  #endif  #endif
3261    
3262    default:      default:
3263    return FALSE;      return FALSE;
3264        }
3265    }    }
3266    
   
3267  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3268  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3269  generated only when PCRE_UCP is *not* set, that is, when only ASCII  generated only when PCRE_UCP is *not* set, that is, when only ASCII
# Line 2705  replaced by OP_PROP codes when PCRE_UCP Line 3273  replaced by OP_PROP codes when PCRE_UCP
3273  switch(op_code)  switch(op_code)
3274    {    {
3275    case OP_CHAR:    case OP_CHAR:
3276    case OP_CHARNC:    case OP_CHARI:
3277  #ifdef SUPPORT_UTF8    switch(escape)
   GETCHARTEST(c, previous);  
 #else  
   c = *previous;  
 #endif  
   switch(-next)  
3278      {      {
3279      case ESC_d:      case ESC_d:
3280      return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;      return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3281    
3282      case ESC_D:      case ESC_D:
3283      return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3284    
3285      case ESC_s:      case ESC_s:
3286      return c > 127 || (cd->ctypes[c] & ctype_space) == 0;      return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3287    
3288      case ESC_S:      case ESC_S:
3289      return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3290    
3291      case ESC_w:      case ESC_w:
3292      return c > 127 || (cd->ctypes[c] & ctype_word) == 0;      return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3293    
3294      case ESC_W:      case ESC_W:
3295      return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3296    
3297      case ESC_h:      case ESC_h:
3298      case ESC_H:      case ESC_H:
3299      switch(c)      switch(c)
3300        {        {
3301        case 0x09:        HSPACE_CASES:
3302        case 0x20:        return escape != ESC_h;
3303        case 0xa0:  
       case 0x1680:  
       case 0x180e:  
       case 0x2000:  
       case 0x2001:  
       case 0x2002:  
       case 0x2003:  
       case 0x2004:  
       case 0x2005:  
       case 0x2006:  
       case 0x2007:  
       case 0x2008:  
       case 0x2009:  
       case 0x200A:  
       case 0x202f:  
       case 0x205f:  
       case 0x3000:  
       return -next != ESC_h;  
3304        default:        default:
3305        return -next == ESC_h;        return escape == ESC_h;
3306        }        }
3307    
3308      case ESC_v:      case ESC_v:
3309      case ESC_V:      case ESC_V:
3310      switch(c)      switch(c)
3311        {        {
3312        case 0x0a:        VSPACE_CASES:
3313        case 0x0b:        return escape != ESC_v;
3314        case 0x0c:  
       case 0x0d:  
       case 0x85:  
       case 0x2028:  
       case 0x2029:  
       return -next != ESC_v;  
3315        default:        default:
3316        return -next == ESC_v;        return escape == ESC_v;
3317        }        }
3318    
3319      /* When PCRE_UCP is set, these values get generated for \d etc. Find      /* When PCRE_UCP is set, these values get generated for \d etc. Find
3320      their substitutions and process them. The result will always be either      their substitutions and process them. The result will always be either
3321      -ESC_p or -ESC_P. Then fall through to process those values. */      ESC_p or ESC_P. Then fall through to process those values. */
3322    
3323  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3324      case ESC_du:      case ESC_du:
# Line 2788  switch(op_code) Line 3329  switch(op_code)
3329      case ESC_SU:      case ESC_SU:
3330        {        {
3331        int temperrorcode = 0;        int temperrorcode = 0;
3332        ptr = substitutes[-next - ESC_DU];        ptr = substitutes[escape - ESC_DU];
3333        next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);        escape = check_escape(&ptr, &next, &temperrorcode, 0, options, FALSE);
3334        if (temperrorcode != 0) return FALSE;        if (temperrorcode != 0) return FALSE;
3335        ptr++;    /* For compatibility */        ptr++;    /* For compatibility */
3336        }        }
# Line 2811  switch(op_code) Line 3352  switch(op_code)
3352        to the original \d etc. At this point, ptr will point to a zero byte. */        to the original \d etc. At this point, ptr will point to a zero byte. */
3353    
3354        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3355          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3356            return FALSE;            return FALSE;
3357    
3358        /* Do the property check. */        /* Do the property check. */
3359    
3360        return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);        return check_char_prop(c, ptype, pdata, (escape == ESC_P) != negated);
3361        }        }
3362  #endif  #endif
3363    
# Line 2831  switch(op_code) Line 3372  switch(op_code)
3372    these op-codes are never generated.) */    these op-codes are never generated.) */
3373    
3374    case OP_DIGIT:    case OP_DIGIT:
3375    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||    return escape == ESC_D || escape == ESC_s || escape == ESC_W ||
3376           next == -ESC_h || next == -ESC_v || next == -ESC_R;           escape == ESC_h || escape == ESC_v || escape == ESC_R;
3377    
3378    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3379    return next == -ESC_d;    return escape == ESC_d;
3380    
3381    case OP_WHITESPACE:    case OP_WHITESPACE:
3382    return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;    return escape == ESC_S || escape == ESC_d || escape == ESC_w;
3383    
3384    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3385    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return escape == ESC_s || escape == ESC_h || escape == ESC_v || escape == ESC_R;
3386    
3387    case OP_HSPACE:    case OP_HSPACE:
3388    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||    return escape == ESC_S || escape == ESC_H || escape == ESC_d ||
3389           next == -ESC_w || next == -ESC_v || next == -ESC_R;           escape == ESC_w || escape == ESC_v || escape == ESC_R;
3390    
3391    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
3392    return next == -ESC_h;    return escape == ESC_h;
3393    
3394    /* Can't have \S in here because VT matches \S (Perl anomaly) */    /* Can't have \S in here because VT matches \S (Perl anomaly) */
3395    case OP_ANYNL:    case OP_ANYNL:
3396    case OP_VSPACE:    case OP_VSPACE:
3397    return next == -ESC_V || next == -ESC_d || next == -ESC_w;    return escape == ESC_V || escape == ESC_d || escape == ESC_w;
3398    
3399    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3400    return next == -ESC_v || next == -ESC_R;    return escape == ESC_v || escape == ESC_R;
3401    
3402    case OP_WORDCHAR:    case OP_WORDCHAR:
3403    return next == -ESC_W || next == -ESC_s || next == -ESC_h ||    return escape == ESC_W || escape == ESC_s || escape == ESC_h ||
3404           next == -ESC_v || next == -ESC_R;           escape == ESC_v || escape == ESC_R;
3405    
3406    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3407    return next == -ESC_w || next == -ESC_d;    return escape == ESC_w || escape == ESC_d;
3408    
3409    default:    default:
3410    return FALSE;    return FALSE;
# Line 2875  switch(op_code) Line 3416  switch(op_code)
3416    
3417    
3418  /*************************************************  /*************************************************
3419    *        Add a character or range to a class     *
3420    *************************************************/
3421    
3422    /* This function packages up the logic of adding a character or range of
3423    characters to a class. The character values in the arguments will be within the
3424    valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3425    mutually recursive with the function immediately below.
3426    
3427    Arguments:
3428      classbits     the bit map for characters < 256
3429      uchardptr     points to the pointer for extra data
3430      options       the options word
3431      cd            contains pointers to tables etc.
3432      start         start of range character
3433      end           end of range character
3434    
3435    Returns:        the number of < 256 characters added
3436                    the pointer to extra data is updated
3437    */
3438    
3439    static int
3440    add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3441      compile_data *cd, pcre_uint32 start, pcre_uint32 end)
3442    {
3443    pcre_uint32 c;
3444    int n8 = 0;
3445    
3446    /* If caseless matching is required, scan the range and process alternate
3447    cases. In Unicode, there are 8-bit characters that have alternate cases that
3448    are greater than 255 and vice-versa. Sometimes we can just extend the original
3449    range. */
3450    
3451    if ((options & PCRE_CASELESS) != 0)
3452      {
3453    #ifdef SUPPORT_UCP
3454      if ((options & PCRE_UTF8) != 0)
3455        {
3456        int rc;
3457        pcre_uint32 oc, od;
3458    
3459        options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
3460        c = start;
3461    
3462        while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3463          {
3464          /* Handle a single character that has more than one other case. */
3465    
3466          if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3467            PRIV(ucd_caseless_sets) + rc, oc);
3468    
3469          /* Do nothing if the other case range is within the original range. */
3470    
3471          else if (oc >= start && od <= end) continue;
3472    
3473          /* Extend the original range if there is overlap, noting that if oc < c, we
3474          can't have od > end because a subrange is always shorter than the basic
3475          range. Otherwise, use a recursive call to add the additional range. */
3476    
3477          else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3478          else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */
3479          else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3480          }
3481        }
3482      else
3483    #endif  /* SUPPORT_UCP */
3484    
3485      /* Not UTF-mode, or no UCP */
3486    
3487      for (c = start; c <= end && c < 256; c++)
3488        {
3489        SETBIT(classbits, cd->fcc[c]);
3490        n8++;
3491        }
3492      }
3493    
3494    /* Now handle the original range. Adjust the final value according to the bit
3495    length - this means that the same lists of (e.g.) horizontal spaces can be used
3496    in all cases. */
3497    
3498    #if defined COMPILE_PCRE8
3499    #ifdef SUPPORT_UTF
3500      if ((options & PCRE_UTF8) == 0)
3501    #endif
3502      if (end > 0xff) end = 0xff;
3503    
3504    #elif defined COMPILE_PCRE16
3505    #ifdef SUPPORT_UTF
3506      if ((options & PCRE_UTF16) == 0)
3507    #endif
3508      if (end > 0xffff) end = 0xffff;
3509    
3510    #endif /* COMPILE_PCRE[8|16] */
3511    
3512    /* If all characters are less than 256, use the bit map. Otherwise use extra
3513    data. */
3514    
3515    if (end < 0x100)
3516      {
3517      for (c = start; c <= end; c++)
3518        {
3519        n8++;
3520        SETBIT(classbits, c);
3521        }
3522      }
3523    
3524    else
3525      {
3526      pcre_uchar *uchardata = *uchardptr;
3527    
3528    #ifdef SUPPORT_UTF
3529      if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
3530        {
3531        if (start < end)
3532          {
3533          *uchardata++ = XCL_RANGE;
3534          uchardata += PRIV(ord2utf)(start, uchardata);
3535          uchardata += PRIV(ord2utf)(end, uchardata);
3536          }
3537        else if (start == end)
3538          {
3539          *uchardata++ = XCL_SINGLE;
3540          uchardata += PRIV(ord2utf)(start, uchardata);
3541          }
3542        }
3543      else
3544    #endif  /* SUPPORT_UTF */
3545    
3546      /* Without UTF support, character values are constrained by the bit length,
3547      and can only be > 255 for 16-bit and 32-bit libraries. */
3548    
3549    #ifdef COMPILE_PCRE8
3550        {}
3551    #else
3552      if (start < end)
3553        {
3554        *uchardata++ = XCL_RANGE;
3555        *uchardata++ = start;
3556        *uchardata++ = end;
3557        }
3558      else if (start == end)
3559        {
3560        *uchardata++ = XCL_SINGLE;
3561        *uchardata++ = start;
3562        }
3563    #endif
3564    
3565      *uchardptr = uchardata;   /* Update extra data pointer */
3566      }
3567    
3568    return n8;    /* Number of 8-bit characters */
3569    }
3570    
3571    
3572    
3573    
3574    /*************************************************
3575    *        Add a list of characters to a class     *
3576    *************************************************/
3577    
3578    /* This function is used for adding a list of case-equivalent characters to a
3579    class, and also for adding a list of horizontal or vertical whitespace. If the
3580    list is in order (which it should be), ranges of characters are detected and
3581    handled appropriately. This function is mutually recursive with the function
3582    above.
3583    
3584    Arguments:
3585      classbits     the bit map for characters < 256
3586      uchardptr     points to the pointer for extra data
3587      options       the options word
3588      cd            contains pointers to tables etc.
3589      p             points to row of 32-bit values, terminated by NOTACHAR
3590      except        character to omit; this is used when adding lists of
3591                      case-equivalent characters to avoid including the one we
3592                      already know about
3593    
3594    Returns:        the number of < 256 characters added
3595                    the pointer to extra data is updated
3596    */
3597    
3598    static int
3599    add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3600      compile_data *cd, const pcre_uint32 *p, unsigned int except)
3601    {
3602    int n8 = 0;
3603    while (p[0] < NOTACHAR)
3604      {
3605      int n = 0;
3606      if (p[0] != except)
3607        {
3608        while(p[n+1] == p[0] + n + 1) n++;
3609        n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
3610        }
3611      p += n + 1;
3612      }
3613    return n8;
3614    }
3615    
3616    
3617    
3618    /*************************************************
3619    *    Add characters not in a list to a class     *
3620    *************************************************/
3621    
3622    /* This function is used for adding the complement of a list of horizontal or
3623    vertical whitespace to a class. The list must be in order.
3624    
3625    Arguments:
3626      classbits     the bit map for characters < 256
3627      uchardptr     points to the pointer for extra data
3628      options       the options word
3629      cd            contains pointers to tables etc.
3630      p             points to row of 32-bit values, terminated by NOTACHAR
3631    
3632    Returns:        the number of < 256 characters added
3633                    the pointer to extra data is updated
3634    */
3635    
3636    static int
3637    add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
3638      int options, compile_data *cd, const pcre_uint32 *p)
3639    {
3640    BOOL utf = (options & PCRE_UTF8) != 0;
3641    int n8 = 0;
3642    if (p[0] > 0)
3643      n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
3644    while (p[0] < NOTACHAR)
3645      {
3646      while (p[1] == p[0] + 1) p++;
3647      n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
3648        (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
3649      p++;
3650      }
3651    return n8;
3652    }
3653    
3654    
3655    
3656    /*************************************************
3657  *           Compile one branch                   *  *           Compile one branch                   *
3658  *************************************************/  *************************************************/
3659    
# Line 2889  Arguments: Line 3668  Arguments:
3668    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3669    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3670    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3671    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr    place to put the first required character
3672    reqbyteptr     set to the last literal character required, else < 0    firstcharflagsptr place to put the first character flags, or a negative number
3673      reqcharptr     place to put the last required character
3674      reqcharflagsptr place to put the last required character flags, or a negative number
3675    bcptr          points to current branch chain    bcptr          points to current branch chain
3676      cond_depth     conditional nesting depth
3677    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3678    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3679                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 2901  Returns:         TRUE on success Line 3683  Returns:         TRUE on success
3683  */  */
3684    
3685  static BOOL  static BOOL
3686  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3687    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr,
3688      pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
3689      pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
3690      branch_chain *bcptr, int cond_depth,
3691    compile_data *cd, int *lengthptr)    compile_data *cd, int *lengthptr)
3692  {  {
3693  int repeat_type, op_type;  int repeat_type, op_type;
3694  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3695  int bravalue = 0;  int bravalue = 0;
3696  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3697  int firstbyte, reqbyte;  pcre_uint32 firstchar, reqchar;
3698  int zeroreqbyte, zerofirstbyte;  pcre_int32 firstcharflags, reqcharflags;
3699  int req_caseopt, reqvary, tempreqvary;  pcre_uint32 zeroreqchar, zerofirstchar;
3700  int options = *optionsptr;  pcre_int32 zeroreqcharflags, zerofirstcharflags;
3701    pcre_int32 req_caseopt, reqvary, tempreqvary;
3702    int options = *optionsptr;               /* May change dynamically */
3703  int after_manual_callout = 0;  int after_manual_callout = 0;
3704  int length_prevgroup = 0;  int length_prevgroup = 0;
3705  register int c;  register pcre_uint32 c;
3706  register uschar *code = *codeptr;  int escape;
3707  uschar *last_code = code;  register pcre_uchar *code = *codeptr;
3708  uschar *orig_code = code;  pcre_uchar *last_code = code;
3709  uschar *tempcode;  pcre_uchar *orig_code = code;
3710    pcre_uchar *tempcode;
3711  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3712  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3713  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3714  const uschar *tempptr;  const pcre_uchar *tempptr;
3715  const uschar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
3716  uschar *previous = NULL;  pcre_uchar *previous = NULL;
3717  uschar *previous_callout = NULL;  pcre_uchar *previous_callout = NULL;
3718  uschar *save_hwm = NULL;  pcre_uchar *save_hwm = NULL;
3719  uschar classbits[32];  pcre_uint8 classbits[32];
3720    
3721  #ifdef SUPPORT_UTF8  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3722  BOOL class_utf8;  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3723  BOOL utf8 = (options & PCRE_UTF8) != 0;  dynamically as we process the pattern. */
3724  uschar *class_utf8data;  
3725  uschar *class_utf8data_base;  #ifdef SUPPORT_UTF
3726  uschar utf8_char[6];  /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
3727    BOOL utf = (options & PCRE_UTF8) != 0;
3728    pcre_uchar utf_chars[6];
3729  #else  #else
3730  BOOL utf8 = FALSE;  BOOL utf = FALSE;
3731  uschar *utf8_char = NULL;  #endif
3732    
3733    /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
3734    class_uchardata always so that it can be passed to add_to_class() always,
3735    though it will not be used in non-UTF 8-bit cases. This avoids having to supply
3736    alternative calls for the different cases. */
3737    
3738    pcre_uchar *class_uchardata;
3739    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3740    BOOL xclass;
3741    pcre_uchar *class_uchardata_base;
3742  #endif  #endif
3743    
3744  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 2952  greedy_non_default = greedy_default ^ 1; Line 3752  greedy_non_default = greedy_default ^ 1;
3752    
3753  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3754  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3755  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3756  find one.  find one.
3757    
3758  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3759  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3760  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3761  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3762    
3763  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
3764    firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
3765    
3766  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3767  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3768  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS leaves the lower 28 bit empty. It is added into the
3769  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3770    value. This is used only for ASCII characters. */
3771    
3772  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3773    
3774  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3775    
# Line 2979  for (;; ptr++) Line 3781  for (;; ptr++)
3781    BOOL is_quantifier;    BOOL is_quantifier;
3782    BOOL is_recurse;    BOOL is_recurse;
3783    BOOL reset_bracount;    BOOL reset_bracount;
3784    int class_charcount;    int class_has_8bitchar;
3785    int class_lastchar;    int class_one_char;
3786    int newoptions;    int newoptions;
3787    int recno;    int recno;
3788    int refsign;    int refsign;
3789    int skipbytes;    int skipbytes;
3790    int subreqbyte;    pcre_uint32 subreqchar, subfirstchar;
3791    int subfirstbyte;    pcre_int32 subreqcharflags, subfirstcharflags;
3792    int terminator;    int terminator;
3793    int mclength;    int mclength;
3794    uschar mcbuffer[8];    int tempbracount;
3795      pcre_uint32 ec;
3796      pcre_uchar mcbuffer[8];
3797    
3798    /* Get next byte in the pattern */    /* Get next character in the pattern */
3799    
3800    c = *ptr;    c = *ptr;
3801    
# Line 3013  for (;; ptr++) Line 3817  for (;; ptr++)
3817  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3818      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3819  #endif  #endif
3820      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3821            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3822        {        {
3823        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3824        goto FAILED;        goto FAILED;
# Line 3036  for (;; ptr++) Line 3841  for (;; ptr++)
3841        }        }
3842    
3843      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3844      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3845          (int)(code - last_code), c, c));
3846    
3847      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3848      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 3046  for (;; ptr++) Line 3852  for (;; ptr++)
3852        {        {
3853        if (previous > orig_code)        if (previous > orig_code)
3854          {          {
3855          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3856          code -= previous - orig_code;          code -= previous - orig_code;
3857          previous = orig_code;          previous = orig_code;
3858          }          }
# Line 3062  for (;; ptr++) Line 3868  for (;; ptr++)
3868    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3869    reference list. */    reference list. */
3870    
3871    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3872               WORK_SIZE_SAFETY_MARGIN)
3873      {      {
3874      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3875      goto FAILED;      goto FAILED;
# Line 3110  for (;; ptr++) Line 3917  for (;; ptr++)
3917      previous_callout = NULL;      previous_callout = NULL;
3918      }      }
3919    
3920    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3921    
3922    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3923      {      {
3924      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3925      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3926        {        {
3927        while (*(++ptr) != 0)        ptr++;
3928          while (*ptr != 0)
3929          {          {
3930          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3931            ptr++;
3932    #ifdef SUPPORT_UTF
3933            if (utf) FORWARDCHAR(ptr);
3934    #endif
3935          }          }
3936        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3937    
# Line 3142  for (;; ptr++) Line 3954  for (;; ptr++)
3954      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3955      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3956      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
3957      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3958      *reqbyteptr = reqbyte;      *firstcharflagsptr = firstcharflags;
3959        *reqcharptr = reqchar;
3960        *reqcharflagsptr = reqcharflags;
3961      *codeptr = code;      *codeptr = code;
3962      *ptrptr = ptr;      *ptrptr = ptr;
3963      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 3164  for (;; ptr++) Line 3978  for (;; ptr++)
3978      the setting of any following char as a first character. */      the setting of any following char as a first character. */
3979    
3980      case CHAR_CIRCUMFLEX_ACCENT:      case CHAR_CIRCUMFLEX_ACCENT:
3981        previous = NULL;
3982      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3983        {        {
3984        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
3985          *code++ = OP_CIRCM;
3986        }        }
3987      previous = NULL;      else *code++ = OP_CIRC;
     *code++ = OP_CIRC;  
3988      break;      break;
3989    
3990      case CHAR_DOLLAR_SIGN:      case CHAR_DOLLAR_SIGN:
3991      previous = NULL;      previous = NULL;
3992      *code++ = OP_DOLL;      *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3993      break;      break;
3994    
3995      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3996      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3997    
3998      case CHAR_DOT:      case CHAR_DOT:
3999      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4000      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
4001      zeroreqbyte = reqbyte;      zerofirstcharflags = firstcharflags;
4002        zeroreqchar = reqchar;
4003        zeroreqcharflags = reqcharflags;
4004      previous = code;      previous = code;
4005      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4006      break;      break;
# Line 3238  for (;; ptr++) Line 4055  for (;; ptr++)
4055          {          {
4056          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
4057            ptr++;            ptr++;
4058          else if (strncmp((const char *)ptr+1,          else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
                           STR_Q STR_BACKSLASH STR_E, 3) == 0)  
4059            ptr += 3;            ptr += 3;
4060          else          else
4061            break;            break;
# Line 3258  for (;; ptr++) Line 4074  for (;; ptr++)
4074          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4075        {        {
4076        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
4077        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4078        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
4079          zerofirstcharflags = firstcharflags;
4080        break;        break;
4081        }        }
4082    
# Line 3269  for (;; ptr++) Line 4086  for (;; ptr++)
4086    
4087      should_flip_negation = FALSE;      should_flip_negation = FALSE;
4088    
4089      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class:
4090      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero if the class contains at least one <
4091      valued UTF-8 characters, we don't yet do any optimization. */      256 character; class_one_char will be 1 if the class contains just one
4092        character. */
4093    
4094      class_charcount = 0;      class_has_8bitchar = 0;
4095      class_lastchar = -1;      class_one_char = 0;
4096    
4097      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
4098      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains fewer than two
4099      than 256), because in that case the compiled code doesn't use the bit map.      8-bit characters because in that case the compiled code doesn't use the bit
4100      */      map. */
4101    
4102      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
4103    
4104  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4105      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;
4106      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4107      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* Save the start */
4108  #endif  #endif
4109    
4110      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3295  for (;; ptr++) Line 4113  for (;; ptr++)
4113    
4114      if (c != 0) do      if (c != 0) do
4115        {        {
4116        const uschar *oldptr;        const pcre_uchar *oldptr;
4117    
4118  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4119        if (utf8 && c > 127)        if (utf && HAS_EXTRALEN(c))
4120          {                           /* Braces are required because the */          {                           /* Braces are required because the */
4121          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4122          }          }
4123    #endif
4124    
4125        /* In the pre-compile phase, accumulate the length of any UTF-8 extra  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4126          /* In the pre-compile phase, accumulate the length of any extra
4127        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
4128        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
4129        (which is on the stack). */        (which is on the stack). We have to remember that there was XCLASS data,
4130          however. */
4131        if (lengthptr != NULL)  
4132          {        if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4133          *lengthptr += class_utf8data - class_utf8data_base;          {
4134          class_utf8data = class_utf8data_base;          xclass = TRUE;
4135            *lengthptr += class_uchardata - class_uchardata_base;
4136            class_uchardata = class_uchardata_base;
4137          }          }
   
4138  #endif  #endif
4139    
4140        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 3341  for (;; ptr++) Line 4162  for (;; ptr++)
4162          {          {
4163          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
4164          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
4165          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
4166          uschar pbits[32];          pcre_uint8 pbits[32];
4167    
4168          if (ptr[1] != CHAR_COLON)          if (ptr[1] != CHAR_COLON)
4169            {            {
# Line 3369  for (;; ptr++) Line 4190  for (;; ptr++)
4190          alpha. This relies on the fact that the class table starts with          alpha. This relies on the fact that the class table starts with
4191          alpha, lower, upper as the first 3 entries. */          alpha, lower, upper as the first 3 entries. */
4192    
4193          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4194            posix_class = 0;            posix_class = 0;
4195    
4196          /* When PCRE_UCP is set, some of the POSIX classes are converted to          /* When PCRE_UCP is set, some of the POSIX classes are converted to
# Line 3397  for (;; ptr++) Line 4218  for (;; ptr++)
4218          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
4219    
4220          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
4221            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
4222    
4223          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
4224    
# Line 3412  for (;; ptr++) Line 4233  for (;; ptr++)
4233              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4234            }            }
4235    
4236          /* Not see if we need to remove any special characters. An option          /* Now see if we need to remove any special characters. An option
4237          value of 1 removes vertical space and 2 removes underscore. */          value of 1 removes vertical space and 2 removes underscore. */
4238    
4239          if (tabopt < 0) tabopt = -tabopt;          if (tabopt < 0) tabopt = -tabopt;
# Line 3428  for (;; ptr++) Line 4249  for (;; ptr++)
4249            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4250    
4251          ptr = tempptr + 1;          ptr = tempptr + 1;
4252          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
4253            class_has_8bitchar = 1;
4254            /* Every class contains at least two characters. */
4255            class_one_char = 2;
4256          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
4257          }          }
4258    
4259        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
4260        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
4261        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
4262        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
4263        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_one_char bigger
4264        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
4265          as literal characters (by default), or are faulted if
4266        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
4267    
4268        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
4269          {          {
4270          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);          escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, TRUE);
4271    
4272          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
4273    
4274          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (escape == 0)
4275          else if (-c == ESC_Q)            /* Handle start of quoted string */            c = ec;
4276            else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4277            else if (escape == ESC_N)            /* \N is not supported in a class */
4278              {
4279              *errorcodeptr = ERR71;
4280              goto FAILED;
4281              }
4282            else if (escape == ESC_Q)            /* Handle start of quoted string */
4283            {            {
4284            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4285              {              {
# Line 3455  for (;; ptr++) Line 4288  for (;; ptr++)
4288            else inescq = TRUE;            else inescq = TRUE;
4289            continue;            continue;
4290            }            }
4291          else if (-c == ESC_E) continue;  /* Ignore orphan \E */          else if (escape == ESC_E) continue;  /* Ignore orphan \E */
4292    
4293          if (c < 0)          else
4294            {            {
4295            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
4296            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
4297              class_has_8bitchar++;
4298              /* Every class contains at least two characters. */
4299              class_one_char += 2;
4300    
4301            switch (-c)            switch (escape)
4302              {              {
4303  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4304              case ESC_du:     /* These are the values given for \d etc */              case ESC_du:     /* These are the values given for \d etc */
# Line 3472  for (;; ptr++) Line 4308  for (;; ptr++)
4308              case ESC_su:     /* of the default ASCII testing. */              case ESC_su:     /* of the default ASCII testing. */
4309              case ESC_SU:              case ESC_SU:
4310              nestptr = ptr;              nestptr = ptr;
4311              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
4312              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
4313              continue;              continue;
4314  #endif  #endif
4315              case ESC_d:              case ESC_d:
# Line 3494  for (;; ptr++) Line 4330  for (;; ptr++)
4330              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4331              continue;              continue;
4332    
4333                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4334                if it was previously set by something earlier in the character
4335                class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and
4336                EBCDIC, so we lazily just adjust the appropriate bit. */
4337    
4338              case ESC_s:              case ESC_s:
4339              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
4340              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
4341                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4342              continue;              continue;
4343    
4344              case ESC_S:              case ESC_S:
# Line 3504  for (;; ptr++) Line 4346  for (;; ptr++)
4346              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4347              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
4348              continue;              continue;
4349    
4350                /* The rest apply in both UCP and non-UCP cases. */
4351    
4352              case ESC_h:              case ESC_h:
4353              SETBIT(classbits, 0x09); /* VT */              (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4354              SETBIT(classbits, 0x20); /* SPACE */                PRIV(hspace_list), NOTACHAR);
             SETBIT(classbits, 0xa0); /* NSBP */  
 #ifdef SUPPORT_UTF8  
             if (utf8)  
               {  
               class_utf8 = TRUE;  
               *class_utf8data++ = XCL_SINGLE;  
               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);  
               *class_utf8data++ = XCL_SINGLE;  
               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);  
               *class_utf8data++ = XCL_SINGLE;  
               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);  
               *class_utf8data++ = XCL_SINGLE;  
               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);  
               *class_utf8data++ = XCL_SINGLE;  
               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);  
               }  
 #endif  
4355              continue;              continue;
4356    
4357              case ESC_H:              case ESC_H:
4358              for (c = 0; c < 32; c++)              (void)add_not_list_to_class(classbits, &class_uchardata, options,
4359                {                cd, PRIV(hspace_list));
               int x = 0xff;  
               switch (c)  
                 {  
                 case 0x09/8: x ^= 1 << (0x09%8); break;  
                 case 0x20/8: x ^= 1 << (0x20%8); break;  
                 case 0xa0/8: x ^= 1 << (0xa0%8); break;  
                 default: break;  
                 }  
               classbits[c] |= x;  
               }  
   
 #ifdef SUPPORT_UTF8  
             if (utf8)  
               {  
               class_utf8 = TRUE;  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);  
               }  
 #endif  
4360              continue;              continue;
4361    
4362              case ESC_v:              case ESC_v:
4363              SETBIT(classbits, 0x0a); /* LF */              (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4364              SETBIT(classbits, 0x0b); /* VT */                PRIV(vspace_list), NOTACHAR);
             SETBIT(classbits, 0x0c); /* FF */  
             SETBIT(classbits, 0x0d); /* CR */  
             SETBIT(classbits, 0x85); /* NEL */  
 #ifdef SUPPORT_UTF8  
             if (utf8)  
               {  
               class_utf8 = TRUE;  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);  
               }  
 #endif  
4365              continue;              continue;
4366    
4367              case ESC_V:              case ESC_V:
4368              for (c = 0; c < 32; c++)              (void)add_not_list_to_class(classbits, &class_uchardata, options,
4369                {                cd, PRIV(vspace_list));
               int x = 0xff;  
               switch (c)  
                 {  
                 case 0x0a/8: x ^= 1 << (0x0a%8);  
                              x ^= 1 << (0x0b%8);  
                              x ^= 1 << (0x0c%8);  
                              x ^= 1 << (0x0d%8);  
                              break;  
                 case 0x85/8: x ^= 1 << (0x85%8); break;  
                 default: break;  
                 }  
               classbits[c] |= x;  
               }  
   
 #ifdef SUPPORT_UTF8  
             if (utf8)  
               {  
               class_utf8 = TRUE;  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);  
               *class_utf8data++ = XCL_RANGE;  
               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);  
               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);  
               }  
 #endif  
4370              continue;              continue;
4371    
4372  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
# Line 3629  for (;; ptr++) Line 4377  for (;; ptr++)
4377                int pdata;                int pdata;
4378                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4379                if (ptype < 0) goto FAILED;                if (ptype < 0) goto FAILED;
4380                class_utf8 = TRUE;                *class_uchardata++ = ((escape == ESC_p) != negated)?
               *class_utf8data++ = ((-c == ESC_p) != negated)?  
4381                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4382                *class_utf8data++ = ptype;                *class_uchardata++ = ptype;
4383                *class_utf8data++ = pdata;                *class_uchardata++ = pdata;
4384                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4385                continue;                continue;
4386                }                }
4387  #endif  #endif
# Line 3648  for (;; ptr++) Line 4395  for (;; ptr++)
4395                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4396                goto FAILED;                goto FAILED;
4397                }                }
4398              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4399              c = *ptr;              /* Get the final character and fall through */              class_one_char -= 2;     /* Undo the speculative increase. */
4400                c = *ptr;                /* Get the final character and fall through */
4401              break;              break;
4402              }              }
4403            }            }
4404    
4405          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if the escape just defined a single character (c >= 0).
4406          greater than 256 in UTF-8 mode. */          This may be greater than 256. */
4407    
4408            escape = 0;
4409    
4410          }   /* End of backslash handling */          }   /* End of backslash handling */
4411    
4412        /* A single character may be followed by '-' to form a range. However,        /* A character may be followed by '-' to form a range. However, Perl does
4413        Perl does not permit ']' to be the end of the range. A '-' character        not permit ']' to be the end of the range. A '-' character at the end is
4414        at the end is treated as a literal. Perl ignores orphaned \E sequences        treated as a literal. Perl ignores orphaned \E sequences entirely. The
4415        entirely. The code for handling \Q and \E is messy. */        code for handling \Q and \E is messy. */
4416    
4417        CHECK_RANGE:        CHECK_RANGE:
4418        while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)        while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3670  for (;; ptr++) Line 4420  for (;; ptr++)
4420          inescq = FALSE;          inescq = FALSE;
4421          ptr += 2;          ptr += 2;
4422          }          }
   
4423        oldptr = ptr;        oldptr = ptr;
4424    
4425        /* Remember \r or \n */        /* Remember if \r or \n were explicitly used */
4426    
4427        if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;        if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4428    
# Line 3681  for (;; ptr++) Line 4430  for (;; ptr++)
4430    
4431        if (!inescq && ptr[1] == CHAR_MINUS)        if (!inescq && ptr[1] == CHAR_MINUS)
4432          {          {
4433          int d;          pcre_uint32 d;
4434          ptr += 2;          ptr += 2;
4435          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4436    
# Line 3696  for (;; ptr++) Line 4445  for (;; ptr++)
4445            inescq = TRUE;            inescq = TRUE;
4446            break;            break;
4447            }            }
4448    
4449            /* Minus (hyphen) at the end of a class is treated as a literal, so put
4450            back the pointer and jump to handle the character that preceded it. */
4451    
4452          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4453            {            {
4454            ptr = oldptr;            ptr = oldptr;
4455            goto LONE_SINGLE_CHARACTER;            goto CLASS_SINGLE_CHARACTER;
4456            }            }
4457    
4458            /* Otherwise, we have a potential range; pick up the next character */
4459    
4460  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4461          if (utf8)          if (utf)
4462            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4463            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4464            }            }
# Line 3718  for (;; ptr++) Line 4472  for (;; ptr++)
4472    
4473          if (!inescq && d == CHAR_BACKSLASH)          if (!inescq && d == CHAR_BACKSLASH)
4474            {            {
4475            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);            int descape;
4476              descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
4477            if (*errorcodeptr != 0) goto FAILED;            if (*errorcodeptr != 0) goto FAILED;
4478    
4479            /* \b is backspace; any other special means the '-' was literal */            /* \b is backspace; any other special means the '-' was literal. */
4480    
4481            if (d < 0)            if (descape != 0)
4482              {              {
4483              if (d == -ESC_b) d = CHAR_BS; else              if (descape == ESC_b) d = CHAR_BS; else
4484                {                {
4485                ptr = oldptr;                ptr = oldptr;
4486                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
4487                }                }
4488              }              }
4489            }            }
4490    
4491          /* Check that the two values are in the correct order. Optimize          /* Check that the two values are in the correct order. Optimize
4492          one-character ranges */          one-character ranges. */
4493    
4494          if (d < c)          if (d < c)
4495            {            {
4496            *errorcodeptr = ERR8;            *errorcodeptr = ERR8;
4497            goto FAILED;            goto FAILED;
4498            }            }
4499            if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
4500    
4501          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          /* We have found a character range, so single character optimizations
4502            cannot be done anymore. Any value greater than 1 indicates that there
4503            is more than one character. */
4504    
4505            class_one_char = 2;
4506    
4507          /* Remember \r or \n */          /* Remember an explicit \r or \n, and add the range to the class. */
4508    
4509          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4510    
4511          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          class_has_8bitchar +=
4512          matching, we have to use an XCLASS with extra data items. Caseless            add_to_class(classbits, &class_uchardata, options, cd, c, d);
4513          matching for characters > 127 is available only if UCP support is  
         available. */  
   
 #ifdef SUPPORT_UTF8  
         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))  
           {  
           class_utf8 = TRUE;  
   
           /* With UCP support, we can find the other case equivalents of  
           the relevant characters. There may be several ranges. Optimize how  
           they fit with the basic range. */  
   
 #ifdef SUPPORT_UCP  
           if ((options & PCRE_CASELESS) != 0)  
             {  
             unsigned int occ, ocd;  
             unsigned int cc = c;  
             unsigned int origd = d;  
             while (get_othercase_range(&cc, origd, &occ, &ocd))  
               {  
               if (occ >= (unsigned int)c &&  
                   ocd <= (unsigned int)d)  
      &n