/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 533 by ph10, Wed Jun 2 19:02:41 2010 UTC | revision 836 by ph10, Wed Dec 28 17:16:11 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
# Line 88  so this number is very generous. Line 92  so this number is very generous.
92  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
93  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
94  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
96    filled up by repetitions of forward references, for example patterns like
97    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98    that the workspace is expanded using malloc() in this situation. The value
99    below is therefore a minimum, and we put a maximum on it for safety. The
100    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101    kicks in at the same number of forward references in all cases. */
102    
103  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
104    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
105    
106  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
107  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
108    
109  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
110    
111    /* Private flags added to firstchar and reqchar. */
112    
113    #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
114    #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
115    
116    /* Repeated character flags. */
117    
118    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
119    
120  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
121  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 231  static const char posix_names[] = Line 250  static const char posix_names[] =
250    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
251    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
252    
253  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
254    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
255    
256  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 261  static const int posix_class_maps[] = { Line 280  static const int posix_class_maps[] = {
280    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
281  };  };
282    
283  /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class  /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
284  substitutes must be in the order of the names, defined above, and there are  substitutes must be in the order of the names, defined above, and there are
285  both positive and negative cases. NULL means no substitute. */  both positive and negative cases. NULL means no substitute. */
286    
287  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
288  static const uschar *substitutes[] = {  static const pcre_uchar string_PNd[]  = {
289    (uschar *)"\\P{Nd}",    /* \D */    CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
290    (uschar *)"\\p{Nd}",    /* \d */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
291    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */  static const pcre_uchar string_pNd[]  = {
292    (uschar *)"\\p{Xsp}",   /* \s */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
293    (uschar *)"\\P{Xwd}",   /* \W */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294    (uschar *)"\\p{Xwd}"    /* \w */  static const pcre_uchar string_PXsp[] = {
295      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
296      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297    static const pcre_uchar string_pXsp[] = {
298      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300    static const pcre_uchar string_PXwd[] = {
301      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
302      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303    static const pcre_uchar string_pXwd[] = {
304      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    
307    static const pcre_uchar *substitutes[] = {
308      string_PNd,           /* \D */
309      string_pNd,           /* \d */
310      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
311      string_pXsp,          /* \s */
312      string_PXwd,          /* \W */
313      string_pXwd           /* \w */
314  };  };
315    
316  static const uschar *posix_substitutes[] = {  static const pcre_uchar string_pL[] =   {
317    (uschar *)"\\p{L}",     /* alpha */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
318    (uschar *)"\\p{Ll}",    /* lower */    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319    (uschar *)"\\p{Lu}",    /* upper */  static const pcre_uchar string_pLl[] =  {
320    (uschar *)"\\p{Xan}",   /* alnum */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321    NULL,                   /* ascii */    CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322    (uschar *)"\\h",        /* blank */  static const pcre_uchar string_pLu[] =  {
323    NULL,                   /* cntrl */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324    (uschar *)"\\p{Nd}",    /* digit */    CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325    NULL,                   /* graph */  static const pcre_uchar string_pXan[] = {
326    NULL,                   /* print */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327    NULL,                   /* punct */    CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */  static const pcre_uchar string_h[] =    {
329    (uschar *)"\\p{Xwd}",   /* word */    CHAR_BACKSLASH, CHAR_h, '\0' };
330    NULL,                   /* xdigit */  static const pcre_uchar string_pXps[] = {
331      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333    static const pcre_uchar string_PL[] =   {
334      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
335      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336    static const pcre_uchar string_PLl[] =  {
337      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339    static const pcre_uchar string_PLu[] =  {
340      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342    static const pcre_uchar string_PXan[] = {
343      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345    static const pcre_uchar string_H[] =    {
346      CHAR_BACKSLASH, CHAR_H, '\0' };
347    static const pcre_uchar string_PXps[] = {
348      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
349      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350    
351    static const pcre_uchar *posix_substitutes[] = {
352      string_pL,            /* alpha */
353      string_pLl,           /* lower */
354      string_pLu,           /* upper */
355      string_pXan,          /* alnum */
356      NULL,                 /* ascii */
357      string_h,             /* blank */
358      NULL,                 /* cntrl */
359      string_pNd,           /* digit */
360      NULL,                 /* graph */
361      NULL,                 /* print */
362      NULL,                 /* punct */
363      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
364      string_pXwd,          /* word */
365      NULL,                 /* xdigit */
366    /* Negated cases */    /* Negated cases */
367    (uschar *)"\\P{L}",     /* ^alpha */    string_PL,            /* ^alpha */
368    (uschar *)"\\P{Ll}",    /* ^lower */    string_PLl,           /* ^lower */
369    (uschar *)"\\P{Lu}",    /* ^upper */    string_PLu,           /* ^upper */
370    (uschar *)"\\P{Xan}",   /* ^alnum */    string_PXan,          /* ^alnum */
371    NULL,                   /* ^ascii */    NULL,                 /* ^ascii */
372    (uschar *)"\\H",        /* ^blank */    string_H,             /* ^blank */
373    NULL,                   /* ^cntrl */    NULL,                 /* ^cntrl */
374    (uschar *)"\\P{Nd}",    /* ^digit */    string_PNd,           /* ^digit */
375    NULL,                   /* ^graph */    NULL,                 /* ^graph */
376    NULL,                   /* ^print */    NULL,                 /* ^print */
377    NULL,                   /* ^punct */    NULL,                 /* ^punct */
378    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
379    (uschar *)"\\P{Xwd}",   /* ^word */    string_PXwd,          /* ^word */
380    NULL                    /* ^xdigit */    NULL                  /* ^xdigit */
381  };  };
382  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
383  #endif  #endif
384    
385  #define STRING(a)  # a  #define STRING(a)  # a
386  #define XSTRING(s) STRING(s)  #define XSTRING(s) STRING(s)
# Line 393  static const char error_texts[] = Line 466  static const char error_texts[] =
466    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
467    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
468    /* 55 */    /* 55 */
469    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
470    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
471    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
472    "a numbered reference must not be zero\0"    "a numbered reference must not be zero\0"
# Line 407  static const char error_texts[] = Line 480  static const char error_texts[] =
480    /* 65 */    /* 65 */
481    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
482    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
483    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with PCRE_UCP support\0"
484      "\\c must be followed by an ASCII character\0"
485      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486      /* 70 */
487      "internal error: unknown opcode in find_fixedlength()\0"
488      "\\N is not supported in a class\0"
489      "too many forward references\0"
490      "disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff)\0"
491    ;    ;
492    
493  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 426  For convenience, we use the same bit def Line 506  For convenience, we use the same bit def
506    
507  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
508    
509    /* Using a simple comparison for decimal numbers rather than a memory read
510    is much faster, and the resulting code is simpler (the compiler turns it
511    into a subtraction and unsigned comparison). */
512    
513    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
514    
515  #ifndef EBCDIC  #ifndef EBCDIC
516    
517  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
518  UTF-8 mode. */  UTF-8 mode. */
519    
520  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
521    {    {
522    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
523    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 470  static const unsigned char digitab[] = Line 556  static const unsigned char digitab[] =
556    
557  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
558    
559  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
560    {    {
561    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
562    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 505  static const unsigned char digitab[] = Line 591  static const unsigned char digitab[] =
591    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
592    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
593    
594  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
595    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
596    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
597    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 544  static const unsigned char ebcdic_charta Line 630  static const unsigned char ebcdic_charta
630  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
631    
632  static BOOL  static BOOL
633    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
634      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
635    
636    
# Line 576  return s; Line 662  return s;
662    
663    
664  /*************************************************  /*************************************************
665    *           Expand the workspace                 *
666    *************************************************/
667    
668    /* This function is called during the second compiling phase, if the number of
669    forward references fills the existing workspace, which is originally a block on
670    the stack. A larger block is obtained from malloc() unless the ultimate limit
671    has been reached or the increase will be rather small.
672    
673    Argument: pointer to the compile data block
674    Returns:  0 if all went well, else an error number
675    */
676    
677    static int
678    expand_workspace(compile_data *cd)
679    {
680    pcre_uchar *newspace;
681    int newsize = cd->workspace_size * 2;
682    
683    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
684    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
685        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
686     return ERR72;
687    
688    newspace = (PUBL(malloc))(IN_UCHARS(newsize));
689    if (newspace == NULL) return ERR21;
690    memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
691    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
692    if (cd->workspace_size > COMPILE_WORK_SIZE)
693      (PUBL(free))((void *)cd->start_workspace);
694    cd->start_workspace = newspace;
695    cd->workspace_size = newsize;
696    return 0;
697    }
698    
699    
700    
701    /*************************************************
702    *            Check for counted repeat            *
703    *************************************************/
704    
705    /* This function is called when a '{' is encountered in a place where it might
706    start a quantifier. It looks ahead to see if it really is a quantifier or not.
707    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
708    where the ddds are digits.
709    
710    Arguments:
711      p         pointer to the first char after '{'
712    
713    Returns:    TRUE or FALSE
714    */
715    
716    static BOOL
717    is_counted_repeat(const pcre_uchar *p)
718    {
719    if (!IS_DIGIT(*p)) return FALSE;
720    p++;
721    while (IS_DIGIT(*p)) p++;
722    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
723    
724    if (*p++ != CHAR_COMMA) return FALSE;
725    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
726    
727    if (!IS_DIGIT(*p)) return FALSE;
728    p++;
729    while (IS_DIGIT(*p)) p++;
730    
731    return (*p == CHAR_RIGHT_CURLY_BRACKET);
732    }
733    
734    
735    
736    /*************************************************
737  *            Handle escapes                      *  *            Handle escapes                      *
738  *************************************************/  *************************************************/
739    
# Line 600  Returns:         zero or positive => a d Line 758  Returns:         zero or positive => a d
758  */  */
759    
760  static int  static int
761  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
762    int options, BOOL isclass)    int options, BOOL isclass)
763  {  {
764  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
765  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
766  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
767    pcre_int32 c;
768    int i;
769    
770  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
771  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
# Line 619  in a table. A non-zero result is somethi Line 779  in a table. A non-zero result is somethi
779  Otherwise further processing may be required. */  Otherwise further processing may be required. */
780    
781  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
782  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
783    else if (c < CHAR_0 || c > CHAR_z) {}
784  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
785    
786  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
787  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
788    else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
789  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
790  #endif  #endif
791    
# Line 631  else if ((i = escapes[c - 0x48]) != 0) Line 793  else if ((i = escapes[c - 0x48]) != 0)
793    
794  else  else
795    {    {
796    const uschar *oldptr;    const pcre_uchar *oldptr;
797    BOOL braced, negated;    BOOL braced, negated;
798    
799    switch (c)    switch (c)
# Line 641  else Line 803  else
803    
804      case CHAR_l:      case CHAR_l:
805      case CHAR_L:      case CHAR_L:
806        *errorcodeptr = ERR37;
807        break;
808    
809      case CHAR_u:      case CHAR_u:
810        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
811          {
812          /* In JavaScript, \u must be followed by four hexadecimal numbers.
813          Otherwise it is a lowercase u letter. */
814          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
815            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
816            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
817            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
818            {
819            c = 0;
820            for (i = 0; i < 4; ++i)
821              {
822              register int cc = *(++ptr);
823    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
824              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
825              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
826    #else           /* EBCDIC coding */
827              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
828              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
829    #endif
830              }
831            }
832          }
833        else
834          *errorcodeptr = ERR37;
835        break;
836    
837      case CHAR_U:      case CHAR_U:
838      *errorcodeptr = ERR37;      /* In JavaScript, \U is an uppercase U letter. */
839        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
840      break;      break;
841    
842      /* \g must be followed by one of a number of specific things:      /* In a character class, \g is just a literal "g". Outside a character
843        class, \g must be followed by one of a number of specific things:
844    
845      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
846      backreference. If negative, it is a relative backreference. This is a Perl      backreference. If negative, it is a relative backreference. This is a Perl
# Line 663  else Line 857  else
857      the -ESC_g code (cf \k). */      the -ESC_g code (cf \k). */
858    
859      case CHAR_g:      case CHAR_g:
860        if (isclass) break;
861      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
862        {        {
863        c = -ESC_g;        c = -ESC_g;
# Line 673  else Line 868  else
868    
869      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
870        {        {
871        const uschar *p;        const pcre_uchar *p;
872        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
873          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
874        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
875          {          {
876          c = -ESC_k;          c = -ESC_k;
# Line 693  else Line 888  else
888        }        }
889      else negated = FALSE;      else negated = FALSE;
890    
891        /* The integer range is limited by the machine's int representation. */
892      c = 0;      c = 0;
893      while ((digitab[ptr[1]] & ctype_digit) != 0)      while (IS_DIGIT(ptr[1]))
894          {
895          if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
896            {
897            c = -1;
898            break;
899            }
900        c = c * 10 + *(++ptr) - CHAR_0;        c = c * 10 + *(++ptr) - CHAR_0;
901          }
902      if (c < 0)   /* Integer overflow */      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
903        {        {
904          while (IS_DIGIT(ptr[1]))
905            ptr++;
906        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
907        break;        break;
908        }        }
# Line 746  else Line 950  else
950      if (!isclass)      if (!isclass)
951        {        {
952        oldptr = ptr;        oldptr = ptr;
953          /* The integer range is limited by the machine's int representation. */
954        c -= CHAR_0;        c -= CHAR_0;
955        while ((digitab[ptr[1]] & ctype_digit) != 0)        while (IS_DIGIT(ptr[1]))
956            {
957            if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
958              {
959              c = -1;
960              break;
961              }
962          c = c * 10 + *(++ptr) - CHAR_0;          c = c * 10 + *(++ptr) - CHAR_0;
963        if (c < 0)    /* Integer overflow */          }
964          if (((unsigned int)c) > INT_MAX) /* Integer overflow */
965          {          {
966            while (IS_DIGIT(ptr[1]))
967              ptr++;
968          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
969          break;          break;
970          }          }
# Line 783  else Line 997  else
997      c -= CHAR_0;      c -= CHAR_0;
998      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
999          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
1000      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf && c > 0xff) *errorcodeptr = ERR51;
1001      break;      break;
1002    
1003      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
1004      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1005      treated as a data character. */      If not, { is treated as a data character. */
1006    
1007      case CHAR_x:      case CHAR_x:
1008        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1009          {
1010          /* In JavaScript, \x must be followed by two hexadecimal numbers.
1011          Otherwise it is a lowercase x letter. */
1012          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1013            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1014            {
1015            c = 0;
1016            for (i = 0; i < 2; ++i)
1017              {
1018              register int cc = *(++ptr);
1019    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1020              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1021              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1022    #else           /* EBCDIC coding */
1023              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1024              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1025    #endif
1026              }
1027            }
1028          break;
1029          }
1030    
1031      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1032        {        {
1033        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
       int count = 0;  
1034    
1035        c = 0;        c = 0;
1036        while ((digitab[*pt] & ctype_xdigit) != 0)        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1037          {          {
1038          register int cc = *pt++;          register int cc = *pt++;
1039          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
         count++;  
1040    
1041  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1042          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
# Line 810  else Line 1045  else
1045          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1046          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1047  #endif  #endif
1048    
1049    #ifdef COMPILE_PCRE8
1050            if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1051    #else
1052    #ifdef COMPILE_PCRE16
1053            if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1054    #endif
1055    #endif
1056            }
1057    
1058          if (c < 0)
1059            {
1060            while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1061            *errorcodeptr = ERR34;
1062          }          }
1063    
1064        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1065          {          {
1066          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1067          ptr = pt;          ptr = pt;
1068          break;          break;
1069          }          }
# Line 826  else Line 1075  else
1075      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1076    
1077      c = 0;      c = 0;
1078      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1079        {        {
1080        int cc;                                  /* Some compilers don't like */        int cc;                                  /* Some compilers don't like */
1081        cc = *(++ptr);                           /* ++ in initializers */        cc = *(++ptr);                           /* ++ in initializers */
# Line 841  else Line 1090  else
1090      break;      break;
1091    
1092      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1093      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
1094        coding is ASCII-specific, but then the whole concept of \cx is
1095      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1096    
1097      case CHAR_c:      case CHAR_c:
# Line 851  else Line 1101  else
1101        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
1102        break;        break;
1103        }        }
1104    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1105  #ifndef EBCDIC  /* ASCII/UTF-8 coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
1106          {
1107          *errorcodeptr = ERR68;
1108          break;
1109          }
1110      if (c >= CHAR_a && c <= CHAR_z) c -= 32;      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1111      c ^= 0x40;      c ^= 0x40;
1112  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
1113      if (c >= CHAR_a && c <= CHAR_z) c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1114      c ^= 0xC0;      c ^= 0xC0;
1115  #endif  #endif
# Line 879  else Line 1133  else
1133    }    }
1134    
1135  /* Perl supports \N{name} for character names, as well as plain \N for "not  /* Perl supports \N{name} for character names, as well as plain \N for "not
1136  newline". PCRE does not support \N{name}. */  newline". PCRE does not support \N{name}. However, it does support
1137    quantification such as \N{2,3}. */
1138    
1139  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1140         !is_counted_repeat(ptr+2))
1141    *errorcodeptr = ERR37;    *errorcodeptr = ERR37;
1142    
1143  /* If PCRE_UCP is set, we change the values for \d etc. */  /* If PCRE_UCP is set, we change the values for \d etc. */
# Line 917  Returns:         type value from ucp_typ Line 1173  Returns:         type value from ucp_typ
1173  */  */
1174    
1175  static int  static int
1176  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1177  {  {
1178  int c, i, bot, top;  int c, i, bot, top;
1179  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1180  char name[32];  pcre_uchar name[32];
1181    
1182  c = *(++ptr);  c = *(++ptr);
1183  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 938  if (c == CHAR_LEFT_CURLY_BRACKET) Line 1194  if (c == CHAR_LEFT_CURLY_BRACKET)
1194      *negptr = TRUE;      *negptr = TRUE;
1195      ptr++;      ptr++;
1196      }      }
1197    for (i = 0; i < (int)sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1198      {      {
1199      c = *(++ptr);      c = *(++ptr);
1200      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 962  else Line 1218  else
1218  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1219    
1220  bot = 0;  bot = 0;
1221  top = _pcre_utt_size;  top = PRIV(utt_size);
1222    
1223  while (bot < top)  while (bot < top)
1224    {    {
1225    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1226    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1227    if (c == 0)    if (c == 0)
1228      {      {
1229      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1230      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1231      }      }
1232    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
1233    }    }
# Line 991  return -1; Line 1247  return -1;
1247    
1248    
1249  /*************************************************  /*************************************************
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if (*p++ != CHAR_COMMA) return FALSE;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
   
 return (*p == CHAR_RIGHT_CURLY_BRACKET);  
 }  
   
   
   
 /*************************************************  
1250  *         Read repeat counts                     *  *         Read repeat counts                     *
1251  *************************************************/  *************************************************/
1252    
# Line 1042  Returns:         pointer to '}' on succe Line 1265  Returns:         pointer to '}' on succe
1265                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1266  */  */
1267    
1268  static const uschar *  static const pcre_uchar *
1269  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1270  {  {
1271  int min = 0;  int min = 0;
1272  int max = -1;  int max = -1;
# Line 1051  int max = -1; Line 1274  int max = -1;
1274  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1275  an integer overflow. */  an integer overflow. */
1276    
1277  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1278  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1279    {    {
1280    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 1066  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1289  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1289    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1290      {      {
1291      max = 0;      max = 0;
1292      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1293      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1294        {        {
1295        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 1099  top-level call starts at the beginning o Line 1322  top-level call starts at the beginning o
1322  start at a parenthesis. It scans along a pattern's text looking for capturing  start at a parenthesis. It scans along a pattern's text looking for capturing
1323  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1324  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1325  returns when it reaches a given numbered subpattern. We know that if (?P< is  returns when it reaches a given numbered subpattern. Recursion is used to keep
1326  encountered, the name will be terminated by '>' because that is checked in the  track of subpatterns that reset the capturing group numbers - the (?| feature.
1327  first pass. Recursion is used to keep track of subpatterns that reset the  
1328  capturing group numbers - the (?| feature.  This function was originally called only from the second pass, in which we know
1329    that if (?< or (?' or (?P< is encountered, the name will be correctly
1330    terminated because that is checked in the first pass. There is now one call to
1331    this function in the first pass, to check for a recursive back reference by
1332    name (so that we can make the whole group atomic). In this case, we need check
1333    only up to the current position in the pattern, and that is still OK because
1334    any previous occurrences will have been checked. To make this work, the test
1335    for "end of pattern" is a check against cd->end_pattern in the main loop,
1336    instead of looking for a binary zero. This means that the special first-pass
1337    call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1338    processing items within the loop are OK, because afterwards the main loop will
1339    terminate.)
1340    
1341  Arguments:  Arguments:
1342    ptrptr       address of the current character pointer (updated)    ptrptr       address of the current character pointer (updated)
# Line 1110  Arguments: Line 1344  Arguments:
1344    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1345    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1346    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1347      utf          TRUE if we are in UTF-8 / UTF-16 mode
1348    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1349    
1350  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1351  */  */
1352    
1353  static int  static int
1354  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1355    BOOL xmode, int *count)    BOOL xmode, BOOL utf, int *count)
1356  {  {
1357  uschar *ptr = *ptrptr;  pcre_uchar *ptr = *ptrptr;
1358  int start_count = *count;  int start_count = *count;
1359  int hwm_count = start_count;  int hwm_count = start_count;
1360  BOOL dup_parens = FALSE;  BOOL dup_parens = FALSE;
# Line 1129  dealing with. The very first call may no Line 1364  dealing with. The very first call may no
1364    
1365  if (ptr[0] == CHAR_LEFT_PARENTHESIS)  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1366    {    {
1367    if (ptr[1] == CHAR_QUESTION_MARK &&    /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1368        ptr[2] == CHAR_VERTICAL_LINE)  
1369      if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1370    
1371      /* Handle a normal, unnamed capturing parenthesis. */
1372    
1373      else if (ptr[1] != CHAR_QUESTION_MARK)
1374        {
1375        *count += 1;
1376        if (name == NULL && *count == lorn) return *count;
1377        ptr++;
1378        }
1379    
1380      /* All cases now have (? at the start. Remember when we are in a group
1381      where the parenthesis numbers are duplicated. */
1382    
1383      else if (ptr[2] == CHAR_VERTICAL_LINE)
1384      {      {
1385      ptr += 3;      ptr += 3;
1386      dup_parens = TRUE;      dup_parens = TRUE;
1387      }      }
1388    
1389    /* Handle a normal, unnamed capturing parenthesis */    /* Handle comments; all characters are allowed until a ket is reached. */
1390    
1391    else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)    else if (ptr[2] == CHAR_NUMBER_SIGN)
1392      {      {
1393      *count += 1;      for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1394      if (name == NULL && *count == lorn) return *count;      goto FAIL_EXIT;
     ptr++;  
1395      }      }
1396    
1397    /* Handle a condition. If it is an assertion, just carry on so that it    /* Handle a condition. If it is an assertion, just carry on so that it
1398    is processed as normal. If not, skip to the closing parenthesis of the    is processed as normal. If not, skip to the closing parenthesis of the
1399    condition (there can't be any nested parens. */    condition (there can't be any nested parens). */
1400    
1401    else if (ptr[2] == CHAR_LEFT_PARENTHESIS)    else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1402      {      {
# Line 1159  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1408  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1408        }        }
1409      }      }
1410    
1411    /* We have either (? or (* and not a condition */    /* Start with (? but not a condition. */
1412    
1413    else    else
1414      {      {
# Line 1172  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1421  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1421          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1422        {        {
1423        int term;        int term;
1424        const uschar *thisname;        const pcre_uchar *thisname;
1425        *count += 1;        *count += 1;
1426        if (name == NULL && *count == lorn) return *count;        if (name == NULL && *count == lorn) return *count;
1427        term = *ptr++;        term = *ptr++;
# Line 1180  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1429  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1429        thisname = ptr;        thisname = ptr;
1430        while (*ptr != term) ptr++;        while (*ptr != term) ptr++;
1431        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == ptr - thisname &&
1432            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            STRNCMP_UC_UC(name, thisname, lorn) == 0)
1433          return *count;          return *count;
1434        term++;        term++;
1435        }        }
# Line 1188  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1437  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1437    }    }
1438    
1439  /* Past any initial parenthesis handling, scan for parentheses or vertical  /* Past any initial parenthesis handling, scan for parentheses or vertical
1440  bars. */  bars. Stop if we get to cd->end_pattern. Note that this is important for the
1441    first-pass call when this value is temporarily adjusted to stop at the current
1442    position. So DO NOT change this to a test for binary zero. */
1443    
1444  for (; *ptr != 0; ptr++)  for (; ptr < cd->end_pattern; ptr++)
1445    {    {
1446    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1447    
# Line 1221  for (; *ptr != 0; ptr++) Line 1472  for (; *ptr != 0; ptr++)
1472          {          {
1473          if (ptr[2] == CHAR_E)          if (ptr[2] == CHAR_E)
1474            ptr+= 2;            ptr+= 2;
1475          else if (strncmp((const char *)ptr+2,          else if (STRNCMP_UC_C8(ptr + 2,
1476                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1477            ptr += 4;            ptr += 4;
1478          else          else
# Line 1264  for (; *ptr != 0; ptr++) Line 1515  for (; *ptr != 0; ptr++)
1515    
1516    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1517      {      {
1518      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      ptr++;
1519        while (*ptr != 0)
1520          {
1521          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1522          ptr++;
1523    #ifdef SUPPORT_UTF
1524          if (utf) FORWARDCHAR(ptr);
1525    #endif
1526          }
1527      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1528      continue;      continue;
1529      }      }
# Line 1273  for (; *ptr != 0; ptr++) Line 1532  for (; *ptr != 0; ptr++)
1532    
1533    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1534      {      {
1535      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1536      if (rc > 0) return rc;      if (rc > 0) return rc;
1537      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1538      }      }
# Line 1281  for (; *ptr != 0; ptr++) Line 1540  for (; *ptr != 0; ptr++)
1540    else if (*ptr == CHAR_RIGHT_PARENTHESIS)    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1541      {      {
1542      if (dup_parens && *count < hwm_count) *count = hwm_count;      if (dup_parens && *count < hwm_count) *count = hwm_count;
1543      *ptrptr = ptr;      goto FAIL_EXIT;
     return -1;  
1544      }      }
1545    
1546    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
# Line 1320  Arguments: Line 1578  Arguments:
1578    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1579    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1580    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1581      utf          TRUE if we are in UTF-8 / UTF-16 mode
1582    
1583  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1584  */  */
1585    
1586  static int  static int
1587  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)  find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1588      BOOL utf)
1589  {  {
1590  uschar *ptr = (uschar *)cd->start_pattern;  pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1591  int count = 0;  int count = 0;
1592  int rc;  int rc;
1593    
# Line 1338  matching closing parens. That is why we Line 1598  matching closing parens. That is why we
1598    
1599  for (;;)  for (;;)
1600    {    {
1601    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1602    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1603    }    }
1604    
# Line 1354  return rc; Line 1614  return rc;
1614    
1615  /* This is called by several functions that scan a compiled expression looking  /* This is called by several functions that scan a compiled expression looking
1616  for a fixed first character, or an anchoring op code etc. It skips over things  for a fixed first character, or an anchoring op code etc. It skips over things
1617  that do not influence this. For some calls, a change of option is important.  that do not influence this. For some calls, it makes sense to skip negative
1618  For some calls, it makes sense to skip negative forward and all backward  forward and all backward assertions, and also the \b assertion; for others it
1619  assertions, and also the \b assertion; for others it does not.  does not.
1620    
1621  Arguments:  Arguments:
1622    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1623    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1624    
1625  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1626  */  */
1627    
1628  static const uschar*  static const pcre_uchar*
1629  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const pcre_uchar *code, BOOL skipassert)
   BOOL skipassert)  
1630  {  {
1631  for (;;)  for (;;)
1632    {    {
1633    switch ((int)*code)    switch ((int)*code)
1634      {      {
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
       *options = (int)code[1];  
     code += 2;  
     break;  
   
1635      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1636      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1637      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1638      if (!skipassert) return code;      if (!skipassert) return code;
1639      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1640      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1641      break;      break;
1642    
1643      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1401  for (;;) Line 1651  for (;;)
1651      case OP_RREF:      case OP_RREF:
1652      case OP_NRREF:      case OP_NRREF:
1653      case OP_DEF:      case OP_DEF:
1654      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1655      break;      break;
1656    
1657      default:      default:
# Line 1431  and doing the check at the end; a flag s Line 1681  and doing the check at the end; a flag s
1681    
1682  Arguments:  Arguments:
1683    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1684    options  the compiling options    utf      TRUE in UTF-8 / UTF-16 mode
1685    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1686    cd       the "compile data" structure    cd       the "compile data" structure
1687    
1688  Returns:   the fixed length,  Returns:   the fixed length,
1689               or -1 if there is no fixed length,               or -1 if there is no fixed length,
1690               or -2 if \C was encountered               or -2 if \C was encountered (in UTF-8 mode only)
1691               or -3 if an OP_RECURSE item was encountered and atend is FALSE               or -3 if an OP_RECURSE item was encountered and atend is FALSE
1692                 or -4 if an unknown opcode was encountered (internal error)
1693  */  */
1694    
1695  static int  static int
1696  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1697  {  {
1698  int length = -1;  int length = -1;
1699    
1700  register int branchlength = 0;  register int branchlength = 0;
1701  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1702    
1703  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1704  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1455  branch, check the length against that of Line 1706  branch, check the length against that of
1706  for (;;)  for (;;)
1707    {    {
1708    int d;    int d;
1709    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1710    register int op = *cc;    register int op = *cc;
1711    
1712    switch (op)    switch (op)
1713      {      {
1714        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1715        OP_BRA (normal non-capturing bracket) because the other variants of these
1716        opcodes are all concerned with unlimited repeated groups, which of course
1717        are not of fixed length. */
1718    
1719      case OP_CBRA:      case OP_CBRA:
1720      case OP_BRA:      case OP_BRA:
1721      case OP_ONCE:      case OP_ONCE:
1722        case OP_ONCE_NC:
1723      case OP_COND:      case OP_COND:
1724      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1725      if (d < 0) return d;      if (d < 0) return d;
1726      branchlength += d;      branchlength += d;
1727      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1728      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1729      break;      break;
1730    
1731      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1732      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1733      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1734        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1735        because they all imply an unlimited repeat. */
1736    
1737      case OP_ALT:      case OP_ALT:
1738      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1739      case OP_END:      case OP_END:
1740        case OP_ACCEPT:
1741        case OP_ASSERT_ACCEPT:
1742      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1743        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1744      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1492  for (;;) Line 1752  for (;;)
1752    
1753      case OP_RECURSE:      case OP_RECURSE:
1754      if (!atend) return -3;      if (!atend) return -3;
1755      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1756      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1757      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1758      d = find_fixedlength(cs + 2, options, atend, cd);      d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1759      if (d < 0) return d;      if (d < 0) return d;
1760      branchlength += d;      branchlength += d;
1761      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1508  for (;;) Line 1768  for (;;)
1768      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1769      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1770      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1771      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1772        break;
1773    
1774      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1775    
1776      case OP_REVERSE:      case OP_MARK:
1777        case OP_PRUNE_ARG:
1778        case OP_SKIP_ARG:
1779        case OP_THEN_ARG:
1780        cc += cc[1] + PRIV(OP_lengths)[*cc];
1781        break;
1782    
1783        case OP_CALLOUT:
1784        case OP_CIRC:
1785        case OP_CIRCM:
1786        case OP_CLOSE:
1787        case OP_COMMIT:
1788      case OP_CREF:      case OP_CREF:
     case OP_NCREF:  
     case OP_RREF:  
     case OP_NRREF:  
1789      case OP_DEF:      case OP_DEF:
1790      case OP_OPT:      case OP_DOLL:
1791      case OP_CALLOUT:      case OP_DOLLM:
     case OP_SOD:  
     case OP_SOM:  
     case OP_SET_SOM:  
1792      case OP_EOD:      case OP_EOD:
1793      case OP_EODN:      case OP_EODN:
1794      case OP_CIRC:      case OP_FAIL:
1795      case OP_DOLL:      case OP_NCREF:
1796        case OP_NRREF:
1797      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1798        case OP_PRUNE:
1799        case OP_REVERSE:
1800        case OP_RREF:
1801        case OP_SET_SOM:
1802        case OP_SKIP:
1803        case OP_SOD:
1804        case OP_SOM:
1805        case OP_THEN:
1806      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1807      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1808      break;      break;
1809    
1810      /* Handle literal characters */      /* Handle literal characters */
1811    
1812      case OP_CHAR:      case OP_CHAR:
1813      case OP_CHARNC:      case OP_CHARI:
1814      case OP_NOT:      case OP_NOT:
1815        case OP_NOTI:
1816      branchlength++;      branchlength++;
1817      cc += 2;      cc += 2;
1818  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1819      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1820  #endif  #endif
1821      break;      break;
1822    
# Line 1549  for (;;) Line 1824  for (;;)
1824      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1825    
1826      case OP_EXACT:      case OP_EXACT:
1827        case OP_EXACTI:
1828        case OP_NOTEXACT:
1829        case OP_NOTEXACTI:
1830      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1831      cc += 4;      cc += 2 + IMM2_SIZE;
1832  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1833      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1834  #endif  #endif
1835      break;      break;
1836    
1837      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1838      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1839      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1840      cc += 4;      cc += 1 + IMM2_SIZE + 1;
1841      break;      break;
1842    
1843      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1570  for (;;) Line 1847  for (;;)
1847      cc += 2;      cc += 2;
1848      /* Fall through */      /* Fall through */
1849    
1850        case OP_HSPACE:
1851        case OP_VSPACE:
1852        case OP_NOT_HSPACE:
1853        case OP_NOT_VSPACE:
1854      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1855      case OP_DIGIT:      case OP_DIGIT:
1856      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1582  for (;;) Line 1863  for (;;)
1863      cc++;      cc++;
1864      break;      break;
1865    
1866      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1867        otherwise \C is coded as OP_ALLANY. */
1868    
1869      case OP_ANYBYTE:      case OP_ANYBYTE:
1870      return -2;      return -2;
1871    
1872      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1873    
1874  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1875      case OP_XCLASS:      case OP_XCLASS:
1876      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1877      /* Fall through */      /* Fall through */
1878  #endif  #endif
1879    
1880      case OP_CLASS:      case OP_CLASS:
1881      case OP_NCLASS:      case OP_NCLASS:
1882      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1883    
1884      switch (*cc)      switch (*cc)
1885        {        {
1886          case OP_CRPLUS:
1887          case OP_CRMINPLUS:
1888        case OP_CRSTAR:        case OP_CRSTAR:
1889        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1890        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1609  for (;;) Line 1893  for (;;)
1893    
1894        case OP_CRRANGE:        case OP_CRRANGE:
1895        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1896        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1897        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
1898        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1899        break;        break;
1900    
1901        default:        default:
# Line 1621  for (;;) Line 1905  for (;;)
1905    
1906      /* Anything else is variable length */      /* Anything else is variable length */
1907    
1908      default:      case OP_ANYNL:
1909        case OP_BRAMINZERO:
1910        case OP_BRAPOS:
1911        case OP_BRAPOSZERO:
1912        case OP_BRAZERO:
1913        case OP_CBRAPOS:
1914        case OP_EXTUNI:
1915        case OP_KETRMAX:
1916        case OP_KETRMIN:
1917        case OP_KETRPOS:
1918        case OP_MINPLUS:
1919        case OP_MINPLUSI:
1920        case OP_MINQUERY:
1921        case OP_MINQUERYI:
1922        case OP_MINSTAR:
1923        case OP_MINSTARI:
1924        case OP_MINUPTO:
1925        case OP_MINUPTOI:
1926        case OP_NOTMINPLUS:
1927        case OP_NOTMINPLUSI:
1928        case OP_NOTMINQUERY:
1929        case OP_NOTMINQUERYI:
1930        case OP_NOTMINSTAR:
1931        case OP_NOTMINSTARI:
1932        case OP_NOTMINUPTO:
1933        case OP_NOTMINUPTOI:
1934        case OP_NOTPLUS:
1935        case OP_NOTPLUSI:
1936        case OP_NOTPOSPLUS:
1937        case OP_NOTPOSPLUSI:
1938        case OP_NOTPOSQUERY:
1939        case OP_NOTPOSQUERYI:
1940        case OP_NOTPOSSTAR:
1941        case OP_NOTPOSSTARI:
1942        case OP_NOTPOSUPTO:
1943        case OP_NOTPOSUPTOI:
1944        case OP_NOTQUERY:
1945        case OP_NOTQUERYI:
1946        case OP_NOTSTAR:
1947        case OP_NOTSTARI:
1948        case OP_NOTUPTO:
1949        case OP_NOTUPTOI:
1950        case OP_PLUS:
1951        case OP_PLUSI:
1952        case OP_POSPLUS:
1953        case OP_POSPLUSI:
1954        case OP_POSQUERY:
1955        case OP_POSQUERYI:
1956        case OP_POSSTAR:
1957        case OP_POSSTARI:
1958        case OP_POSUPTO:
1959        case OP_POSUPTOI:
1960        case OP_QUERY:
1961        case OP_QUERYI:
1962        case OP_REF:
1963        case OP_REFI:
1964        case OP_SBRA:
1965        case OP_SBRAPOS:
1966        case OP_SCBRA:
1967        case OP_SCBRAPOS:
1968        case OP_SCOND:
1969        case OP_SKIPZERO:
1970        case OP_STAR:
1971        case OP_STARI:
1972        case OP_TYPEMINPLUS:
1973        case OP_TYPEMINQUERY:
1974        case OP_TYPEMINSTAR:
1975        case OP_TYPEMINUPTO:
1976        case OP_TYPEPLUS:
1977        case OP_TYPEPOSPLUS:
1978        case OP_TYPEPOSQUERY:
1979        case OP_TYPEPOSSTAR:
1980        case OP_TYPEPOSUPTO:
1981        case OP_TYPEQUERY:
1982        case OP_TYPESTAR:
1983        case OP_TYPEUPTO:
1984        case OP_UPTO:
1985        case OP_UPTOI:
1986      return -1;      return -1;
1987    
1988        /* Catch unrecognized opcodes so that when new ones are added they
1989        are not forgotten, as has happened in the past. */
1990    
1991        default:
1992        return -4;
1993      }      }
1994    }    }
1995  /* Control never gets here */  /* Control never gets here */
# Line 1643  length. Line 2010  length.
2010    
2011  Arguments:  Arguments:
2012    code        points to start of expression    code        points to start of expression
2013    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2014    number      the required bracket number or negative to find a lookbehind    number      the required bracket number or negative to find a lookbehind
2015    
2016  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2017  */  */
2018    
2019  const uschar *  const pcre_uchar *
2020  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2021  {  {
2022  for (;;)  for (;;)
2023    {    {
2024    register int c = *code;    register int c = *code;
2025    
2026    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
2027    
2028    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1667  for (;;) Line 2035  for (;;)
2035    
2036    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
2037      {      {
2038      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
2039      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2040      }      }
2041    
2042    /* Handle capturing bracket */    /* Handle capturing bracket */
2043    
2044    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
2045               c == OP_CBRAPOS || c == OP_SCBRAPOS)
2046      {      {
2047      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
2048      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2049      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2050      }      }
2051    
2052    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
# Line 1705  for (;;) Line 2074  for (;;)
2074        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2075        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2076        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2077        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2078            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2079        break;        break;
2080    
2081        case OP_MARK:        case OP_MARK:
2082        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
2083        case OP_SKIP_ARG:        case OP_SKIP_ARG:
2084          code += code[1];
2085          break;
2086    
2087        case OP_THEN_ARG:        case OP_THEN_ARG:
2088        code += code[1];        code += code[1];
2089        break;        break;
# Line 1718  for (;;) Line 2091  for (;;)
2091    
2092      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2093    
2094      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2095    
2096    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2097    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2098    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2099    
2100  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2101      if (utf8) switch(c)      if (utf) switch(c)
2102        {        {
2103        case OP_CHAR:        case OP_CHAR:
2104        case OP_CHARNC:        case OP_CHARI:
2105        case OP_EXACT:        case OP_EXACT:
2106          case OP_EXACTI:
2107        case OP_UPTO:        case OP_UPTO:
2108          case OP_UPTOI:
2109        case OP_MINUPTO:        case OP_MINUPTO:
2110          case OP_MINUPTOI:
2111        case OP_POSUPTO:        case OP_POSUPTO:
2112          case OP_POSUPTOI:
2113        case OP_STAR:        case OP_STAR:
2114          case OP_STARI:
2115        case OP_MINSTAR:        case OP_MINSTAR:
2116          case OP_MINSTARI:
2117        case OP_POSSTAR:        case OP_POSSTAR:
2118          case OP_POSSTARI:
2119        case OP_PLUS:        case OP_PLUS:
2120          case OP_PLUSI:
2121        case OP_MINPLUS:        case OP_MINPLUS:
2122          case OP_MINPLUSI:
2123        case OP_POSPLUS:        case OP_POSPLUS:
2124          case OP_POSPLUSI:
2125        case OP_QUERY:        case OP_QUERY:
2126          case OP_QUERYI:
2127        case OP_MINQUERY:        case OP_MINQUERY:
2128          case OP_MINQUERYI:
2129        case OP_POSQUERY:        case OP_POSQUERY:
2130        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2131          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2132        break;        break;
2133        }        }
2134  #else  #else
2135      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2136  #endif  #endif
2137      }      }
2138    }    }
# Line 1763  instance of OP_RECURSE. Line 2149  instance of OP_RECURSE.
2149    
2150  Arguments:  Arguments:
2151    code        points to start of expression    code        points to start of expression
2152    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2153    
2154  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2155  */  */
2156    
2157  static const uschar *  static const pcre_uchar *
2158  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2159  {  {
2160  for (;;)  for (;;)
2161    {    {
# Line 1808  for (;;) Line 2194  for (;;)
2194        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2195        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2196        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2197        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2198            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2199        break;        break;
2200    
2201        case OP_MARK:        case OP_MARK:
2202        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
2203        case OP_SKIP_ARG:        case OP_SKIP_ARG:
2204          code += code[1];
2205          break;
2206    
2207        case OP_THEN_ARG:        case OP_THEN_ARG:
2208        code += code[1];        code += code[1];
2209        break;        break;
# Line 1821  for (;;) Line 2211  for (;;)
2211    
2212      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2213    
2214      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2215    
2216      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2217      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
2218      to arrange to skip the extra bytes. */      to arrange to skip the extra bytes. */
2219    
2220  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2221      if (utf8) switch(c)      if (utf) switch(c)
2222        {        {
2223        case OP_CHAR:        case OP_CHAR:
2224        case OP_CHARNC:        case OP_CHARI:
2225        case OP_EXACT:        case OP_EXACT:
2226          case OP_EXACTI:
2227        case OP_UPTO:        case OP_UPTO:
2228          case OP_UPTOI:
2229        case OP_MINUPTO:        case OP_MINUPTO:
2230          case OP_MINUPTOI:
2231        case OP_POSUPTO:        case OP_POSUPTO:
2232          case OP_POSUPTOI:
2233        case OP_STAR:        case OP_STAR:
2234          case OP_STARI:
2235        case OP_MINSTAR:        case OP_MINSTAR:
2236          case OP_MINSTARI:
2237        case OP_POSSTAR:        case OP_POSSTAR:
2238          case OP_POSSTARI:
2239        case OP_PLUS:        case OP_PLUS:
2240          case OP_PLUSI:
2241        case OP_MINPLUS:        case OP_MINPLUS:
2242          case OP_MINPLUSI:
2243        case OP_POSPLUS:        case OP_POSPLUS:
2244          case OP_POSPLUSI:
2245        case OP_QUERY:        case OP_QUERY:
2246          case OP_QUERYI:
2247        case OP_MINQUERY:        case OP_MINQUERY:
2248          case OP_MINQUERYI:
2249        case OP_POSQUERY:        case OP_POSQUERY:
2250        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2251          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2252        break;        break;
2253        }        }
2254  #else  #else
2255      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2256  #endif  #endif
2257      }      }
2258    }    }
# Line 1872  bracket whose current branch will alread Line 2275  bracket whose current branch will alread
2275  Arguments:  Arguments:
2276    code        points to start of search    code        points to start of search
2277    endcode     points to where to stop    endcode     points to where to stop
2278    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2279    cd          contains pointers to tables etc.    cd          contains pointers to tables etc.
2280    
2281  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2282  */  */
2283    
2284  static BOOL  static BOOL
2285  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2286    compile_data *cd)    BOOL utf, compile_data *cd)
2287  {  {
2288  register int c;  register int c;
2289  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2290       code < endcode;       code < endcode;
2291       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2292    {    {
2293    const uschar *ccode;    const pcre_uchar *ccode;
2294    
2295    c = *code;    c = *code;
2296    
# Line 1901  for (code = first_significant_code(code Line 2304  for (code = first_significant_code(code
2304      continue;      continue;
2305      }      }
2306    
   /* Groups with zero repeats can of course be empty; skip them. */  
   
   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)  
     {  
     code += _pcre_OP_lengths[c];  
     do code += GET(code, 1); while (*code == OP_ALT);  
     c = *code;  
     continue;  
     }  
   
2307    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
2308    implies a subroutine call, we can scan it. */    implies a backward reference subroutine call, we can scan it. If it's a
2309      forward reference subroutine call, we can't. To detect forward reference
2310      we have to scan up the list that is kept in the workspace. This function is
2311      called only when doing the real compile, not during the pre-compile that
2312      measures the size of the compiled pattern. */
2313    
2314    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2315      {      {
2316      BOOL empty_branch = FALSE;      const pcre_uchar *scode;
2317      const uschar *scode = cd->start_code + GET(code, 1);      BOOL empty_branch;
2318    
2319        /* Test for forward reference */
2320    
2321        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2322          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2323    
2324        /* Not a forward reference, test for completed backward reference */
2325    
2326        empty_branch = FALSE;
2327        scode = cd->start_code + GET(code, 1);
2328      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2329    
2330        /* Completed backwards reference */
2331    
2332      do      do
2333        {        {
2334        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf, cd))
2335          {          {
2336          empty_branch = TRUE;          empty_branch = TRUE;
2337          break;          break;
# Line 1929  for (code = first_significant_code(code Line 2339  for (code = first_significant_code(code
2339        scode += GET(scode, 1);        scode += GET(scode, 1);
2340        }        }
2341      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2342    
2343      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2344      continue;      continue;
2345      }      }
2346    
2347      /* Groups with zero repeats can of course be empty; skip them. */
2348    
2349      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2350          c == OP_BRAPOSZERO)
2351        {
2352        code += PRIV(OP_lengths)[c];
2353        do code += GET(code, 1); while (*code == OP_ALT);
2354        c = *code;
2355        continue;
2356        }
2357    
2358      /* A nested group that is already marked as "could be empty" can just be
2359      skipped. */
2360    
2361      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2362          c == OP_SCBRA || c == OP_SCBRAPOS)
2363        {
2364        do code += GET(code, 1); while (*code == OP_ALT);
2365        c = *code;
2366        continue;
2367        }
2368    
2369    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2370    
2371    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2372          c == OP_CBRA || c == OP_CBRAPOS ||
2373          c == OP_ONCE || c == OP_ONCE_NC ||
2374          c == OP_COND)
2375      {      {
2376      BOOL empty_branch;      BOOL empty_branch;
2377      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1951  for (code = first_significant_code(code Line 2387  for (code = first_significant_code(code
2387        empty_branch = FALSE;        empty_branch = FALSE;
2388        do        do
2389          {          {
2390          if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2391            empty_branch = TRUE;            empty_branch = TRUE;
2392          code += GET(code, 1);          code += GET(code, 1);
2393          }          }
# Line 1969  for (code = first_significant_code(code Line 2405  for (code = first_significant_code(code
2405      {      {
2406      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2407      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2408      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2409      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2410      here. */      here. */
2411    
2412  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2413      case OP_XCLASS:      case OP_XCLASS:
2414      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2415      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 1981  for (code = first_significant_code(code Line 2417  for (code = first_significant_code(code
2417    
2418      case OP_CLASS:      case OP_CLASS:
2419      case OP_NCLASS:      case OP_NCLASS:
2420      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2421    
2422  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2423      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2424  #endif  #endif
2425    
# Line 2022  for (code = first_significant_code(code Line 2458  for (code = first_significant_code(code
2458      case OP_ALLANY:      case OP_ALLANY:
2459      case OP_ANYBYTE:      case OP_ANYBYTE:
2460      case OP_CHAR:      case OP_CHAR:
2461      case OP_CHARNC:      case OP_CHARI:
2462      case OP_NOT:      case OP_NOT:
2463        case OP_NOTI:
2464      case OP_PLUS:      case OP_PLUS:
2465      case OP_MINPLUS:      case OP_MINPLUS:
2466      case OP_POSPLUS:      case OP_POSPLUS:
# Line 2055  for (code = first_significant_code(code Line 2492  for (code = first_significant_code(code
2492      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2493      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2494      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2495      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP
2496          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2497      break;      break;
2498    
2499      /* End of branch */      /* End of branch */
# Line 2063  for (code = first_significant_code(code Line 2501  for (code = first_significant_code(code
2501      case OP_KET:      case OP_KET:
2502      case OP_KETRMAX:      case OP_KETRMAX:
2503      case OP_KETRMIN:      case OP_KETRMIN:
2504        case OP_KETRPOS:
2505      case OP_ALT:      case OP_ALT:
2506      return TRUE;      return TRUE;
2507    
2508      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2509      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
2510    
2511  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2512      case OP_STAR:      case OP_STAR:
2513        case OP_STARI:
2514      case OP_MINSTAR:      case OP_MINSTAR:
2515        case OP_MINSTARI:
2516      case OP_POSSTAR:      case OP_POSSTAR:
2517        case OP_POSSTARI:
2518      case OP_QUERY:      case OP_QUERY:
2519        case OP_QUERYI:
2520      case OP_MINQUERY:      case OP_MINQUERY:
2521        case OP_MINQUERYI:
2522      case OP_POSQUERY:      case OP_POSQUERY:
2523      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      case OP_POSQUERYI:
2524        if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2525      break;      break;
2526    
2527      case OP_UPTO:      case OP_UPTO:
2528        case OP_UPTOI:
2529      case OP_MINUPTO:      case OP_MINUPTO:
2530        case OP_MINUPTOI:
2531      case OP_POSUPTO:      case OP_POSUPTO:
2532      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      case OP_POSUPTOI:
2533        if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2534      break;      break;
2535  #endif  #endif
2536    
# Line 2092  for (code = first_significant_code(code Line 2540  for (code = first_significant_code(code
2540      case OP_MARK:      case OP_MARK:
2541      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
2542      case OP_SKIP_ARG:      case OP_SKIP_ARG:
2543        code += code[1];
2544        break;
2545    
2546      case OP_THEN_ARG:      case OP_THEN_ARG:
2547      code += code[1];      code += code[1];
2548      break;      break;
# Line 2116  return TRUE; Line 2567  return TRUE;
2567  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2568  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2569  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2570    This function is called only during the real compile, not during the
2571    pre-compile.
2572    
2573  Arguments:  Arguments:
2574    code        points to start of the recursion    code        points to start of the recursion
2575    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2576    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2577    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2578    cd          pointers to tables etc    cd          pointers to tables etc
2579    
2580  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2581  */  */
2582    
2583  static BOOL  static BOOL
2584  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2585    BOOL utf8, compile_data *cd)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2586  {  {
2587  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2588    {    {
2589    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2590      return FALSE;      return FALSE;
2591    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2592    }    }
# Line 2166  where Perl recognizes it as the POSIX cl Line 2619  where Perl recognizes it as the POSIX cl
2619  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2620  I think.  I think.
2621    
2622    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2623    It seems that the appearance of a nested POSIX class supersedes an apparent
2624    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2625    a digit.
2626    
2627    In Perl, unescaped square brackets may also appear as part of class names. For
2628    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2629    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2630    seem right at all. PCRE does not allow closing square brackets in POSIX class
2631    names.
2632    
2633  Arguments:  Arguments:
2634    ptr      pointer to the initial [    ptr      pointer to the initial [
2635    endptr   where to return the end pointer    endptr   where to return the end pointer
# Line 2174  Returns:   TRUE or FALSE Line 2638  Returns:   TRUE or FALSE
2638  */  */
2639    
2640  static BOOL  static BOOL
2641  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2642  {  {
2643  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2644  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2645  for (++ptr; *ptr != 0; ptr++)  for (++ptr; *ptr != 0; ptr++)
2646    {    {
2647    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2648        ptr++;
2649      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2650      else
2651      {      {
     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;  
2652      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2653        {        {
2654        *endptr = ptr;        *endptr = ptr;
2655        return TRUE;        return TRUE;
2656        }        }
2657        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2658             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2659              ptr[1] == CHAR_EQUALS_SIGN) &&
2660            check_posix_syntax(ptr, endptr))
2661          return FALSE;
2662      }      }
2663    }    }
2664  return FALSE;  return FALSE;
# Line 2211  Returns:     a value representing the na Line 2682  Returns:     a value representing the na
2682  */  */
2683    
2684  static int  static int
2685  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2686  {  {
2687  const char *pn = posix_names;  const char *pn = posix_names;
2688  register int yield = 0;  register int yield = 0;
2689  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2690    {    {
2691    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2692      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2693    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2694    yield++;    yield++;
2695    }    }
# Line 2250  value in the reference (which is a group Line 2721  value in the reference (which is a group
2721  Arguments:  Arguments:
2722    group      points to the start of the group    group      points to the start of the group
2723    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
2724    utf8       TRUE in UTF-8 mode    utf        TRUE in UTF-8 / UTF-16 mode
2725    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
2726    save_hwm   the hwm forward reference pointer at the start of the group    save_hwm   the hwm forward reference pointer at the start of the group
2727    
# Line 2258  Returns:     nothing Line 2729  Returns:     nothing
2729  */  */
2730    
2731  static void  static void
2732  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2733    uschar *save_hwm)    pcre_uchar *save_hwm)
2734  {  {
2735  uschar *ptr = group;  pcre_uchar *ptr = group;
2736    
2737  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2738    {    {
2739    int offset;    int offset;
2740    uschar *hc;    pcre_uchar *hc;
2741    
2742    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2743    reference. */    reference. */
# Line 2311  Arguments: Line 2782  Arguments:
2782  Returns:         new code pointer  Returns:         new code pointer
2783  */  */
2784    
2785  static uschar *  static pcre_uchar *
2786  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2787  {  {
2788  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2789  *code++ = 255;  *code++ = 255;
2790  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2791  PUT(code, LINK_SIZE, 0);                       /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2792  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2793  }  }
2794    
2795    
# Line 2340  Returns:             nothing Line 2811  Returns:             nothing
2811  */  */
2812    
2813  static void  static void
2814  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2815  {  {
2816  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2817  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
# Line 2407  Arguments: Line 2878  Arguments:
2878    ptype        the property type    ptype        the property type
2879    pdata        the data for the type    pdata        the data for the type
2880    negated      TRUE if it's a negated property (\P or \p{^)    negated      TRUE if it's a negated property (\P or \p{^)
2881    
2882  Returns:       TRUE if auto-possessifying is OK  Returns:       TRUE if auto-possessifying is OK
2883  */  */
2884    
2885  static BOOL  static BOOL
2886  check_char_prop(int c, int ptype, int pdata, BOOL negated)  check_char_prop(int c, int ptype, int pdata, BOOL negated)
# Line 2423  switch(ptype) Line 2894  switch(ptype)
2894            prop->chartype == ucp_Lt) == negated;            prop->chartype == ucp_Lt) == negated;
2895    
2896    case PT_GC:    case PT_GC:
2897    return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2898    
2899    case PT_PC:    case PT_PC:
2900    return (pdata == prop->chartype) == negated;    return (pdata == prop->chartype) == negated;
# Line 2434  switch(ptype) Line 2905  switch(ptype)
2905    /* These are specials */    /* These are specials */
2906    
2907    case PT_ALNUM:    case PT_ALNUM:
2908    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2909            _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2910    
2911    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2912    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2913            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2914            == negated;            == negated;
2915    
2916    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2917    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2918            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2919            c == CHAR_FF || c == CHAR_CR)            c == CHAR_FF || c == CHAR_CR)
2920            == negated;            == negated;
2921    
2922    case PT_WORD:    case PT_WORD:
2923    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2924            _pcre_ucp_gentype[prop->chartype] == ucp_N ||            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2925            c == CHAR_UNDERSCORE) == negated;            c == CHAR_UNDERSCORE) == negated;
2926    }    }
2927  return FALSE;  return FALSE;
2928  }  }
2929  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2930    
# Line 2469  sense to automatically possessify the re Line 2940  sense to automatically possessify the re
2940    
2941  Arguments:  Arguments:
2942    previous      pointer to the repeated opcode    previous      pointer to the repeated opcode
2943    utf8          TRUE in UTF-8 mode    utf           TRUE in UTF-8 / UTF-16 mode
2944    ptr           next character in pattern    ptr           next character in pattern
2945    options       options bits    options       options bits
2946    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 2478  Returns:        TRUE if possessifying is Line 2949  Returns:        TRUE if possessifying is
2949  */  */
2950    
2951  static BOOL  static BOOL
2952  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2953    int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2954  {  {
2955  int c, next;  pcre_int32 c, next;
2956  int op_code = *previous++;  int op_code = *previous++;
2957    
2958  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2490  if ((options & PCRE_EXTENDED) != 0) Line 2961  if ((options & PCRE_EXTENDED) != 0)
2961    {    {
2962    for (;;)    for (;;)
2963      {      {
2964      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2965      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2966        {        {
2967        while (*(++ptr) != 0)        ptr++;
2968          while (*ptr != 0)
2969            {
2970          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2971            ptr++;
2972    #ifdef SUPPORT_UTF
2973            if (utf) FORWARDCHAR(ptr);
2974    #endif
2975            }
2976        }        }
2977      else break;      else break;
2978      }      }
# Line 2510  if (*ptr == CHAR_BACKSLASH) Line 2988  if (*ptr == CHAR_BACKSLASH)
2988    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
2989    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
2990    }    }
2991    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
2992    {    {
2993  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2994    if (utf8) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
2995  #endif  #endif
2996    next = *ptr++;    next = *ptr++;
2997    }    }
   
2998  else return FALSE;  else return FALSE;
2999    
3000  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2527  if ((options & PCRE_EXTENDED) != 0) Line 3003  if ((options & PCRE_EXTENDED) != 0)
3003    {    {
3004    for (;;)    for (;;)
3005      {      {
3006      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3007      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3008        {        {
3009        while (*(++ptr) != 0)        ptr++;
3010          while (*ptr != 0)
3011            {
3012          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3013            ptr++;
3014    #ifdef SUPPORT_UTF
3015            if (utf) FORWARDCHAR(ptr);
3016    #endif
3017            }
3018        }        }
3019      else break;      else break;
3020      }      }
# Line 2540  if ((options & PCRE_EXTENDED) != 0) Line 3023  if ((options & PCRE_EXTENDED) != 0)
3023  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
3024    
3025  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3026    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3027      return FALSE;      return FALSE;
3028    
3029  /* Now compare the next item with the previous opcode. First, handle cases when  /* Now compare the next item with the previous opcode. First, handle cases when
# Line 2549  the next item is a character. */ Line 3032  the next item is a character. */
3032  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3033    {    {
3034    case OP_CHAR:    case OP_CHAR:
3035  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3036    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3037  #else  #else
3038    c = *previous;    c = *previous;
3039  #endif  #endif
3040    return c != next;    return c != next;
3041    
3042    /* For CHARNC (caseless character) we must check the other case. If we have    /* For CHARI (caseless character) we must check the other case. If we have
3043    Unicode property support, we can use it to test the other case of    Unicode property support, we can use it to test the other case of
3044    high-valued characters. */    high-valued characters. */
3045    
3046    case OP_CHARNC:    case OP_CHARI:
3047  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3048    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3049  #else  #else
3050    c = *previous;    c = *previous;
3051  #endif  #endif
3052    if (c == next) return FALSE;    if (c == next) return FALSE;
3053  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3054    if (utf8)    if (utf)
3055      {      {
3056      unsigned int othercase;      unsigned int othercase;
3057      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2580  if (next >= 0) switch(op_code) Line 3063  if (next >= 0) switch(op_code)
3063      return (unsigned int)c != othercase;      return (unsigned int)c != othercase;
3064      }      }
3065    else    else
3066  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3067    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */
3068    
3069    /* For OP_NOT, its data is always a single-byte character. */    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
3070      opcodes are not used for multi-byte characters, because they are coded using
3071      an XCLASS instead. */
3072    
3073    case OP_NOT:    case OP_NOT:
3074      return (c = *previous) == next;
3075    
3076      case OP_NOTI:
3077    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
3078    if ((options & PCRE_CASELESS) == 0) return FALSE;  #ifdef SUPPORT_UTF
3079  #ifdef SUPPORT_UTF8    if (utf)
   if (utf8)  
3080      {      {
3081      unsigned int othercase;      unsigned int othercase;
3082      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2601  if (next >= 0) switch(op_code) Line 3088  if (next >= 0) switch(op_code)
3088      return (unsigned int)c == othercase;      return (unsigned int)c == othercase;
3089      }      }
3090    else    else
3091  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3092    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */
3093    
3094    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3095    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3096    
3097    case OP_DIGIT:    case OP_DIGIT:
3098    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
# Line 2673  if (next >= 0) switch(op_code) Line 3160  if (next >= 0) switch(op_code)
3160  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3161    case OP_PROP:    case OP_PROP:
3162    return check_char_prop(next, previous[0], previous[1], FALSE);    return check_char_prop(next, previous[0], previous[1], FALSE);
3163    
3164    case OP_NOTPROP:    case OP_NOTPROP:
3165    return check_char_prop(next, previous[0], previous[1], TRUE);    return check_char_prop(next, previous[0], previous[1], TRUE);
3166  #endif  #endif
# Line 2683  if (next >= 0) switch(op_code) Line 3170  if (next >= 0) switch(op_code)
3170    }    }
3171    
3172    
3173  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3174  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3175  generated only when PCRE_UCP is *not* set, that is, when only ASCII  generated only when PCRE_UCP is *not* set, that is, when only ASCII
3176  characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are  characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3177  replaced by OP_PROP codes when PCRE_UCP is set. */  replaced by OP_PROP codes when PCRE_UCP is set. */
3178    
3179  switch(op_code)  switch(op_code)
3180    {    {
3181    case OP_CHAR:    case OP_CHAR:
3182    case OP_CHARNC:    case OP_CHARI:
3183  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3184    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3185  #else  #else
3186    c = *previous;    c = *previous;
3187  #endif  #endif
3188    switch(-next)    switch(-next)
3189      {      {
3190      case ESC_d:      case ESC_d:
# Line 2761  switch(op_code) Line 3248  switch(op_code)
3248        default:        default:
3249        return -next == ESC_v;        return -next == ESC_v;
3250        }        }
3251    
3252      /* When PCRE_UCP is set, these values get generated for \d etc. Find      /* When PCRE_UCP is set, these values get generated for \d etc. Find
3253      their substitutions and process them. The result will always be either      their substitutions and process them. The result will always be either
3254      -ESC_p or -ESC_P. Then fall through to process those values. */      -ESC_p or -ESC_P. Then fall through to process those values. */
3255    
3256  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3257      case ESC_du:      case ESC_du:
3258      case ESC_DU:      case ESC_DU:
# Line 2780  switch(op_code) Line 3267  switch(op_code)
3267        if (temperrorcode != 0) return FALSE;        if (temperrorcode != 0) return FALSE;
3268        ptr++;    /* For compatibility */        ptr++;    /* For compatibility */
3269        }        }
3270      /* Fall through */      /* Fall through */
3271    
3272      case ESC_p:      case ESC_p:
3273      case ESC_P:      case ESC_P:
3274        {        {
3275        int ptype, pdata, errorcodeptr;        int ptype, pdata, errorcodeptr;
3276        BOOL negated;        BOOL negated;
3277    
3278        ptr--;      /* Make ptr point at the p or P */        ptr--;      /* Make ptr point at the p or P */
3279        ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);        ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3280        if (ptype < 0) return FALSE;        if (ptype < 0) return FALSE;
3281        ptr++;      /* Point past the final curly ket */        ptr++;      /* Point past the final curly ket */
3282    
3283        /* If the property item is optional, we have to give up. (When generated        /* If the property item is optional, we have to give up. (When generated
3284        from \d etc by PCRE_UCP, this test will have been applied much earlier,        from \d etc by PCRE_UCP, this test will have been applied much earlier,
3285        to the original \d etc. At this point, ptr will point to a zero byte.) */        to the original \d etc. At this point, ptr will point to a zero byte.) */
3286    
3287        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3288          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3289            return FALSE;            return FALSE;
3290    
3291        /* Do the property check. */        /* Do the property check. */
3292    
3293        return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);        return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3294        }        }
3295  #endif  #endif
3296    
3297      default:      default:
3298      return FALSE;      return FALSE;
3299      }      }
3300    
3301    /* In principle, support for Unicode properties should be integrated here as    /* In principle, support for Unicode properties should be integrated here as
3302    well. It means re-organizing the above code so as to get hold of the property    well. It means re-organizing the above code so as to get hold of the property
3303    values before switching on the op-code. However, I wonder how many patterns    values before switching on the op-code. However, I wonder how many patterns
3304    combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,    combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3305    these op-codes are never generated.) */    these op-codes are never generated.) */
3306    
3307    case OP_DIGIT:    case OP_DIGIT:
3308    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
# Line 2831  switch(op_code) Line 3318  switch(op_code)
3318    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3319    
3320    case OP_HSPACE:    case OP_HSPACE:
3321    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3322           next == -ESC_w || next == -ESC_v || next == -ESC_R;           next == -ESC_w || next == -ESC_v || next == -ESC_R;
3323    
3324    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
3325    return next == -ESC_h;    return next == -ESC_h;
3326    
3327    /* Can't have \S in here because VT matches \S (Perl anomaly) */    /* Can't have \S in here because VT matches \S (Perl anomaly) */
3328    case OP_ANYNL:    case OP_ANYNL:
3329    case OP_VSPACE:    case OP_VSPACE:
3330    return next == -ESC_V || next == -ESC_d || next == -ESC_w;    return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3331    
# Line 2846  switch(op_code) Line 3333  switch(op_code)
3333    return next == -ESC_v || next == -ESC_R;    return next == -ESC_v || next == -ESC_R;
3334    
3335    case OP_WORDCHAR:    case OP_WORDCHAR:
3336    return next == -ESC_W || next == -ESC_s || next == -ESC_h ||    return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3337           next == -ESC_v || next == -ESC_R;           next == -ESC_v || next == -ESC_R;
3338    
3339    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
# Line 2876  Arguments: Line 3363  Arguments:
3363    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3364    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3365    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3366    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3367    reqbyteptr     set to the last literal character required, else < 0    reqcharptr     set to the last literal character required, else < 0
3368    bcptr          points to current branch chain    bcptr          points to current branch chain
3369      cond_depth     conditional nesting depth
3370    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3371    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3372                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 2888  Returns:         TRUE on success Line 3376  Returns:         TRUE on success
3376  */  */
3377    
3378  static BOOL  static BOOL
3379  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3380    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3381      pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3382    compile_data *cd, int *lengthptr)    compile_data *cd, int *lengthptr)
3383  {  {
3384  int repeat_type, op_type;  int repeat_type, op_type;
3385  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3386  int bravalue = 0;  int bravalue = 0;
3387  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3388  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
3389  int zeroreqbyte, zerofirstbyte;  pcre_int32 zeroreqchar, zerofirstchar;
3390  int req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
3391  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3392  int after_manual_callout = 0;  int after_manual_callout = 0;
3393  int length_prevgroup = 0;  int length_prevgroup = 0;
3394  register int c;  register int c;
3395  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3396  uschar *last_code = code;  pcre_uchar *last_code = code;
3397  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3398  uschar *tempcode;  pcre_uchar *tempcode;
3399  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3400  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3401  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3402  const uschar *tempptr;  const pcre_uchar *tempptr;
3403  const uschar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
3404  uschar *previous = NULL;  pcre_uchar *previous = NULL;
3405  uschar *previous_callout = NULL;  pcre_uchar *previous_callout = NULL;
3406  uschar *save_hwm = NULL;  pcre_uchar *save_hwm = NULL;
3407  uschar classbits[32];  pcre_uint8 classbits[32];
3408    
3409  #ifdef SUPPORT_UTF8  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3410  BOOL class_utf8;  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3411  BOOL utf8 = (options & PCRE_UTF8) != 0;  dynamically as we process the pattern. */
3412  uschar *class_utf8data;  
3413  uschar *class_utf8data_base;  #ifdef SUPPORT_UTF
3414  uschar utf8_char[6];  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3415    BOOL utf = (options & PCRE_UTF8) != 0;
3416    pcre_uchar utf_chars[6];
3417  #else  #else
3418  BOOL utf8 = FALSE;  BOOL utf = FALSE;
3419  uschar *utf8_char = NULL;  #endif
3420    
3421    /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3422    
3423    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3424    BOOL xclass;
3425    pcre_uchar *class_uchardata;
3426    pcre_uchar *class_uchardata_base;
3427  #endif  #endif
3428    
3429  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 2939  greedy_non_default = greedy_default ^ 1; Line 3437  greedy_non_default = greedy_default ^ 1;
3437    
3438  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3439  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3440  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3441  find one.  find one.
3442    
3443  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3444  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3445  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3446  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3447    
3448  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3449    
3450  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3451  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3452  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS leaves the lower 28 bits empty. It is added into the
3453  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3454    value. This is used only for ASCII characters. */
3455    
3456  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3457    
3458  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3459    
# Line 2966  for (;; ptr++) Line 3465  for (;; ptr++)
3465    BOOL is_quantifier;    BOOL is_quantifier;
3466    BOOL is_recurse;    BOOL is_recurse;
3467    BOOL reset_bracount;    BOOL reset_bracount;
3468    int class_charcount;    int class_has_8bitchar;
3469    int class_lastchar;    int class_single_char;
3470    int newoptions;    int newoptions;
3471    int recno;    int recno;
3472    int refsign;    int refsign;
3473    int skipbytes;    int skipbytes;
3474    int subreqbyte;    int subreqchar;
3475    int subfirstbyte;    int subfirstchar;
3476    int terminator;    int terminator;
3477    int mclength;    int mclength;
3478    uschar mcbuffer[8];    int tempbracount;
3479      pcre_uchar mcbuffer[8];
3480    
3481    /* Get next byte in the pattern */    /* Get next character in the pattern */
3482    
3483    c = *ptr;    c = *ptr;
3484    
3485    /* If we are at the end of a nested substitution, revert to the outer level    /* If we are at the end of a nested substitution, revert to the outer level
3486    string. Nesting only happens one level deep. */    string. Nesting only happens one level deep. */
3487    
3488    if (c == 0 && nestptr != NULL)    if (c == 0 && nestptr != NULL)
# Line 3000  for (;; ptr++) Line 3500  for (;; ptr++)
3500  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3501      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3502  #endif  #endif
3503      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3504            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3505        {        {
3506        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3507        goto FAILED;        goto FAILED;
# Line 3023  for (;; ptr++) Line 3524  for (;; ptr++)
3524        }        }
3525    
3526      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3527      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3528          (int)(code - last_code), c, c));
3529    
3530      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3531      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
3532      if "previous" is NULL, reset the current code pointer to the start. */      if "previous" is NULL, reset the current code pointer to the start. */
# Line 3033  for (;; ptr++) Line 3535  for (;; ptr++)
3535        {        {
3536        if (previous > orig_code)        if (previous > orig_code)
3537          {          {
3538          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3539          code -= previous - orig_code;          code -= previous - orig_code;
3540          previous = orig_code;          previous = orig_code;
3541          }          }
# Line 3049  for (;; ptr++) Line 3551  for (;; ptr++)
3551    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3552    reference list. */    reference list. */
3553    
3554    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3555               WORK_SIZE_SAFETY_MARGIN)
3556      {      {
3557      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3558      goto FAILED;      goto FAILED;
# Line 3097  for (;; ptr++) Line 3600  for (;; ptr++)
3600      previous_callout = NULL;      previous_callout = NULL;
3601      }      }
3602    
3603    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3604    
3605    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3606      {      {
3607      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3608      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3609        {        {
3610        while (*(++ptr) != 0)        ptr++;
3611          while (*ptr != 0)
3612          {          {
3613          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3614            ptr++;
3615    #ifdef SUPPORT_UTF
3616            if (utf) FORWARDCHAR(ptr);
3617    #endif
3618          }          }
3619        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3620    
# Line 3129  for (;; ptr++) Line 3637  for (;; ptr++)
3637      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3638      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3639      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
3640      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3641      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
3642      *codeptr = code;      *codeptr = code;
3643      *ptrptr = ptr;      *ptrptr = ptr;
3644      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 3151  for (;; ptr++) Line 3659  for (;; ptr++)
3659      the setting of any following char as a first character. */      the setting of any following char as a first character. */
3660    
3661      case CHAR_CIRCUMFLEX_ACCENT:      case CHAR_CIRCUMFLEX_ACCENT:
3662        previous = NULL;
3663      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3664        {        {
3665        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3666          *code++ = OP_CIRCM;
3667        }        }
3668      previous = NULL;      else *code++ = OP_CIRC;
     *code++ = OP_CIRC;  
3669      break;      break;
3670    
3671      case CHAR_DOLLAR_SIGN:      case CHAR_DOLLAR_SIGN:
3672      previous = NULL;      previous = NULL;
3673      *code++ = OP_DOLL;      *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3674      break;      break;
3675    
3676      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3677      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3678    
3679      case CHAR_DOT:      case CHAR_DOT:
3680      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3681      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
3682      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
3683      previous = code;      previous = code;
3684      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3685      break;      break;
# Line 3225  for (;; ptr++) Line 3734  for (;; ptr++)
3734          {          {
3735          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
3736            ptr++;            ptr++;
3737          else if (strncmp((const char *)ptr+1,          else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
                           STR_Q STR_BACKSLASH STR_E, 3) == 0)  
3738            ptr += 3;            ptr += 3;
3739          else          else
3740            break;            break;
# Line 3245  for (;; ptr++) Line 3753  for (;; ptr++)
3753          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3754        {        {
3755        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3756        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3757        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
3758        break;        break;
3759        }        }
3760    
# Line 3256  for (;; ptr++) Line 3764  for (;; ptr++)
3764    
3765      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3766    
3767      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3768      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3769      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3770        a single character. */
3771    
3772      class_charcount = 0;      class_has_8bitchar = 0;
3773      class_lastchar = -1;      class_single_char = 0;
3774    
3775      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3776      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
3777      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3778      */      */
3779    
3780      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3781    
3782  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3783      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;                           /* No chars >= 256 */
3784      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3785      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3786  #endif  #endif
3787    
3788      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3282  for (;; ptr++) Line 3791  for (;; ptr++)
3791    
3792      if (c != 0) do      if (c != 0) do
3793        {        {
3794        const uschar *oldptr;        const pcre_uchar *oldptr;
3795    
3796  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3797        if (utf8 && c > 127)        if (utf && HAS_EXTRALEN(c))
3798          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3799          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3800          }          }
3801    #endif
3802        /* In the pre-compile phase, accumulate the length of any UTF-8 extra  
3803    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3804          /* In the pre-compile phase, accumulate the length of any extra
3805        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
3806        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
3807        (which is on the stack). */        (which is on the stack). */
3808    
3809        if (lengthptr != NULL)        if (lengthptr != NULL)
3810          {          {
3811          *lengthptr += class_utf8data - class_utf8data_base;          *lengthptr += class_uchardata - class_uchardata_base;
3812          class_utf8data = class_utf8data_base;          class_uchardata = class_uchardata_base;
3813          }          }
   
3814  #endif  #endif
3815    
3816        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 3328  for (;; ptr++) Line 3838  for (;; ptr++)
3838          {          {
3839          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3840          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3841          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3842          uschar pbits[32];          pcre_uint8 pbits[32];
3843    
3844          if (ptr[1] != CHAR_COLON)          if (ptr[1] != CHAR_COLON)
3845            {            {
# Line 3358  for (;; ptr++) Line 3868  for (;; ptr++)
3868    
3869          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3870            posix_class = 0;            posix_class = 0;
3871    
3872          /* When PCRE_UCP is set, some of the POSIX classes are converted to          /* When PCRE_UCP is set, some of the POSIX classes are converted to
3873          different escape sequences that use Unicode properties. */          different escape sequences that use Unicode properties. */
3874    
3875  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3876          if ((options & PCRE_UCP) != 0)          if ((options & PCRE_UCP) != 0)
3877            {            {
3878            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3879            if (posix_substitutes[pc] != NULL)            if (posix_substitutes[pc] != NULL)
3880              {              {
3881              nestptr = tempptr + 1;              nestptr = tempptr + 1;
3882              ptr = posix_substitutes[pc] - 1;              ptr = posix_substitutes[pc] - 1;
3883              continue;              continue;
3884              }              }
3885            }            }
3886  #endif  #endif
3887          /* In the non-UCP case, we build the bit map for the POSIX class in a          /* In the non-UCP case, we build the bit map for the POSIX class in a
3888          chunk of local store because we may be adding and subtracting from it,          chunk of local store because we may be adding and subtracting from it,
3889          and we don't want to subtract bits that may be in the main map already.          and we don't want to subtract bits that may be in the main map already.
# Line 3384  for (;; ptr++) Line 3894  for (;; ptr++)
3894          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3895    
3896          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3897            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3898    
3899          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3900    
# Line 3415  for (;; ptr++) Line 3925  for (;; ptr++)
3925            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3926    
3927          ptr = tempptr + 1;          ptr = tempptr + 1;
3928          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3929            class_has_8bitchar = 1;
3930            /* Every class contains at least two characters. */
3931            class_single_char = 2;
3932          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3933          }          }
3934    
3935        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3936        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3937        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3938        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3939        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3940        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3941          as literal characters (by default), or are faulted if
3942        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3943    
3944        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3433  for (;; ptr++) Line 3947  for (;; ptr++)
3947          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3948    
3949          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3950            else if (-c == ESC_N)            /* \N is not supported in a class */
3951              {
3952              *errorcodeptr = ERR71;
3953              goto FAILED;
3954              }
3955          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3956            {            {
3957            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3446  for (;; ptr++) Line 3965  for (;; ptr++)
3965    
3966          if (c < 0)          if (c < 0)
3967            {            {
3968            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3969            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
3970              class_has_8bitchar++;
3971              /* Every class contains at least two characters. */
3972              class_single_char += 2;
3973    
3974            switch (-c)            switch (-c)
3975              {              {
# Line 3460  for (;; ptr++) Line 3982  for (;; ptr++)
3982              case ESC_SU:              case ESC_SU:
3983              nestptr = ptr;              nestptr = ptr;
3984              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3985              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
3986              continue;              continue;
3987  #endif  #endif
3988              case ESC_d:              case ESC_d:
# Line 3481  for (;; ptr++) Line 4003  for (;; ptr++)
4003              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4004              continue;              continue;
4005    
4006                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4007                if it was previously set by something earlier in the character
4008                class. */
4009    
4010              case ESC_s:              case ESC_s:
4011              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
4012              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
4013                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4014              continue;              continue;
4015    
4016              case ESC_S:              case ESC_S:
# Line 3496  for (;; ptr++) Line 4023  for (;; ptr++)
4023              SETBIT(classbits, 0x09); /* HT */              SETBIT(classbits, 0x09); /* HT */
4024              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
4025              SETBIT(classbits, 0xa0); /* NBSP */              SETBIT(classbits, 0xa0); /* NBSP */
4026  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4027              if (utf8)              xclass = TRUE;
4028                *class_uchardata++ = XCL_SINGLE;
4029                *class_uchardata++ = 0x1680;
4030                *class_uchardata++ = XCL_SINGLE;
4031                *class_uchardata++ = 0x180e;
4032                *class_uchardata++ = XCL_RANGE;
4033                *class_uchardata++ = 0x2000;
4034                *class_uchardata++ = 0x200a;
4035                *class_uchardata++ = XCL_SINGLE;
4036                *class_uchardata++ = 0x202f;
4037                *class_uchardata++ = XCL_SINGLE;
4038                *class_uchardata++ = 0x205f;
4039                *class_uchardata++ = XCL_SINGLE;
4040                *class_uchardata++ = 0x3000;
4041    #elif defined SUPPORT_UTF
4042                if (utf)
4043                {                {
4044                class_utf8 = TRUE;                xclass = TRUE;
4045                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4046                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4047                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4048                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4049                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4050                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4051                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4052                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4053                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4054                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4055                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4056                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4057                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4058                }                }
4059  #endif  #endif
4060              continue;              continue;
# Line 3530  for (;; ptr++) Line 4072  for (;; ptr++)
4072                  }                  }
4073                classbits[c] |= x;                classbits[c] |= x;
4074                }                }
4075    #ifndef COMPILE_PCRE8
4076  #ifdef SUPPORT_UTF8              xclass = TRUE;
4077              if (utf8)              *class_uchardata++ = XCL_RANGE;
4078                *class_uchardata++ = 0x0100;
4079                *class_uchardata++ = 0x167f;
4080                *class_uchardata++ = XCL_RANGE;
4081                *class_uchardata++ = 0x1681;
4082                *class_uchardata++ = 0x180d;
4083                *class_uchardata++ = XCL_RANGE;
4084                *class_uchardata++ = 0x180f;
4085                *class_uchardata++ = 0x1fff;
4086                *class_uchardata++ = XCL_RANGE;
4087                *class_uchardata++ = 0x200b;
4088                *class_uchardata++ = 0x202e;
4089                *class_uchardata++ = XCL_RANGE;
4090                *class_uchardata++ = 0x2030;
4091                *class_uchardata++ = 0x205e;
4092                *class_uchardata++ = XCL_RANGE;
4093                *class_uchardata++ = 0x2060;
4094                *class_uchardata++ = 0x2fff;
4095                *class_uchardata++ = XCL_RANGE;
4096                *class_uchardata++ = 0x3001;
4097    #ifdef SUPPORT_UTF
4098                if (utf)
4099                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4100                else
4101    #endif
4102                  *class_uchardata++ = 0xffff;
4103    #elif defined SUPPORT_UTF
4104                if (utf)
4105                {                {
4106                class_utf8 = TRUE;                xclass = TRUE;
4107                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4108                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4109                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4110                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4111                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4112                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4113                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4114                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4115                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4116                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4117                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4118                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4119                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4120                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4121                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4122                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4123                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4124                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4125                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4126                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4127                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4128                }                }
4129  #endif  #endif
4130              continue;              continue;
# Line 3566  for (;; ptr++) Line 4135  for (;; ptr++)
4135              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, 0x0c); /* FF */
4136              SETBIT(classbits, 0x0d); /* CR */              SETBIT(classbits, 0x0d); /* CR */
4137              SETBIT(classbits, 0x85); /* NEL */              SETBIT(classbits, 0x85); /* NEL */
4138  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4139              if (utf8)              xclass = TRUE;
4140                *class_uchardata++ = XCL_RANGE;
4141                *class_uchardata++ = 0x2028;
4142                *class_uchardata++ = 0x2029;
4143    #elif defined SUPPORT_UTF
4144                if (utf)
4145                {                {
4146                class_utf8 = TRUE;                xclass = TRUE;
4147                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4148                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4149                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4150                }                }
4151  #endif  #endif
4152              continue;              continue;
# Line 3594  for (;; ptr++) Line 4168  for (;; ptr++)
4168                classbits[c] |= x;                classbits[c] |= x;
4169                }                }
4170    
4171  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4172              if (utf8)              xclass = TRUE;
4173                *class_uchardata++ = XCL_RANGE;
4174                *class_uchardata++ = 0x0100;
4175                *class_uchardata++ = 0x2027;
4176                *class_uchardata++ = XCL_RANGE;
4177                *class_uchardata++ = 0x202a;
4178    #ifdef SUPPORT_UTF
4179                if (utf)
4180                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4181                else
4182    #endif
4183                  *class_uchardata++ = 0xffff;
4184    #elif defined SUPPORT_UTF
4185                if (utf)
4186                {                {
4187                class_utf8 = TRUE;                xclass = TRUE;
4188                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4189                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4190                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4191                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4192                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4193                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4194                }                }
4195  #endif  #endif
4196              continue;              continue;
# Line 3616  for (;; ptr++) Line 4203  for (;; ptr++)
4203                int pdata;                int pdata;
4204                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4205                if (ptype < 0) goto FAILED;                if (ptype < 0) goto FAILED;
4206                class_utf8 = TRUE;                xclass = TRUE;
4207                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_uchardata++ = ((-c == ESC_p) != negated)?
4208                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4209                *class_utf8data++ = ptype;                *class_uchardata++ = ptype;
4210                *class_utf8data++ = pdata;                *class_uchardata++ = pdata;
4211                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4212                continue;                continue;
4213                }                }
4214  #endif  #endif
# Line 3635  for (;; ptr++) Line 4222  for (;; ptr++)
4222                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4223                goto FAILED;                goto FAILED;
4224                }                }
4225              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4226              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4227                c = *ptr;                /* Get the final character and fall through */
4228              break;              break;
4229              }              }
4230            }            }
4231    
4232          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4233          greater than 256 in UTF-8 mode. */          greater than 256. */
4234    
4235          }   /* End of backslash handling */          }   /* End of backslash handling */
4236    
# Line 3690  for (;; ptr++) Line 4278  for (;; ptr++)
4278            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4279            }            }
4280    
4281  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4282          if (utf8)          if (utf)
4283            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4284            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4285            }            }
# Line 3735  for (;; ptr++) Line 4323  for (;; ptr++)
4323    
4324          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4325    
4326            /* Since we found a character range, single character optimizations
4327            cannot be done anymore. */
4328            class_single_char = 2;
4329    
4330          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4331          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4332          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4333          available. */          available. */
4334    
4335  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4336          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4337    #elif defined  SUPPORT_UTF
4338            if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4339    #elif !(defined COMPILE_PCRE8)
4340            if (d > 255)
4341    #endif
4342    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4343            {            {
4344            class_utf8 = TRUE;            xclass = TRUE;
4345    
4346            /* With UCP support, we can find the other case equivalents of            /* With UCP support, we can find the other case equivalents of
4347            the relevant characters. There may be several ranges. Optimize how            the relevant characters. There may be several ranges. Optimize how
4348            they fit with the basic range. */            they fit with the basic range. */
4349    
4350  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4351    #ifndef COMPILE_PCRE8
4352              if (utf && (options & PCRE_CASELESS) != 0)
4353    #else
4354            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4355    #endif
4356              {              {
4357              unsigned int occ, ocd;              unsigned int occ, ocd;
4358              unsigned int cc = c;              unsigned int cc = c;
# Line 3776  for (;; ptr++) Line 4378  for (;; ptr++)
4378    
4379                if (occ == ocd)                if (occ == ocd)
4380                  {                  {
4381                  *class_utf8data++ = XCL_SINGLE;                  *class_uchardata++ = XCL_SINGLE;
4382                  }                  }
4383                else                else
4384                  {                  {
4385                  *class_utf8data++ = XCL_RANGE;                  *class_uchardata++ = XCL_RANGE;
4386                  class_utf8data += _pcre_ord2utf8(occ, class_utf8data);                  class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4387                  }                  }
4388                class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);                class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4389                }                }
4390              }              }
4391  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 3791  for (;; ptr++) Line 4393  for (;; ptr++)
4393            /* Now record the original range, possibly modified for UCP caseless            /* Now record the original range, possibly modified for UCP caseless
4394            overlapping ranges. */            overlapping ranges. */
4395    
4396            *class_utf8data++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4397            class_utf8data += _pcre_ord2utf8(c, class_utf8data);  #ifdef SUPPORT_UTF
4398            class_utf8data += _pcre_ord2utf8(d, class_utf8data);  #ifndef COMPILE_PCRE8
4399              if (utf)
4400                {
4401                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4402                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4403                }
4404              else
4405                {
4406                *class_uchardata++ = c;
4407                *class_uchardata++ = d;
4408                }
4409    #else
4410              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4411              class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4412    #endif
4413    #else /* SUPPORT_UTF */
4414              *class_uchardata++ = c;
4415              *class_uchardata++ = d;
4416    #endif /* SUPPORT_UTF */
4417    
4418            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4419            caseless matching for UTF-8 characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
4420            for the smaller ones. */            for the smaller ones. As for 16 bit characters without UTF, we
4421              can still use the bit map for the characters below 256. */
4422    
4423  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4424            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4425  #else            if (utf)
4426            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  #endif
4427                continue;    /* With next character in the class */
4428    #endif  /* SUPPORT_UCP */
4429    
4430    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4431              if (utf)
4432                {
4433                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4434                /* Adjust upper limit and fall through to set up the map */
4435                d = 127;
4436                }
4437              else
4438                {
4439                if (c > 255) continue;
4440                /* Adjust upper limit and fall through to set up the map */
4441                d = 255;
4442                }
4443    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4444              if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4445            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
   
4446            d = 127;            d = 127;
4447    #else
4448  #endif  /* SUPPORT_UCP */            if (c > 255) continue;
4449              /* Adjust upper limit and fall through to set up the map */
4450              d = 255;
4451    #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4452            }            }
4453  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4454    
4455          /* We use the bit map for all cases when not in UTF-8 mode; else          /* We use the bit map for 8 bit mode, or when the characters fall
4456          ranges that lie entirely within 0-127 when there is UCP support; else          partially or entirely to [0-255] ([0-127] for UCP) ranges. */
         for partial ranges without UCP support. */  
4457    
4458          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4459    
4460          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4461    
# Line 3826  for (;; ptr++) Line 4464  for (;; ptr++)
4464            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4465            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4466              {              {
4467              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c]; /* flip case */
4468              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
4469              }              }
4470            }            }
# Line 3840  for (;; ptr++) Line 4478  for (;; ptr++)
4478    
4479        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4480    
4481        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4482          if (class_single_char < 2) class_single_char++;
4483    
4484  #ifdef SUPPORT_UTF8        /* If class_charcount is 1, we saw precisely one character. As long as
4485        if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))        there were no negated characters >= 128 and there was no use of \p or \P,
4486          in other words, no use of any XCLASS features, we can optimize.
4487    
4488          In UTF-8 mode, we can optimize the negative case only if there were no
4489          characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4490          operate on single-byte characters only. This is an historical hangover.
4491          Maybe one day we can tidy these opcodes to handle multi-byte characters.
4492    
4493          The optimization throws away the bit map. We turn the item into a
4494          1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4495          Note that OP_NOT[I] does not support multibyte characters. In the positive
4496          case, it can cause firstchar to be set. Otherwise, there can be no first
4497          char if this item is first, whatever repeat count may follow. In the case
4498          of reqchar, save the previous value for reinstating. */
4499    
4500    #ifdef SUPPORT_UTF
4501          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
4502            && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4503    #else
4504          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4505    #endif
4506          {          {
4507          class_utf8 = TRUE;          ptr++;
4508          *class_utf8data++ = XCL_SINGLE;          zeroreqchar = reqchar;
4509          class_utf8data += _pcre_ord2utf8(c, class_utf8data);  
4510            /* The OP_NOT[I] opcodes work on single characters only. */
4511    
4512            if (negate_class)
4513              {
4514              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4515              zerofirstchar = firstchar;
4516              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4517              *code++ = c;
4518              goto NOT_CHAR;
4519              }
4520    
4521            /* For a single, positive character, get the value into mcbuffer, and
4522            then we can handle this with the normal one-character code. */
4523    
4524    #ifdef SUPPORT_UTF
4525            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4526              mclength = PRIV(ord2utf)(c, mcbuffer);
4527            else
4528    #endif
4529              {
4530              mcbuffer[0] = c;
4531              mclength = 1;
4532              }
4533            goto ONE_CHAR;
4534            }       /* End of 1-char optimization */
4535    
4536          /* Handle a character that cannot go in the bit map. */
4537    
4538    #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4539          if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4540    #elif defined SUPPORT_UTF
4541          if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4542    #elif !(defined COMPILE_PCRE8)
4543          if (c > 255)
4544    #endif
4545    
4546    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4547            {
4548            xclass = TRUE;
4549            *class_uchardata++ = XCL_SINGLE;
4550    #ifdef SUPPORT_UTF
4551    #ifndef COMPILE_PCRE8
4552            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4553            if (!utf)
4554              *class_uchardata++ = c;
4555            else
4556    #endif
4557              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4558    #else /* SUPPORT_UTF */
4559            *class_uchardata++ = c;
4560    #endif /* SUPPORT_UTF */
4561    
4562  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4563    #ifdef COMPILE_PCRE8
4564          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4565    #else
4566            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4567            if (utf && (options & PCRE_CASELESS) != 0)
4568    #endif
4569            {            {
4570            unsigned int othercase;            unsigned int othercase;
4571            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((othercase = UCD_OTHERCASE(c)) != c)
4572              {              {
4573              *class_utf8data++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4574              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4575              }              }
4576            }            }
4577  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
4578    
4579          }          }
4580        else        else
4581  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
4582    
4583        /* Handle a single-byte character */        /* Handle a single-byte character */
4584          {          {
4585            class_has_8bitchar = 1;
4586          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4587          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4588            {            {
4589            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c]; /* flip case */
4590            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4591            }            }
         class_charcount++;  
         class_lastchar = c;  
4592          }          }
4593        }        }
4594    
# Line 3895  for (;; ptr++) Line 4609  for (;; ptr++)
4609        goto FAILED;        goto FAILED;
4610        }        }
4611    
4612      /* If class_charcount is 1, we saw precisely one character whose value is      /* If this is the first thing in the branch, there can be no first char
4613      less than 256. As long as there were no characters >= 128 and there was no      setting, whatever the repeat count. Any reqchar setting must remain
4614      use of \p or \P, in other words, no use of any XCLASS features, we can      unchanged after any kind of repeat. */
4615      optimize.  
4616        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4617      In UTF-8 mode, we can optimize the negative case only if there were no      zerofirstchar = firstchar;
4618      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR      zeroreqchar = reqchar;
     operate on single-bytes only. This is an historical hangover. Maybe one day  
     we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note  
     that OP_NOT does not support multibyte characters. In the positive case, it  
     can cause firstbyte to be set. Otherwise, there can be no first char if  
     this item is first, whatever repeat count may follow. In the case of  
     reqbyte, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF8  
     if (class_charcount == 1 && !class_utf8 &&  
       (!utf8 || !negate_class || class_lastchar < 128))  
 #else  
     if (class_charcount == 1)  
 #endif  
       {  
       zeroreqbyte = reqbyte;  
   
       /* The OP_NOT opcode works on one-byte characters only. */  
   
       if (negate_class)  
         {  
         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
         zerofirstbyte = firstbyte;  
         *code++ = OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF8  
       if (utf8 && class_lastchar > 127)  
         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqbyte setting must remain unchanged after any kind of  
     repeat. */  
   
     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
     zerofirstbyte = firstbyte;  
     zeroreqbyte = reqbyte;  
4619    
4620      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4621      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
# Line 3964  for (;; ptr++) Line 4625  for (;; ptr++)
4625      be listed) there are no characters < 256, we can omit the bitmap in the      be listed) there are no characters < 256, we can omit the bitmap in the
4626      actual compiled code. */      actual compiled code. */
4627    
4628  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4629      if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))      if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4630    #elif !defined COMPILE_PCRE8
4631        if (xclass && !should_flip_negation)
4632    #endif
4633    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4634        {        {
4635        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4636        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
4637        code += LINK_SIZE;        code += LINK_SIZE;
4638        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT:0;
4639    
4640        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4641        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4642    
4643        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4644          {          {
4645          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4646          memmove(code + 32, code, class_utf8data - code);          memmove(code + (32 / sizeof(pcre_uchar)), code,
4647              IN_UCHARS(class_uchardata - code));
4648          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
4649          code = class_utf8data + 32;          code = class_uchardata + (32 / sizeof(pcre_uchar));
4650          }          }
4651        else code = class_utf8data;        else code = class_uchardata;
4652    
4653        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4654    
4655        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4656        break;   /* End of class handling */        break;   /* End of class handling */
4657        }        }
4658  #endif  #endif
4659    
4660      /* If there are no characters > 255, or they are all to be included or      /* If there are no characters > 255, or they are all to be included or
4661      excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the      excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4662      whole class was negated and whether there were negative specials such as \S      whole class was negated and whether there were negative specials such as \S
4663      (non-UCP) in the class. Then copy the 32-byte map into the code vector,      (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4664      negating it if necessary. */      negating it if necessary. */
4665    
4666      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4667      if (negate_class)      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
       {  
       if (lengthptr == NULL)    /* Save time in the pre-compile phase */  
         for (c = 0; c < 32; c++) code[c] = ~classbits[c];  
       }  
     else  
4668        {        {
4669          if (negate_class)
4670            for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4671        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4672        }        }
4673      code += 32;      code += 32 / sizeof(pcre_uchar);
4674        NOT_CHAR:
4675      break;      break;
4676    
4677    
# Line 4044  for (;; ptr++) Line 4708  for (;; ptr++)
4708    
4709      if (repeat_min == 0)      if (repeat_min == 0)
4710        {        {
4711        firstbyte = zerofirstbyte;    /* Adjust for zero repeat */        firstchar = zerofirstchar;    /* Adjust for zero repeat */
4712        reqbyte = zeroreqbyte;        /* Ditto */        reqchar = zeroreqchar;        /* Ditto */
4713        }        }
4714    
4715      /* Remember whether this is a variable length repeat */      /* Remember whether this is a variable length repeat */
# Line 4055  for (;; ptr++) Line 4719  for (;; ptr++)
4719      op_type = 0;                    /* Default single-char op codes */      op_type = 0;                    /* Default single-char op codes */
4720      possessive_quantifier = FALSE;  /* Default not possessive quantifier */      possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4721    
4722      /* Save start of previous item, in case we have to move it up to make space      /* Save start of previous item, in case we have to move it up in order to
4723      for an inserted OP_ONCE for the additional '+' extension. */      insert something before it. */
4724    
4725      tempcode = previous;      tempcode = previous;
4726    
# Line 4079  for (;; ptr++) Line 4743  for (;; ptr++)
4743        }        }
4744      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4745    
4746        /* If previous was a recursion call, wrap it in atomic brackets so that
4747        previous becomes the atomic group. All recursions were so wrapped in the
4748        past, but it no longer happens for non-repeated recursions. In fact, the
4749        repeated ones could be re-implemented independently so as not to need this,
4750        but for the moment we rely on the code for repeating groups. */
4751    
4752        if (*previous == OP_RECURSE)
4753          {
4754          memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4755          *previous = OP_ONCE;
4756          PUT(previous, 1, 2 + 2*LINK_SIZE);
4757          previous[2 + 2*LINK_SIZE] = OP_KET;
4758          PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4759          code += 2 + 2 * LINK_SIZE;
4760          length_prevgroup = 3 + 3*LINK_SIZE;
4761    
4762          /* When actually compiling, we need to check whether this was a forward
4763          reference, and if so, adjust the offset. */
4764    
4765          if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4766            {
4767            int offset = GET(cd->hwm, -LINK_SIZE);
4768            if (offset == previous + 1 - cd->start_code)
4769              PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4770            }
4771          }
4772    
4773        /* Now handle repetition for the different types of item. */
4774    
4775      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4776      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
4777      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqchar - it might not be if a sequence such as x{3} is
4778      the first thing in a branch because the x will have gone into firstbyte      the first thing in a branch because the x will have gone into firstchar
4779      instead.  */      instead.  */
4780    
4781      if (*previous == OP_CHAR || *previous == OP_CHARNC)      if (*previous == OP_CHAR || *previous == OP_CHARI)
4782        {        {
4783        /* Deal with UTF-8 characters that take up more than one byte. It's        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4784    
4785          /* Deal with UTF characters that take up more than one character. It's
4786        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4787        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus UTF_LENGTH to flag that
4788        length rather than a small character. */        it's a length rather than a small character. */
4789    
4790  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4791        if (utf8 && (code[-1] & 0x80) != 0)        if (utf && NOT_FIRSTCHAR(code[-1]))
4792          {          {
4793          uschar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4794          while((*lastchar & 0xc0) == 0x80) lastchar--;          BACKCHAR(lastchar);
4795          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4796          memcpy(utf8_char, lastchar, c); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4797          c |= 0x80;                      /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4798          }          }
4799        else        else
4800  #endif  #endif /* SUPPORT_UTF */
   
       /* Handle the case of a single byte - either with no UTF8 support, or  
       with UTF-8 disabled, or for a UTF-8 character < 128. */  
4801    
4802          /* Handle the case of a single character - either with no UTF support, or
4803          with UTF disabled, or for a single character UTF character. */
4804          {          {
4805          c = code[-1];          c = code[-1];
4806          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
4807          }          }
4808    
4809        /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
# Line 4119  for (;; ptr++) Line 4813  for (;; ptr++)
4813    
4814        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4815            repeat_max < 0 &&            repeat_max < 0 &&
4816            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4817          {          {
4818          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4819          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4131  for (;; ptr++) Line 4825  for (;; ptr++)
4825      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
4826      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
4827      character repeats by setting op_type to add a suitable offset into      character repeats by setting op_type to add a suitable offset into
4828      repeat_type. We can also test for auto-possessification. OP_NOT is      repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
4829      currently used only for single-byte chars. */      are currently used only for single-byte chars. */
4830    
4831      else if (*previous == OP_NOT)      else if (*previous == OP_NOT || *previous == OP_NOTI)
4832        {        {
4833        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
4834        c = previous[1];        c = previous[1];
4835        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4836            repeat_max < 0 &&            repeat_max < 0 &&
4837            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4838          {          {
4839          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4840          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4157  for (;; ptr++) Line 4851  for (;; ptr++)
4851    
4852      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
4853        {        {
4854        uschar *oldcode;        pcre_uchar *oldcode;
4855        int prop_type, prop_value;        int prop_type, prop_value;
4856        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4857        c = *previous;        c = *previous;
4858    
4859        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4860            repeat_max < 0 &&            repeat_max < 0 &&
4861            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4862          {          {
4863          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4864          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4244  for (;; ptr++) Line 4938  for (;; ptr++)
4938          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
4939          Unicode property match, there are two extra bytes that define the          Unicode property match, there are two extra bytes that define the
4940          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
4941          c, with the 0x80 bit as a flag. */          c, with the UTF_LENGTH bit as a flag. */
4942    
4943          if (repeat_max < 0)          if (repeat_max < 0)
4944            {            {
4945  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4946            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4947              {              {
4948              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4949              code += c & 7;              code += c & 7;
4950              }              }
4951            else            else
# Line 4273  for (;; ptr++) Line 4967  for (;; ptr++)
4967    
4968          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
4969            {            {
4970  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4971            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4972              {              {
4973              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4974              code += c & 7;              code += c & 7;
4975              }              }
4976            else            else
# Line 4303  for (;; ptr++) Line 4997  for (;; ptr++)
4997    
4998        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
4999    
5000  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5001        if (utf8 && c >= 128)        if (utf && (c & UTF_LENGTH) != 0)
5002          {          {
5003          memcpy(code, utf8_char, c & 7);          memcpy(code, utf_chars, IN_UCHARS(c & 7));
5004          code += c & 7;          code += c & 7;
5005          }          }
5006        else        else
# Line 4330  for (;; ptr++) Line 5024  for (;; ptr++)
5024    
5025      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
5026               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
5027  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5028               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
5029  #endif  #endif
5030               *previous == OP_REF)               *previous == OP_REF ||
5031                 *previous == OP_REFI)
5032        {        {
5033        if (repeat_max == 0)        if (repeat_max == 0)
5034          {          {
# Line 4367  for (;; ptr++) Line 5062  for (;; ptr++)
5062        }        }
5063    
5064      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
5065      cases. */      cases. Note that at this point we can encounter only the "basic" bracket
5066        opcodes such as BRA and CBRA, as this is the place where they get converted
5067        into the more special varieties such as BRAPOS and SBRA. A test for >=
5068        OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5069        ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5070        repetition of assertions, but now it does, for Perl compatibility. */
5071    
5072      else if (*previous == OP_BRA  || *previous == OP_CBRA ||      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
              *previous == OP_ONCE || *previous == OP_COND)  
5073        {        {
5074        register int i;        register int i;
       int ketoffset = 0;  
5075        int len = (int)(code - previous);        int len = (int)(code - previous);
5076        uschar *bralink = NULL;        pcre_uchar *bralink = NULL;
5077          pcre_uchar *brazeroptr = NULL;
5078    
5079        /* Repeating a DEFINE group is pointless */        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5080          we just ignore the repeat. */
5081    
5082        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5083          {          goto END_REPEAT;
         *errorcodeptr = ERR55;  
         goto FAILED;  
         }  
5084    
5085        /* If the maximum repeat count is unlimited, find the end of the bracket        /* There is no sense in actually repeating assertions. The only potential
5086        by scanning through from the start, and compute the offset back to it        use of repetition is in cases when the assertion is optional. Therefore,
5087        from the current code pointer. There may be an OP_OPT setting following        if the minimum is greater than zero, just ignore the repeat. If the
5088        the final KET, so we can't find the end just by going back from the code        maximum is not zero or one, set it to 1. */
5089        pointer. */  
5090          if (*previous < OP_ONCE)    /* Assertion */
5091        if (repeat_max == -1)          {
5092          {          if (repeat_min > 0) goto END_REPEAT;
5093          register uschar *ket = previous;          if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
         do ket += GET(ket, 1); while (*ket != OP_KET);  
         ketoffset = (int)(code - ket);  
5094          }          }
5095    
5096        /* The case of a zero minimum is special because of the need to stick        /* The case of a zero minimum is special because of the need to stick
# Line 4416  for (;; ptr++) Line 5111  for (;; ptr++)
5111          **   goto END_REPEAT;          **   goto END_REPEAT;
5112          **   }          **   }
5113    
5114          However, that fails when a group is referenced as a subroutine from          However, that fails when a group or a subgroup within it is referenced
5115          elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it          as a subroutine from elsewhere in the pattern, so now we stick in
5116          so that it is skipped on execution. As we don't have a list of which          OP_SKIPZERO in front of it so that it is skipped on execution. As we
5117          groups are referenced, we cannot do this selectively.          don't have a list of which groups are referenced, we cannot do this
5118            selectively.
5119    
5120          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5121          and do no more at this point. However, we do need to adjust any          and do no more at this point. However, we do need to adjust any
# Line 4431  for (;; ptr++) Line 5127  for (;; ptr++)
5127          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5128            {            {
5129            *code = OP_END;            *code = OP_END;
5130            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf, cd, save_hwm);
5131            memmove(previous+1, previous, len);            memmove(previous + 1, previous, IN_UCHARS(len));
5132            code++;            code++;
5133            if (repeat_max == 0)            if (repeat_max == 0)
5134              {              {
5135              *previous++ = OP_SKIPZERO;              *previous++ = OP_SKIPZERO;
5136              goto END_REPEAT;              goto END_REPEAT;
5137              }              }
5138              brazeroptr = previous;    /* Save for possessive optimizing */
5139            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
5140            }            }
5141    
# Line 4454  for (;; ptr++) Line 5151  for (;; ptr++)
5151            {            {
5152            int offset;            int offset;
5153            *code = OP_END;            *code = OP_END;
5154            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);            adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5155            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5156            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
5157            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
5158            *previous++ = OP_BRA;            *previous++ = OP_BRA;
# Line 4501  for (;; ptr++) Line 5198  for (;; ptr++)
5198              *lengthptr += delta;              *lengthptr += delta;
5199              }              }
5200    
5201            /* This is compiling for real */            /* This is compiling for real. If there is a set first byte for
5202              the group, and we have not yet set a "required byte", set it. Make
5203              sure there is enough workspace for copying forward references before
5204              doing the copy. */
5205    
5206            else            else
5207              {              {
5208              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5209    
5210              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5211                {                {
5212                uschar *hc;                pcre_uchar *hc;
5213                uschar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5214                memcpy(code, previous, len);                memcpy(code, previous, IN_UCHARS(len));
5215    
5216                  while (cd->hwm > cd->start_workspace + cd->workspace_size -
5217                         WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5218                    {
5219                    int save_offset = save_hwm - cd->start_workspace;
5220                    int this_offset = this_hwm - cd->start_workspace;
5221                    *errorcodeptr = expand_workspace(cd);
5222                    if (*errorcodeptr != 0) goto FAILED;
5223                    save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5224                    this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5225                    }
5226    
5227                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5228                  {                  {
5229                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 4560  for (;; ptr++) Line 5273  for (;; ptr++)
5273    
5274          else for (i = repeat_max - 1; i >= 0; i--)          else for (i = repeat_max - 1; i >= 0; i--)
5275            {            {
5276            uschar *hc;            pcre_uchar *hc;
5277            uschar *this_hwm = cd->hwm;            pcre_uchar *this_hwm = cd->hwm;
5278    
5279            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
5280    
# Line 4577  for (;; ptr++) Line 5290  for (;; ptr++)
5290              PUTINC(code, 0, offset);              PUTINC(code, 0, offset);
5291              }              }
5292    
5293            memcpy(code, previous, len);            memcpy(code, previous, IN_UCHARS(len));
5294    
5295              /* Ensure there is enough workspace for forward references before
5296              copying them. */
5297    
5298              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5299                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5300                {
5301                int save_offset = save_hwm - cd->start_workspace;
5302                int this_offset = this_hwm - cd->start_workspace;
5303                *errorcodeptr = expand_workspace(cd);
5304                if (*errorcodeptr != 0) goto FAILED;
5305                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5306                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5307                }
5308    
5309            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5310              {              {
5311              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 4594  for (;; ptr++) Line 5322  for (;; ptr++)
5322            {            {
5323            int oldlinkoffset;            int oldlinkoffset;
5324            int offset = (int)(code - bralink + 1);            int offset = (int)(code - bralink + 1);
5325            uschar *bra = code - offset;            pcre_uchar *bra = code - offset;
5326            oldlinkoffset = GET(bra, 1);            oldlinkoffset = GET(bra, 1);
5327            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5328            *code++ = OP_KET;            *code++ = OP_KET;
# Line 4603  for (;; ptr++) Line 5331  for (;; ptr++)
5331            }            }
5332          }          }
5333    
5334        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. For
5335        can't just offset backwards from the current code point, because we        ONCE brackets, that's all we need to do. However, possessively repeated
5336        don't know if there's been an options resetting after the ket. The        ONCE brackets can be converted into non-capturing brackets, as the
5337        correct offset was computed above.        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5338          deal with possessive ONCEs specially.
5339    
5340        Then, when we are doing the actual compile phase, check to see whether        Otherwise, when we are doing the actual compile phase, check to see
5341        this group is a non-atomic one that could match an empty string. If so,        whether this group is one that could match an empty string. If so,
5342        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5343        that runtime checking can be done. [This check is also applied to        that runtime checking can be done. [This check is also applied to ONCE
5344        atomic groups at runtime, but in a different way.] */        groups at runtime, but in a different way.]
5345    
5346          Then, if the quantifier was possessive and the bracket is not a
5347          conditional, we convert the BRA code to the POS form, and the KET code to
5348          KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5349          subpattern at both the start and at the end.) The use of special opcodes
5350          makes it possible to reduce greatly the stack usage in pcre_exec(). If
5351          the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5352    
5353          Then, if the minimum number of matches is 1 or 0, cancel the possessive
5354          flag so that the default action below, of wrapping everything inside
5355          atomic brackets, does not happen. When the minimum is greater than 1,
5356          there will be earlier copies of the group, and so we still have to wrap
5357          the whole thing. */
5358    
5359        else        else
5360          {          {
5361          uschar *ketcode = code - ketoffset;          pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5362          uschar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5363          *ketcode = OP_KETRMAX + repeat_type;  
5364          if (lengthptr == NULL && *bracode != OP_ONCE)          /* Convert possessive ONCE brackets to non-capturing */
5365    
5366            if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5367                possessive_quantifier) *bracode = OP_BRA;
5368    
5369            /* For non-possessive ONCE brackets, all we need to do is to
5370            set the KET. */
5371    
5372            if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5373              *ketcode = OP_KETRMAX + repeat_type;
5374    
5375            /* Handle non-ONCE brackets and possessive ONCEs (which have been
5376            converted to non-capturing above). */
5377    
5378            else
5379            {            {
5380            uschar *scode = bracode;            /* In the compile phase, check for empty string matching. */
5381            do  
5382              if (lengthptr == NULL)
5383              {              {
5384              if (could_be_empty_branch(scode, ketcode, utf8, cd))              pcre_uchar *scode = bracode;
5385                do
5386                {                {
5387                *bracode += OP_SBRA - OP_BRA;                if (could_be_empty_branch(scode, ketcode, utf, cd))
5388                break;                  {
5389                    *bracode += OP_SBRA - OP_BRA;
5390                    break;
5391                    }
5392                  scode += GET(scode, 1);
5393                  }
5394                while (*scode == OP_ALT);
5395                }
5396    
5397              /* Handle possessive quantifiers. */
5398    
5399              if (possessive_quantifier)
5400                {
5401                /* For COND brackets, we wrap the whole thing in a possessively
5402                repeated non-capturing bracket, because we have not invented POS
5403                versions of the COND opcodes. Because we are moving code along, we
5404                must ensure that any pending recursive references are updated. */
5405    
5406                if (*bracode == OP_COND || *bracode == OP_SCOND)
5407                  {
5408                  int nlen = (int)(code - bracode);
5409                  *code = OP_END;
5410