/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

code/trunk/pcre_compile.c revision 545 by ph10, Wed Jun 16 10:51:15 2010 UTC code/branches/pcre16/pcre_compile.c revision 774 by zherczeg, Thu Dec 1 06:08:45 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2011 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 97  overrun before it actually does run off Line 97  overrun before it actually does run off
97    
98  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    /* Private flags added to firstchar and reqchar. */
101    
102    #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
103    #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
104    
105  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
106  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 231  static const char posix_names[] = Line 235  static const char posix_names[] =
235    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
236    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
237    
238  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
239    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
240    
241  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 266  substitutes must be in the order of the Line 270  substitutes must be in the order of the
270  both positive and negative cases. NULL means no substitute. */  both positive and negative cases. NULL means no substitute. */
271    
272  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
273  static const uschar *substitutes[] = {  static const pcre_uchar string_PNd[]  = {
274    (uschar *)"\\P{Nd}",    /* \D */    CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
275    (uschar *)"\\p{Nd}",    /* \d */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
276    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */  static const pcre_uchar string_pNd[]  = {
277    (uschar *)"\\p{Xsp}",   /* \s */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
278    (uschar *)"\\P{Xwd}",   /* \W */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
279    (uschar *)"\\p{Xwd}"    /* \w */  static const pcre_uchar string_PXsp[] = {
280      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
281      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
282    static const pcre_uchar string_pXsp[] = {
283      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
284      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
285    static const pcre_uchar string_PXwd[] = {
286      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
287      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
288    static const pcre_uchar string_pXwd[] = {
289      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
290      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
291    
292    static const pcre_uchar *substitutes[] = {
293      string_PNd,           /* \D */
294      string_pNd,           /* \d */
295      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
296      string_pXsp,          /* \s */
297      string_PXwd,          /* \W */
298      string_pXwd           /* \w */
299  };  };
300    
301  static const uschar *posix_substitutes[] = {  static const pcre_uchar string_pL[] =   {
302    (uschar *)"\\p{L}",     /* alpha */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
303    (uschar *)"\\p{Ll}",    /* lower */    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
304    (uschar *)"\\p{Lu}",    /* upper */  static const pcre_uchar string_pLl[] =  {
305    (uschar *)"\\p{Xan}",   /* alnum */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
306    NULL,                   /* ascii */    CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
307    (uschar *)"\\h",        /* blank */  static const pcre_uchar string_pLu[] =  {
308    NULL,                   /* cntrl */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
309    (uschar *)"\\p{Nd}",    /* digit */    CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
310    NULL,                   /* graph */  static const pcre_uchar string_pXan[] = {
311    NULL,                   /* print */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
312    NULL,                   /* punct */    CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
313    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */  static const pcre_uchar string_h[] =    {
314    (uschar *)"\\p{Xwd}",   /* word */    CHAR_BACKSLASH, CHAR_h, '\0' };
315    NULL,                   /* xdigit */  static const pcre_uchar string_pXps[] = {
316      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
317      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
318    static const pcre_uchar string_PL[] =   {
319      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
320      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
321    static const pcre_uchar string_PLl[] =  {
322      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
323      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
324    static const pcre_uchar string_PLu[] =  {
325      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
326      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
327    static const pcre_uchar string_PXan[] = {
328      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
329      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
330    static const pcre_uchar string_H[] =    {
331      CHAR_BACKSLASH, CHAR_H, '\0' };
332    static const pcre_uchar string_PXps[] = {
333      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
334      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
335    
336    static const pcre_uchar *posix_substitutes[] = {
337      string_pL,            /* alpha */
338      string_pLl,           /* lower */
339      string_pLu,           /* upper */
340      string_pXan,          /* alnum */
341      NULL,                 /* ascii */
342      string_h,             /* blank */
343      NULL,                 /* cntrl */
344      string_pNd,           /* digit */
345      NULL,                 /* graph */
346      NULL,                 /* print */
347      NULL,                 /* punct */
348      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
349      string_pXwd,          /* word */
350      NULL,                 /* xdigit */
351    /* Negated cases */    /* Negated cases */
352    (uschar *)"\\P{L}",     /* ^alpha */    string_PL,            /* ^alpha */
353    (uschar *)"\\P{Ll}",    /* ^lower */    string_PLl,           /* ^lower */
354    (uschar *)"\\P{Lu}",    /* ^upper */    string_PLu,           /* ^upper */
355    (uschar *)"\\P{Xan}",   /* ^alnum */    string_PXan,          /* ^alnum */
356    NULL,                   /* ^ascii */    NULL,                 /* ^ascii */
357    (uschar *)"\\H",        /* ^blank */    string_H,             /* ^blank */
358    NULL,                   /* ^cntrl */    NULL,                 /* ^cntrl */
359    (uschar *)"\\P{Nd}",    /* ^digit */    string_PNd,           /* ^digit */
360    NULL,                   /* ^graph */    NULL,                 /* ^graph */
361    NULL,                   /* ^print */    NULL,                 /* ^print */
362    NULL,                   /* ^punct */    NULL,                 /* ^punct */
363    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
364    (uschar *)"\\P{Xwd}",   /* ^word */    string_PXwd,          /* ^word */
365    NULL                    /* ^xdigit */    NULL                  /* ^xdigit */
366  };  };
367  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
368  #endif  #endif
369    
370  #define STRING(a)  # a  #define STRING(a)  # a
# Line 393  static const char error_texts[] = Line 451  static const char error_texts[] =
451    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
452    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
453    /* 55 */    /* 55 */
454    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
455    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
456    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
457    "a numbered reference must not be zero\0"    "a numbered reference must not be zero\0"
# Line 408  static const char error_texts[] = Line 466  static const char error_texts[] =
466    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
467    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
468    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with PCRE_UCP support\0"
469      "\\c must be followed by an ASCII character\0"
470      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
471      /* 70 */
472      "internal error: unknown opcode in find_fixedlength()\0"
473    ;    ;
474    
475  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 426  For convenience, we use the same bit def Line 488  For convenience, we use the same bit def
488    
489  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
490    
491    /* Using a simple comparison for decimal digits rather than a memory read
492    is much faster, and the resulting code is simpler (the compiler turns it
493    into a subtraction and unsigned comparison). */
494    
495    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
496    
497  #ifndef EBCDIC  #ifndef EBCDIC
498    
499  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
500  UTF-8 mode. */  UTF-8 mode. */
501    
502  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
503    {    {
504    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
505    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 470  static const unsigned char digitab[] = Line 538  static const unsigned char digitab[] =
538    
539  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
540    
541  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
542    {    {
543    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
544    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 505  static const unsigned char digitab[] = Line 573  static const unsigned char digitab[] =
573    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
574    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
575    
576  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
577    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
578    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
579    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 544  static const unsigned char ebcdic_charta Line 612  static const unsigned char ebcdic_charta
612  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
613    
614  static BOOL  static BOOL
615    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
616      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
617    
618    
# Line 576  return s; Line 644  return s;
644    
645    
646  /*************************************************  /*************************************************
647    *            Check for counted repeat            *
648    *************************************************/
649    
650    /* This function is called when a '{' is encountered in a place where it might
651    start a quantifier. It looks ahead to see if it really is a quantifier or not.
652    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
653    where the ddds are digits.
654    
655    Arguments:
656      p         pointer to the first char after '{'
657    
658    Returns:    TRUE or FALSE
659    */
660    
661    static BOOL
662    is_counted_repeat(const pcre_uchar *p)
663    {
664    if (!IS_DIGIT(*p)) return FALSE;
665    p++;
666    while (IS_DIGIT(*p)) p++;
667    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
668    
669    if (*p++ != CHAR_COMMA) return FALSE;
670    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
671    
672    if (!IS_DIGIT(*p)) return FALSE;
673    p++;
674    while (IS_DIGIT(*p)) p++;
675    
676    return (*p == CHAR_RIGHT_CURLY_BRACKET);
677    }
678    
679    
680    
681    /*************************************************
682  *            Handle escapes                      *  *            Handle escapes                      *
683  *************************************************/  *************************************************/
684    
# Line 600  Returns:         zero or positive => a d Line 703  Returns:         zero or positive => a d
703  */  */
704    
705  static int  static int
706  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
707    int options, BOOL isclass)    int options, BOOL isclass)
708  {  {
709  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
710  const uschar *ptr = *ptrptr + 1;  const pcre_uchar *ptr = *ptrptr + 1;
711  int c, i;  int c, i;
712    
713  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
# Line 619  in a table. A non-zero result is somethi Line 722  in a table. A non-zero result is somethi
722  Otherwise further processing may be required. */  Otherwise further processing may be required. */
723    
724  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
725  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
726    else if (c < CHAR_0 || c > CHAR_z) {}
727  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
728    
729  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
730  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
731    else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
732  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
733  #endif  #endif
734    
# Line 631  else if ((i = escapes[c - 0x48]) != 0) Line 736  else if ((i = escapes[c - 0x48]) != 0)
736    
737  else  else
738    {    {
739    const uschar *oldptr;    const pcre_uchar *oldptr;
740    BOOL braced, negated;    BOOL braced, negated;
741    
742    switch (c)    switch (c)
# Line 641  else Line 746  else
746    
747      case CHAR_l:      case CHAR_l:
748      case CHAR_L:      case CHAR_L:
749        *errorcodeptr = ERR37;
750        break;
751    
752      case CHAR_u:      case CHAR_u:
753        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
754          {
755          /* In JavaScript, \u must be followed by four hexadecimal digits.
756          Otherwise it is a lowercase u letter. */
757          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
758            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
759            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
760            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
761            {
762            c = 0;
763            for (i = 0; i < 4; ++i)
764              {
765              register int cc = *(++ptr);
766    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
767              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
768              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
769    #else           /* EBCDIC coding */
770              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
771              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
772    #endif
773              }
774            }
775          }
776        else
777          *errorcodeptr = ERR37;
778        break;
779    
780      case CHAR_U:      case CHAR_U:
781      *errorcodeptr = ERR37;      /* In JavaScript, \U is an uppercase U letter. */
782        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
783      break;      break;
784    
785      /* \g must be followed by one of a number of specific things:      /* In a character class, \g is just a literal "g". Outside a character
786        class, \g must be followed by one of a number of specific things:
787    
788      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
789      backreference. If negative, it is a relative backreference. This is a Perl      backreference. If negative, it is a relative backreference. This is a Perl
# Line 663  else Line 800  else
800      the -ESC_g code (cf \k). */      the -ESC_g code (cf \k). */
801    
802      case CHAR_g:      case CHAR_g:
803        if (isclass) break;
804      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
805        {        {
806        c = -ESC_g;        c = -ESC_g;
# Line 673  else Line 811  else
811    
812      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
813        {        {
814        const uschar *p;        const pcre_uchar *p;
815        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
816          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
817        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
818          {          {
819          c = -ESC_k;          c = -ESC_k;
# Line 693  else Line 831  else
831        }        }
832      else negated = FALSE;      else negated = FALSE;
833    
834        /* The integer range is limited by the machine's int representation. */
835      c = 0;      c = 0;
836      while ((digitab[ptr[1]] & ctype_digit) != 0)      while (IS_DIGIT(ptr[1]))
837          {
838          if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
839            {
840            c = -1;
841            break;
842            }
843        c = c * 10 + *(++ptr) - CHAR_0;        c = c * 10 + *(++ptr) - CHAR_0;
844          }
845      if (c < 0)   /* Integer overflow */      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
846        {        {
847          while (IS_DIGIT(ptr[1]))
848            ptr++;
849        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
850        break;        break;
851        }        }
# Line 746  else Line 893  else
893      if (!isclass)      if (!isclass)
894        {        {
895        oldptr = ptr;        oldptr = ptr;
896          /* The integer range is limited by the machine's int representation. */
897        c -= CHAR_0;        c -= CHAR_0;
898        while ((digitab[ptr[1]] & ctype_digit) != 0)        while (IS_DIGIT(ptr[1]))
899            {
900            if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
901              {
902              c = -1;
903              break;
904              }
905          c = c * 10 + *(++ptr) - CHAR_0;          c = c * 10 + *(++ptr) - CHAR_0;
906        if (c < 0)    /* Integer overflow */          }
907          if (((unsigned int)c) > INT_MAX) /* Integer overflow */
908          {          {
909            while (IS_DIGIT(ptr[1]))
910              ptr++;
911          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
912          break;          break;
913          }          }
# Line 783  else Line 940  else
940      c -= CHAR_0;      c -= CHAR_0;
941      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
942          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
943      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf8 && c > 0xff) *errorcodeptr = ERR51;
944      break;      break;
945    
946      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 791  else Line 948  else
948      treated as a data character. */      treated as a data character. */
949    
950      case CHAR_x:      case CHAR_x:
951        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
952          {
953          /* In JavaScript, \x must be followed by two hexadecimal digits.
954          Otherwise it is a lowercase x letter. */
955          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
956            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
957            {
958            c = 0;
959            for (i = 0; i < 2; ++i)
960              {
961              register int cc = *(++ptr);
962    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
963              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
964              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
965    #else           /* EBCDIC coding */
966              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
967              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
968    #endif
969              }
970            }
971          break;
972          }
973    
974      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
975        {        {
976        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
977        int count = 0;        int count = 0;
978    
979        c = 0;        c = 0;
980        while ((digitab[*pt] & ctype_xdigit) != 0)        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
981          {          {
982          register int cc = *pt++;          register int cc = *pt++;
983          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
# Line 814  else Line 994  else
994    
995        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
996          {          {
997          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;  #ifdef COMPILE_PCRE8
998            if (c < 0 || count > (utf8? 8:2)) *errorcodeptr = ERR34;
999    #else
1000    #ifdef COMPILE_PCRE16
1001            if (c < 0 || count > (utf8? 8:4)) *errorcodeptr = ERR34;
1002    #endif
1003    #endif
1004          ptr = pt;          ptr = pt;
1005          break;          break;
1006          }          }
# Line 826  else Line 1012  else
1012      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1013    
1014      c = 0;      c = 0;
1015      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1016        {        {
1017        int cc;                                  /* Some compilers don't like */        int cc;                                  /* Some compilers don't like */
1018        cc = *(++ptr);                           /* ++ in initializers */        cc = *(++ptr);                           /* ++ in initializers */
# Line 841  else Line 1027  else
1027      break;      break;
1028    
1029      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1030      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
1031        coding is ASCII-specific, but then the whole concept of \cx is
1032      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1033    
1034      case CHAR_c:      case CHAR_c:
# Line 851  else Line 1038  else
1038        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
1039        break;        break;
1040        }        }
1041    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1042  #ifndef EBCDIC  /* ASCII/UTF-8 coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
1043          {
1044          *errorcodeptr = ERR68;
1045          break;
1046          }
1047      if (c >= CHAR_a && c <= CHAR_z) c -= 32;      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1048      c ^= 0x40;      c ^= 0x40;
1049  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
1050      if (c >= CHAR_a && c <= CHAR_z) c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1051      c ^= 0xC0;      c ^= 0xC0;
1052  #endif  #endif
# Line 879  else Line 1070  else
1070    }    }
1071    
1072  /* Perl supports \N{name} for character names, as well as plain \N for "not  /* Perl supports \N{name} for character names, as well as plain \N for "not
1073  newline". PCRE does not support \N{name}. */  newline". PCRE does not support \N{name}. However, it does support
1074    quantification such as \N{2,3}. */
1075    
1076  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1077         !is_counted_repeat(ptr+2))
1078    *errorcodeptr = ERR37;    *errorcodeptr = ERR37;
1079    
1080  /* If PCRE_UCP is set, we change the values for \d etc. */  /* If PCRE_UCP is set, we change the values for \d etc. */
# Line 917  Returns:         type value from ucp_typ Line 1110  Returns:         type value from ucp_typ
1110  */  */
1111    
1112  static int  static int
1113  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1114  {  {
1115  int c, i, bot, top;  int c, i, bot, top;
1116  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1117  char name[32];  pcre_uchar name[32];
1118    
1119  c = *(++ptr);  c = *(++ptr);
1120  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 962  else Line 1155  else
1155  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1156    
1157  bot = 0;  bot = 0;
1158  top = _pcre_utt_size;  top = PRIV(utt_size);
1159    
1160  while (bot < top)  while (bot < top)
1161    {    {
1162    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1163    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1164    if (c == 0)    if (c == 0)
1165      {      {
1166      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1167      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1168      }      }
1169    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
1170    }    }
# Line 991  return -1; Line 1184  return -1;
1184    
1185    
1186  /*************************************************  /*************************************************
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if (*p++ != CHAR_COMMA) return FALSE;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
   
 return (*p == CHAR_RIGHT_CURLY_BRACKET);  
 }  
   
   
   
 /*************************************************  
1187  *         Read repeat counts                     *  *         Read repeat counts                     *
1188  *************************************************/  *************************************************/
1189    
# Line 1042  Returns:         pointer to '}' on succe Line 1202  Returns:         pointer to '}' on succe
1202                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1203  */  */
1204    
1205  static const uschar *  static const pcre_uchar *
1206  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1207  {  {
1208  int min = 0;  int min = 0;
1209  int max = -1;  int max = -1;
# Line 1051  int max = -1; Line 1211  int max = -1;
1211  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1212  an integer overflow. */  an integer overflow. */
1213    
1214  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1215  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1216    {    {
1217    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 1066  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1226  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1226    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1227      {      {
1228      max = 0;      max = 0;
1229      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1230      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1231        {        {
1232        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 1099  top-level call starts at the beginning o Line 1259  top-level call starts at the beginning o
1259  start at a parenthesis. It scans along a pattern's text looking for capturing  start at a parenthesis. It scans along a pattern's text looking for capturing
1260  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1261  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1262  returns when it reaches a given numbered subpattern. We know that if (?P< is  returns when it reaches a given numbered subpattern. Recursion is used to keep
1263  encountered, the name will be terminated by '>' because that is checked in the  track of subpatterns that reset the capturing group numbers - the (?| feature.
1264  first pass. Recursion is used to keep track of subpatterns that reset the  
1265  capturing group numbers - the (?| feature.  This function was originally called only from the second pass, in which we know
1266    that if (?< or (?' or (?P< is encountered, the name will be correctly
1267    terminated because that is checked in the first pass. There is now one call to
1268    this function in the first pass, to check for a recursive back reference by
1269    name (so that we can make the whole group atomic). In this case, we need check
1270    only up to the current position in the pattern, and that is still OK because
1271    any previous occurrences will have been checked. To make this work, the test
1272    for "end of pattern" is a check against cd->end_pattern in the main loop,
1273    instead of looking for a binary zero. This means that the special first-pass
1274    call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1275    processing items within the loop are OK, because afterwards the main loop will
1276    terminate.)
1277    
1278  Arguments:  Arguments:
1279    ptrptr       address of the current character pointer (updated)    ptrptr       address of the current character pointer (updated)
# Line 1110  Arguments: Line 1281  Arguments:
1281    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1282    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1283    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1284      utf8         TRUE if we are in UTF-8 mode
1285    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1286    
1287  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1288  */  */
1289    
1290  static int  static int
1291  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1292    BOOL xmode, int *count)    BOOL xmode, BOOL utf8, int *count)
1293  {  {
1294  uschar *ptr = *ptrptr;  pcre_uchar *ptr = *ptrptr;
1295  int start_count = *count;  int start_count = *count;
1296  int hwm_count = start_count;  int hwm_count = start_count;
1297  BOOL dup_parens = FALSE;  BOOL dup_parens = FALSE;
# Line 1186  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1358  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1358          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1359        {        {
1360        int term;        int term;
1361        const uschar *thisname;        const pcre_uchar *thisname;
1362        *count += 1;        *count += 1;
1363        if (name == NULL && *count == lorn) return *count;        if (name == NULL && *count == lorn) return *count;
1364        term = *ptr++;        term = *ptr++;
# Line 1194  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1366  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1366        thisname = ptr;        thisname = ptr;
1367        while (*ptr != term) ptr++;        while (*ptr != term) ptr++;
1368        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == ptr - thisname &&
1369            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            STRNCMP_UC_UC(name, thisname, lorn) == 0)
1370          return *count;          return *count;
1371        term++;        term++;
1372        }        }
# Line 1202  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1374  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1374    }    }
1375    
1376  /* Past any initial parenthesis handling, scan for parentheses or vertical  /* Past any initial parenthesis handling, scan for parentheses or vertical
1377  bars. */  bars. Stop if we get to cd->end_pattern. Note that this is important for the
1378    first-pass call when this value is temporarily adjusted to stop at the current
1379    position. So DO NOT change this to a test for binary zero. */
1380    
1381  for (; *ptr != 0; ptr++)  for (; ptr < cd->end_pattern; ptr++)
1382    {    {
1383    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1384    
# Line 1235  for (; *ptr != 0; ptr++) Line 1409  for (; *ptr != 0; ptr++)
1409          {          {
1410          if (ptr[2] == CHAR_E)          if (ptr[2] == CHAR_E)
1411            ptr+= 2;            ptr+= 2;
1412          else if (strncmp((const char *)ptr+2,          else if (STRNCMP_UC_C8(ptr + 2,
1413                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1414            ptr += 4;            ptr += 4;
1415          else          else
# Line 1278  for (; *ptr != 0; ptr++) Line 1452  for (; *ptr != 0; ptr++)
1452    
1453    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1454      {      {
1455      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      ptr++;
1456        while (*ptr != 0)
1457          {
1458          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1459          ptr++;
1460    #ifdef SUPPORT_UTF8
1461          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1462    #endif
1463          }
1464      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1465      continue;      continue;
1466      }      }
# Line 1287  for (; *ptr != 0; ptr++) Line 1469  for (; *ptr != 0; ptr++)
1469    
1470    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1471      {      {
1472      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1473      if (rc > 0) return rc;      if (rc > 0) return rc;
1474      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1475      }      }
# Line 1333  Arguments: Line 1515  Arguments:
1515    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1516    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1517    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1518      utf8         TRUE if we are in UTF-8 mode
1519    
1520  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1521  */  */
1522    
1523  static int  static int
1524  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)  find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1525      BOOL utf8)
1526  {  {
1527  uschar *ptr = (uschar *)cd->start_pattern;  pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1528  int count = 0;  int count = 0;
1529  int rc;  int rc;
1530    
# Line 1351  matching closing parens. That is why we Line 1535  matching closing parens. That is why we
1535    
1536  for (;;)  for (;;)
1537    {    {
1538    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1539    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1540    }    }
1541    
# Line 1367  return rc; Line 1551  return rc;
1551    
1552  /* This is called by several functions that scan a compiled expression looking  /* This is called by several functions that scan a compiled expression looking
1553  for a fixed first character, or an anchoring op code etc. It skips over things  for a fixed first character, or an anchoring op code etc. It skips over things
1554  that do not influence this. For some calls, a change of option is important.  that do not influence this. For some calls, it makes sense to skip negative
1555  For some calls, it makes sense to skip negative forward and all backward  forward and all backward assertions, and also the \b assertion; for others it
1556  assertions, and also the \b assertion; for others it does not.  does not.
1557    
1558  Arguments:  Arguments:
1559    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1560    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1561    
1562  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1563  */  */
1564    
1565  static const uschar*  static const pcre_uchar*
1566  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const pcre_uchar *code, BOOL skipassert)
   BOOL skipassert)  
1567  {  {
1568  for (;;)  for (;;)
1569    {    {
1570    switch ((int)*code)    switch ((int)*code)
1571      {      {
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
       *options = (int)code[1];  
     code += 2;  
     break;  
   
1572      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1573      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1574      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1575      if (!skipassert) return code;      if (!skipassert) return code;
1576      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1577      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1578      break;      break;
1579    
1580      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1414  for (;;) Line 1588  for (;;)
1588      case OP_RREF:      case OP_RREF:
1589      case OP_NRREF:      case OP_NRREF:
1590      case OP_DEF:      case OP_DEF:
1591      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1592      break;      break;
1593    
1594      default:      default:
# Line 1444  and doing the check at the end; a flag s Line 1618  and doing the check at the end; a flag s
1618    
1619  Arguments:  Arguments:
1620    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1621    options  the compiling options    utf8     TRUE in UTF-8 mode
1622    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1623    cd       the "compile data" structure    cd       the "compile data" structure
1624    
1625  Returns:   the fixed length,  Returns:   the fixed length,
1626               or -1 if there is no fixed length,               or -1 if there is no fixed length,
1627               or -2 if \C was encountered               or -2 if \C was encountered (in UTF-8 mode only)
1628               or -3 if an OP_RECURSE item was encountered and atend is FALSE               or -3 if an OP_RECURSE item was encountered and atend is FALSE
1629                 or -4 if an unknown opcode was encountered (internal error)
1630  */  */
1631    
1632  static int  static int
1633  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf8, BOOL atend, compile_data *cd)
1634  {  {
1635  int length = -1;  int length = -1;
1636    
1637  register int branchlength = 0;  register int branchlength = 0;
1638  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1639    
1640  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1641  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1468  branch, check the length against that of Line 1643  branch, check the length against that of
1643  for (;;)  for (;;)
1644    {    {
1645    int d;    int d;
1646    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1647    register int op = *cc;    register int op = *cc;
1648    switch (op)    switch (op)
1649      {      {
1650        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1651        OP_BRA (normal non-capturing bracket) because the other variants of these
1652        opcodes are all concerned with unlimited repeated groups, which of course
1653        are not of fixed length. */
1654    
1655      case OP_CBRA:      case OP_CBRA:
1656      case OP_BRA:      case OP_BRA:
1657      case OP_ONCE:      case OP_ONCE:
1658        case OP_ONCE_NC:
1659      case OP_COND:      case OP_COND:
1660      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf8, atend, cd);
1661      if (d < 0) return d;      if (d < 0) return d;
1662      branchlength += d;      branchlength += d;
1663      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1664      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1665      break;      break;
1666    
1667      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1668      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1669      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1670        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1671        because they all imply an unlimited repeat. */
1672    
1673      case OP_ALT:      case OP_ALT:
1674      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1675      case OP_END:      case OP_END:
1676        case OP_ACCEPT:
1677        case OP_ASSERT_ACCEPT:
1678      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1679        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1680      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1505  for (;;) Line 1688  for (;;)
1688    
1689      case OP_RECURSE:      case OP_RECURSE:
1690      if (!atend) return -3;      if (!atend) return -3;
1691      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1692      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1693      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1694      d = find_fixedlength(cs + 2, options, atend, cd);      d = find_fixedlength(cs + 2, utf8, atend, cd);
1695      if (d < 0) return d;      if (d < 0) return d;
1696      branchlength += d;      branchlength += d;
1697      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1525  for (;;) Line 1708  for (;;)
1708    
1709      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1710    
1711      case OP_REVERSE:      case OP_MARK:
1712        case OP_PRUNE_ARG:
1713        case OP_SKIP_ARG:
1714        case OP_THEN_ARG:
1715        cc += cc[1] + PRIV(OP_lengths)[*cc];
1716        break;
1717    
1718        case OP_CALLOUT:
1719        case OP_CIRC:
1720        case OP_CIRCM:
1721        case OP_CLOSE:
1722        case OP_COMMIT:
1723      case OP_CREF:      case OP_CREF:
     case OP_NCREF:  
     case OP_RREF:  
     case OP_NRREF:  
1724      case OP_DEF:      case OP_DEF:
1725      case OP_OPT:      case OP_DOLL:
1726      case OP_CALLOUT:      case OP_DOLLM:
     case OP_SOD:  
     case OP_SOM:  
     case OP_SET_SOM:  
1727      case OP_EOD:      case OP_EOD:
1728      case OP_EODN:      case OP_EODN:
1729      case OP_CIRC:      case OP_FAIL:
1730      case OP_DOLL:      case OP_NCREF:
1731        case OP_NRREF:
1732      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1733        case OP_PRUNE:
1734        case OP_REVERSE:
1735        case OP_RREF:
1736        case OP_SET_SOM:
1737        case OP_SKIP:
1738        case OP_SOD:
1739        case OP_SOM:
1740        case OP_THEN:
1741      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1742      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1743      break;      break;
1744    
1745      /* Handle literal characters */      /* Handle literal characters */
1746    
1747      case OP_CHAR:      case OP_CHAR:
1748      case OP_CHARNC:      case OP_CHARI:
1749      case OP_NOT:      case OP_NOT:
1750        case OP_NOTI:
1751      branchlength++;      branchlength++;
1752      cc += 2;      cc += 2;
1753  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1754      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1755  #endif  #endif
1756      break;      break;
1757    
# Line 1562  for (;;) Line 1759  for (;;)
1759      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1760    
1761      case OP_EXACT:      case OP_EXACT:
1762        case OP_EXACTI:
1763        case OP_NOTEXACT:
1764        case OP_NOTEXACTI:
1765      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1766      cc += 4;      cc += 2 + IMM2_SIZE;
1767  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1768      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1769  #endif  #endif
1770      break;      break;
1771    
1772      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1773      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1774      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1775      cc += 4;      cc += 1 + IMM2_SIZE + 1;
1776      break;      break;
1777    
1778      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1583  for (;;) Line 1782  for (;;)
1782      cc += 2;      cc += 2;
1783      /* Fall through */      /* Fall through */
1784    
1785        case OP_HSPACE:
1786        case OP_VSPACE:
1787        case OP_NOT_HSPACE:
1788        case OP_NOT_VSPACE:
1789      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1790      case OP_DIGIT:      case OP_DIGIT:
1791      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1595  for (;;) Line 1798  for (;;)
1798      cc++;      cc++;
1799      break;      break;
1800    
1801      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1802        otherwise \C is coded as OP_ALLANY. */
1803    
1804      case OP_ANYBYTE:      case OP_ANYBYTE:
1805      return -2;      return -2;
1806    
1807      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1808    
1809  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1810      case OP_XCLASS:      case OP_XCLASS:
1811      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1812      /* Fall through */      /* Fall through */
1813  #endif  #endif
1814    
1815      case OP_CLASS:      case OP_CLASS:
1816      case OP_NCLASS:      case OP_NCLASS:
1817      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1818    
1819      switch (*cc)      switch (*cc)
1820        {        {
1821          case OP_CRPLUS:
1822          case OP_CRMINPLUS:
1823        case OP_CRSTAR:        case OP_CRSTAR:
1824        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1825        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1622  for (;;) Line 1828  for (;;)
1828    
1829        case OP_CRRANGE:        case OP_CRRANGE:
1830        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1831        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1832        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
1833        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1834        break;        break;
1835    
1836        default:        default:
# Line 1634  for (;;) Line 1840  for (;;)
1840    
1841      /* Anything else is variable length */      /* Anything else is variable length */
1842    
1843      default:      case OP_ANYNL:
1844        case OP_BRAMINZERO:
1845        case OP_BRAPOS:
1846        case OP_BRAPOSZERO:
1847        case OP_BRAZERO:
1848        case OP_CBRAPOS:
1849        case OP_EXTUNI:
1850        case OP_KETRMAX:
1851        case OP_KETRMIN:
1852        case OP_KETRPOS:
1853        case OP_MINPLUS:
1854        case OP_MINPLUSI:
1855        case OP_MINQUERY:
1856        case OP_MINQUERYI:
1857        case OP_MINSTAR:
1858        case OP_MINSTARI:
1859        case OP_MINUPTO:
1860        case OP_MINUPTOI:
1861        case OP_NOTMINPLUS:
1862        case OP_NOTMINPLUSI:
1863        case OP_NOTMINQUERY:
1864        case OP_NOTMINQUERYI:
1865        case OP_NOTMINSTAR:
1866        case OP_NOTMINSTARI:
1867        case OP_NOTMINUPTO:
1868        case OP_NOTMINUPTOI:
1869        case OP_NOTPLUS:
1870        case OP_NOTPLUSI:
1871        case OP_NOTPOSPLUS:
1872        case OP_NOTPOSPLUSI:
1873        case OP_NOTPOSQUERY:
1874        case OP_NOTPOSQUERYI:
1875        case OP_NOTPOSSTAR:
1876        case OP_NOTPOSSTARI:
1877        case OP_NOTPOSUPTO:
1878        case OP_NOTPOSUPTOI:
1879        case OP_NOTQUERY:
1880        case OP_NOTQUERYI:
1881        case OP_NOTSTAR:
1882        case OP_NOTSTARI:
1883        case OP_NOTUPTO:
1884        case OP_NOTUPTOI:
1885        case OP_PLUS:
1886        case OP_PLUSI:
1887        case OP_POSPLUS:
1888        case OP_POSPLUSI:
1889        case OP_POSQUERY:
1890        case OP_POSQUERYI:
1891        case OP_POSSTAR:
1892        case OP_POSSTARI:
1893        case OP_POSUPTO:
1894        case OP_POSUPTOI:
1895        case OP_QUERY:
1896        case OP_QUERYI:
1897        case OP_REF:
1898        case OP_REFI:
1899        case OP_SBRA:
1900        case OP_SBRAPOS:
1901        case OP_SCBRA:
1902        case OP_SCBRAPOS:
1903        case OP_SCOND:
1904        case OP_SKIPZERO:
1905        case OP_STAR:
1906        case OP_STARI:
1907        case OP_TYPEMINPLUS:
1908        case OP_TYPEMINQUERY:
1909        case OP_TYPEMINSTAR:
1910        case OP_TYPEMINUPTO:
1911        case OP_TYPEPLUS:
1912        case OP_TYPEPOSPLUS:
1913        case OP_TYPEPOSQUERY:
1914        case OP_TYPEPOSSTAR:
1915        case OP_TYPEPOSUPTO:
1916        case OP_TYPEQUERY:
1917        case OP_TYPESTAR:
1918        case OP_TYPEUPTO:
1919        case OP_UPTO:
1920        case OP_UPTOI:
1921      return -1;      return -1;
1922    
1923        /* Catch unrecognized opcodes so that when new ones are added they
1924        are not forgotten, as has happened in the past. */
1925    
1926        default:
1927        return -4;
1928      }      }
1929    }    }
1930  /* Control never gets here */  /* Control never gets here */
# Line 1662  Arguments: Line 1951  Arguments:
1951  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1952  */  */
1953    
1954  const uschar *  const pcre_uchar *
1955  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf8, int number)
1956  {  {
1957  for (;;)  for (;;)
1958    {    {
1959    register int c = *code;    register int c = *code;
1960    
1961    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1962    
1963    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1680  for (;;) Line 1970  for (;;)
1970    
1971    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
1972      {      {
1973      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
1974      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
1975      }      }
1976    
1977    /* Handle capturing bracket */    /* Handle capturing bracket */
1978    
1979    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
1980               c == OP_CBRAPOS || c == OP_SCBRAPOS)
1981      {      {
1982      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
1983      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
1984      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
1985      }      }
1986    
1987    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
# Line 1718  for (;;) Line 2009  for (;;)
2009        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2010        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2011        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2012        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2013            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2014        break;        break;
2015    
2016        case OP_MARK:        case OP_MARK:
2017        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
2018        case OP_SKIP_ARG:        case OP_SKIP_ARG:
2019          code += code[1];
2020          break;
2021    
2022        case OP_THEN_ARG:        case OP_THEN_ARG:
2023        code += code[1];        code += code[1];
2024        break;        break;
# Line 1731  for (;;) Line 2026  for (;;)
2026    
2027      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2028    
2029      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2030    
2031    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2032    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
# Line 1741  for (;;) Line 2036  for (;;)
2036      if (utf8) switch(c)      if (utf8) switch(c)
2037        {        {
2038        case OP_CHAR:        case OP_CHAR:
2039        case OP_CHARNC:        case OP_CHARI:
2040        case OP_EXACT:        case OP_EXACT:
2041          case OP_EXACTI:
2042        case OP_UPTO:        case OP_UPTO:
2043          case OP_UPTOI:
2044        case OP_MINUPTO:        case OP_MINUPTO:
2045          case OP_MINUPTOI:
2046        case OP_POSUPTO:        case OP_POSUPTO:
2047          case OP_POSUPTOI:
2048        case OP_STAR:        case OP_STAR:
2049          case OP_STARI:
2050        case OP_MINSTAR:        case OP_MINSTAR:
2051          case OP_MINSTARI:
2052        case OP_POSSTAR:        case OP_POSSTAR:
2053          case OP_POSSTARI:
2054        case OP_PLUS:        case OP_PLUS:
2055          case OP_PLUSI:
2056        case OP_MINPLUS:        case OP_MINPLUS:
2057          case OP_MINPLUSI:
2058        case OP_POSPLUS:        case OP_POSPLUS:
2059          case OP_POSPLUSI:
2060        case OP_QUERY:        case OP_QUERY:
2061          case OP_QUERYI:
2062        case OP_MINQUERY:        case OP_MINQUERY:
2063          case OP_MINQUERYI:
2064        case OP_POSQUERY:        case OP_POSQUERY:
2065        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2066          if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f];
2067        break;        break;
2068        }        }
2069  #else  #else
# Line 1781  Arguments: Line 2089  Arguments:
2089  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2090  */  */
2091    
2092  static const uschar *  static const pcre_uchar *
2093  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf8)
2094  {  {
2095  for (;;)  for (;;)
2096    {    {
# Line 1821  for (;;) Line 2129  for (;;)
2129        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2130        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2131        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2132        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2133            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2134        break;        break;
2135    
2136        case OP_MARK:        case OP_MARK:
2137        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
2138        case OP_SKIP_ARG:        case OP_SKIP_ARG:
2139          code += code[1];
2140          break;
2141    
2142        case OP_THEN_ARG:        case OP_THEN_ARG:
2143        code += code[1];        code += code[1];
2144        break;        break;
# Line 1834  for (;;) Line 2146  for (;;)
2146    
2147      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2148    
2149      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2150    
2151      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2152      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
# Line 1844  for (;;) Line 2156  for (;;)
2156      if (utf8) switch(c)      if (utf8) switch(c)
2157        {        {
2158        case OP_CHAR:        case OP_CHAR:
2159        case OP_CHARNC:        case OP_CHARI:
2160        case OP_EXACT:        case OP_EXACT:
2161          case OP_EXACTI:
2162        case OP_UPTO:        case OP_UPTO:
2163          case OP_UPTOI:
2164        case OP_MINUPTO:        case OP_MINUPTO:
2165          case OP_MINUPTOI:
2166        case OP_POSUPTO:        case OP_POSUPTO:
2167          case OP_POSUPTOI:
2168        case OP_STAR:        case OP_STAR:
2169          case OP_STARI:
2170        case OP_MINSTAR:        case OP_MINSTAR:
2171          case OP_MINSTARI:
2172        case OP_POSSTAR:        case OP_POSSTAR:
2173          case OP_POSSTARI:
2174        case OP_PLUS:        case OP_PLUS:
2175          case OP_PLUSI:
2176        case OP_MINPLUS:        case OP_MINPLUS:
2177          case OP_MINPLUSI:
2178        case OP_POSPLUS:        case OP_POSPLUS:
2179          case OP_POSPLUSI:
2180        case OP_QUERY:        case OP_QUERY:
2181          case OP_QUERYI:
2182        case OP_MINQUERY:        case OP_MINQUERY:
2183          case OP_MINQUERYI:
2184        case OP_POSQUERY:        case OP_POSQUERY:
2185        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2186          if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f];
2187        break;        break;
2188        }        }
2189  #else  #else
# Line 1892  Returns:      TRUE if what is matched co Line 2217  Returns:      TRUE if what is matched co
2217  */  */
2218    
2219  static BOOL  static BOOL
2220  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2221    compile_data *cd)    BOOL utf8, compile_data *cd)
2222  {  {
2223  register int c;  register int c;
2224  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2225       code < endcode;       code < endcode;
2226       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2227    {    {
2228    const uschar *ccode;    const pcre_uchar *ccode;
2229    
2230    c = *code;    c = *code;
2231    
# Line 1914  for (code = first_significant_code(code Line 2239  for (code = first_significant_code(code
2239      continue;      continue;
2240      }      }
2241    
   /* Groups with zero repeats can of course be empty; skip them. */  
   
   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)  
     {  
     code += _pcre_OP_lengths[c];  
     do code += GET(code, 1); while (*code == OP_ALT);  
     c = *code;  
     continue;  
     }  
   
2242    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
2243    implies a subroutine call, we can scan it. */    implies a backward reference subroutine call, we can scan it. If it's a
2244      forward reference subroutine call, we can't. To detect forward reference
2245      we have to scan up the list that is kept in the workspace. This function is
2246      called only when doing the real compile, not during the pre-compile that
2247      measures the size of the compiled pattern. */
2248    
2249    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2250      {      {
2251      BOOL empty_branch = FALSE;      const pcre_uchar *scode;
2252      const uschar *scode = cd->start_code + GET(code, 1);      BOOL empty_branch;
2253    
2254        /* Test for forward reference */
2255    
2256        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2257          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2258    
2259        /* Not a forward reference, test for completed backward reference */
2260    
2261        empty_branch = FALSE;
2262        scode = cd->start_code + GET(code, 1);
2263      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2264    
2265        /* Completed backwards reference */
2266    
2267      do      do
2268        {        {
2269        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf8, cd))
# Line 1942  for (code = first_significant_code(code Line 2274  for (code = first_significant_code(code
2274        scode += GET(scode, 1);        scode += GET(scode, 1);
2275        }        }
2276      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2277    
2278      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2279      continue;      continue;
2280      }      }
2281    
2282      /* Groups with zero repeats can of course be empty; skip them. */
2283    
2284      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2285          c == OP_BRAPOSZERO)
2286        {
2287        code += PRIV(OP_lengths)[c];
2288        do code += GET(code, 1); while (*code == OP_ALT);
2289        c = *code;
2290        continue;
2291        }
2292    
2293      /* A nested group that is already marked as "could be empty" can just be
2294      skipped. */
2295    
2296      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2297          c == OP_SCBRA || c == OP_SCBRAPOS)
2298        {
2299        do code += GET(code, 1); while (*code == OP_ALT);
2300        c = *code;
2301        continue;
2302        }
2303    
2304    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2305    
2306    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2307          c == OP_CBRA || c == OP_CBRAPOS ||
2308          c == OP_ONCE || c == OP_ONCE_NC ||
2309          c == OP_COND)
2310      {      {
2311      BOOL empty_branch;      BOOL empty_branch;
2312      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1982  for (code = first_significant_code(code Line 2340  for (code = first_significant_code(code
2340      {      {
2341      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2342      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2343      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2344      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2345      here. */      here. */
2346    
# Line 1994  for (code = first_significant_code(code Line 2352  for (code = first_significant_code(code
2352    
2353      case OP_CLASS:      case OP_CLASS:
2354      case OP_NCLASS:      case OP_NCLASS:
2355      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2356    
2357  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2358      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
# Line 2035  for (code = first_significant_code(code Line 2393  for (code = first_significant_code(code
2393      case OP_ALLANY:      case OP_ALLANY:
2394      case OP_ANYBYTE:      case OP_ANYBYTE:
2395      case OP_CHAR:      case OP_CHAR:
2396      case OP_CHARNC:      case OP_CHARI:
2397      case OP_NOT:      case OP_NOT:
2398        case OP_NOTI:
2399      case OP_PLUS:      case OP_PLUS:
2400      case OP_MINPLUS:      case OP_MINPLUS:
2401      case OP_POSPLUS:      case OP_POSPLUS:
# Line 2068  for (code = first_significant_code(code Line 2427  for (code = first_significant_code(code
2427      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2428      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2429      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2430      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP
2431          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2432      break;      break;
2433    
2434      /* End of branch */      /* End of branch */
# Line 2076  for (code = first_significant_code(code Line 2436  for (code = first_significant_code(code
2436      case OP_KET:      case OP_KET:
2437      case OP_KETRMAX:      case OP_KETRMAX:
2438      case OP_KETRMIN:      case OP_KETRMIN:
2439        case OP_KETRPOS:
2440      case OP_ALT:      case OP_ALT:
2441      return TRUE;      return TRUE;
2442    
# Line 2084  for (code = first_significant_code(code Line 2445  for (code = first_significant_code(code
2445    
2446  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2447      case OP_STAR:      case OP_STAR:
2448        case OP_STARI:
2449      case OP_MINSTAR:      case OP_MINSTAR:
2450        case OP_MINSTARI:
2451      case OP_POSSTAR:      case OP_POSSTAR:
2452        case OP_POSSTARI:
2453      case OP_QUERY:      case OP_QUERY:
2454        case OP_QUERYI:
2455      case OP_MINQUERY:      case OP_MINQUERY:
2456        case OP_MINQUERYI:
2457      case OP_POSQUERY:      case OP_POSQUERY:
2458      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      case OP_POSQUERYI:
2459        if (utf8 && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
2460      break;      break;
2461    
2462      case OP_UPTO:      case OP_UPTO:
2463        case OP_UPTOI:
2464      case OP_MINUPTO:      case OP_MINUPTO:
2465        case OP_MINUPTOI:
2466      case OP_POSUPTO:      case OP_POSUPTO:
2467      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      case OP_POSUPTOI:
2468        if (utf8 && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
2469      break;      break;
2470  #endif  #endif
2471    
# Line 2105  for (code = first_significant_code(code Line 2475  for (code = first_significant_code(code
2475      case OP_MARK:      case OP_MARK:
2476      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
2477      case OP_SKIP_ARG:      case OP_SKIP_ARG:
2478        code += code[1];
2479        break;
2480    
2481      case OP_THEN_ARG:      case OP_THEN_ARG:
2482      code += code[1];      code += code[1];
2483      break;      break;
# Line 2129  return TRUE; Line 2502  return TRUE;
2502  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2503  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2504  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2505    This function is called only during the real compile, not during the
2506    pre-compile.
2507    
2508  Arguments:  Arguments:
2509    code        points to start of the recursion    code        points to start of the recursion
# Line 2141  Returns:      TRUE if what is matched co Line 2516  Returns:      TRUE if what is matched co
2516  */  */
2517    
2518  static BOOL  static BOOL
2519  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2520    BOOL utf8, compile_data *cd)    branch_chain *bcptr, BOOL utf8, compile_data *cd)
2521  {  {
2522  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2523    {    {
# Line 2179  where Perl recognizes it as the POSIX cl Line 2554  where Perl recognizes it as the POSIX cl
2554  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2555  I think.  I think.
2556    
2557    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2558    It seems that the appearance of a nested POSIX class supersedes an apparent
2559    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2560    a digit.
2561    
2562    In Perl, unescaped square brackets may also appear as part of class names. For
2563    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2564    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2565    seem right at all. PCRE does not allow closing square brackets in POSIX class
2566    names.
2567    
2568  Arguments:  Arguments:
2569    ptr      pointer to the initial [    ptr      pointer to the initial [
2570    endptr   where to return the end pointer    endptr   where to return the end pointer
# Line 2187  Returns:   TRUE or FALSE Line 2573  Returns:   TRUE or FALSE
2573  */  */
2574    
2575  static BOOL  static BOOL
2576  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2577  {  {
2578  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2579  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2580  for (++ptr; *ptr != 0; ptr++)  for (++ptr; *ptr != 0; ptr++)
2581    {    {
2582    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2583        ptr++;
2584      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2585      else
2586      {      {
     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;  
2587      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2588        {        {
2589        *endptr = ptr;        *endptr = ptr;
2590        return TRUE;        return TRUE;
2591        }        }
2592        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2593             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2594              ptr[1] == CHAR_EQUALS_SIGN) &&
2595            check_posix_syntax(ptr, endptr))
2596          return FALSE;
2597      }      }
2598    }    }
2599  return FALSE;  return FALSE;
# Line 2224  Returns:     a value representing the na Line 2617  Returns:     a value representing the na
2617  */  */
2618    
2619  static int  static int
2620  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2621  {  {
2622  const char *pn = posix_names;  const char *pn = posix_names;
2623  register int yield = 0;  register int yield = 0;
2624  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2625    {    {
2626    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2627      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2628    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2629    yield++;    yield++;
2630    }    }
# Line 2271  Returns:     nothing Line 2664  Returns:     nothing
2664  */  */
2665    
2666  static void  static void
2667  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf8, compile_data *cd,
2668    uschar *save_hwm)    pcre_uchar *save_hwm)
2669  {  {
2670  uschar *ptr = group;  pcre_uchar *ptr = group;
2671    
2672  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf8)) != NULL)
2673    {    {
2674    int offset;    int offset;
2675    uschar *hc;    pcre_uchar *hc;
2676    
2677    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2678    reference. */    reference. */
# Line 2324  Arguments: Line 2717  Arguments:
2717  Returns:         new code pointer  Returns:         new code pointer
2718  */  */
2719    
2720  static uschar *  static pcre_uchar *
2721  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2722  {  {
2723  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2724  *code++ = 255;  *code++ = 255;
2725  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2726  PUT(code, LINK_SIZE, 0);                       /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2727  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2728  }  }
2729    
2730    
# Line 2353  Returns:             nothing Line 2746  Returns:             nothing
2746  */  */
2747    
2748  static void  static void
2749  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2750  {  {
2751  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2752  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
# Line 2436  switch(ptype) Line 2829  switch(ptype)
2829            prop->chartype == ucp_Lt) == negated;            prop->chartype == ucp_Lt) == negated;
2830    
2831    case PT_GC:    case PT_GC:
2832    return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2833    
2834    case PT_PC:    case PT_PC:
2835    return (pdata == prop->chartype) == negated;    return (pdata == prop->chartype) == negated;
# Line 2447  switch(ptype) Line 2840  switch(ptype)
2840    /* These are specials */    /* These are specials */
2841    
2842    case PT_ALNUM:    case PT_ALNUM:
2843    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2844            _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2845    
2846    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2847    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2848            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2849            == negated;            == negated;
2850    
2851    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2852    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2853            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2854            c == CHAR_FF || c == CHAR_CR)            c == CHAR_FF || c == CHAR_CR)
2855            == negated;            == negated;
2856    
2857    case PT_WORD:    case PT_WORD:
2858    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2859            _pcre_ucp_gentype[prop->chartype] == ucp_N ||            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2860            c == CHAR_UNDERSCORE) == negated;            c == CHAR_UNDERSCORE) == negated;
2861    }    }
2862  return FALSE;  return FALSE;
# Line 2491  Returns:        TRUE if possessifying is Line 2884  Returns:        TRUE if possessifying is
2884  */  */
2885    
2886  static BOOL  static BOOL
2887  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,  check_auto_possessive(const pcre_uchar *previous, BOOL utf8,
2888    int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2889  {  {
2890  int c, next;  int c, next;
2891  int op_code = *previous++;  int op_code = *previous++;
# Line 2506  if ((options & PCRE_EXTENDED) != 0) Line 2899  if ((options & PCRE_EXTENDED) != 0)
2899      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2900      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2901        {        {
2902        while (*(++ptr) != 0)        ptr++;
2903          while (*ptr != 0)
2904            {
2905          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2906            ptr++;
2907    #ifdef SUPPORT_UTF8
2908            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2909    #endif
2910            }
2911        }        }
2912      else break;      else break;
2913      }      }
# Line 2543  if ((options & PCRE_EXTENDED) != 0) Line 2943  if ((options & PCRE_EXTENDED) != 0)
2943      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2944      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2945        {        {
2946        while (*(++ptr) != 0)        ptr++;
2947          while (*ptr != 0)
2948            {
2949          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2950            ptr++;
2951    #ifdef SUPPORT_UTF8
2952            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2953    #endif
2954            }
2955        }        }
2956      else break;      else break;
2957      }      }
# Line 2553  if ((options & PCRE_EXTENDED) != 0) Line 2960  if ((options & PCRE_EXTENDED) != 0)
2960  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
2961    
2962  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2963    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2964      return FALSE;      return FALSE;
2965    
2966  /* Now compare the next item with the previous opcode. First, handle cases when  /* Now compare the next item with the previous opcode. First, handle cases when
# Line 2569  if (next >= 0) switch(op_code) Line 2976  if (next >= 0) switch(op_code)
2976  #endif  #endif
2977    return c != next;    return c != next;
2978    
2979    /* For CHARNC (caseless character) we must check the other case. If we have    /* For CHARI (caseless character) we must check the other case. If we have
2980    Unicode property support, we can use it to test the other case of    Unicode property support, we can use it to test the other case of
2981    high-valued characters. */    high-valued characters. */
2982    
2983    case OP_CHARNC:    case OP_CHARI:
2984  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2985    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
2986  #else  #else
# Line 2596  if (next >= 0) switch(op_code) Line 3003  if (next >= 0) switch(op_code)
3003  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
3004    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
3005    
3006    /* For OP_NOT, its data is always a single-byte character. */    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
3007      opcodes are not used for multi-byte characters, because they are coded using
3008      an XCLASS instead. */
3009    
3010    case OP_NOT:    case OP_NOT:
3011      return (c = *previous) == next;
3012    
3013      case OP_NOTI:
3014    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
   if ((options & PCRE_CASELESS) == 0) return FALSE;  
3015  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3016    if (utf8)    if (utf8)
3017      {      {
# Line 2705  replaced by OP_PROP codes when PCRE_UCP Line 3116  replaced by OP_PROP codes when PCRE_UCP
3116  switch(op_code)  switch(op_code)
3117    {    {
3118    case OP_CHAR:    case OP_CHAR:
3119    case OP_CHARNC:    case OP_CHARI:
3120  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3121    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3122  #else  #else
# Line 2811  switch(op_code) Line 3222  switch(op_code)
3222        to the original \d etc. At this point, ptr will point to a zero byte. */        to the original \d etc. At this point, ptr will point to a zero byte. */
3223    
3224        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3225          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3226            return FALSE;            return FALSE;
3227    
3228        /* Do the property check. */        /* Do the property check. */
# Line 2889  Arguments: Line 3300  Arguments:
3300    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3301    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3302    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3303    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3304    reqbyteptr     set to the last literal character required, else < 0    reqcharptr     set to the last literal character required, else < 0
3305    bcptr          points to current branch chain    bcptr          points to current branch chain
3306      cond_depth     conditional nesting depth
3307    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3308    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3309                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 2901  Returns:         TRUE on success Line 3313  Returns:         TRUE on success
3313  */  */
3314    
3315  static BOOL  static BOOL
3316  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3317    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3318      pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3319    compile_data *cd, int *lengthptr)    compile_data *cd, int *lengthptr)
3320  {  {
3321  int repeat_type, op_type;  int repeat_type, op_type;
3322  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3323  int bravalue = 0;  int bravalue = 0;
3324  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3325  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
3326  int zeroreqbyte, zerofirstbyte;  pcre_int32 zeroreqchar, zerofirstchar;
3327  int req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
3328  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3329  int after_manual_callout = 0;  int after_manual_callout = 0;
3330  int length_prevgroup = 0;  int length_prevgroup = 0;
3331  register int c;  register int c;
3332  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3333  uschar *last_code = code;  pcre_uchar *last_code = code;
3334  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3335  uschar *tempcode;  pcre_uchar *tempcode;
3336  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3337  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3338  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3339  const uschar *tempptr;  const pcre_uchar *tempptr;
3340  const uschar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
3341  uschar *previous = NULL;  pcre_uchar *previous = NULL;
3342  uschar *previous_callout = NULL;  pcre_uchar *previous_callout = NULL;
3343  uschar *save_hwm = NULL;  pcre_uchar *save_hwm = NULL;
3344  uschar classbits[32];  pcre_uint8 classbits[32];
3345    
3346    /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3347    must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3348    dynamically as we process the pattern. */
3349    
3350  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
 BOOL class_utf8;  
3351  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
3352  uschar *class_utf8data;  pcre_uint8 utf8_char[6];
 uschar *class_utf8data_base;  
 uschar utf8_char[6];  
3353  #else  #else
3354  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
3355  uschar *utf8_char = NULL;  #endif
3356    
3357    /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3358    
3359    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3360    BOOL xclass;
3361    pcre_uchar *class_uchardata;
3362    pcre_uchar *class_uchardata_base;
3363  #endif  #endif
3364    
3365  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 2952  greedy_non_default = greedy_default ^ 1; Line 3373  greedy_non_default = greedy_default ^ 1;
3373    
3374  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3375  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3376  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3377  find one.  find one.
3378    
3379  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3380  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3381  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3382  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3383    
3384  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3385    
3386  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3387  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3388  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS leaves the lower 28 bit empty. It is added into the
3389  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3390    value. This is used only for ASCII characters. */
3391    
3392  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3393    
3394  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3395    
# Line 2985  for (;; ptr++) Line 3407  for (;; ptr++)
3407    int recno;    int recno;
3408    int refsign;    int refsign;
3409    int skipbytes;    int skipbytes;
3410    int subreqbyte;    int subreqchar;
3411    int subfirstbyte;    int subfirstchar;
3412    int terminator;    int terminator;
3413    int mclength;    int mclength;
3414    uschar mcbuffer[8];    int tempbracount;
3415      pcre_uchar mcbuffer[8];
3416    
3417    /* Get next byte in the pattern */    /* Get next byte in the pattern */
3418    
# Line 3036  for (;; ptr++) Line 3459  for (;; ptr++)
3459        }        }
3460    
3461      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3462      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3463          c));
3464    
3465      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3466      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 3046  for (;; ptr++) Line 3470  for (;; ptr++)
3470        {        {
3471        if (previous > orig_code)        if (previous > orig_code)
3472          {          {
3473          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3474          code -= previous - orig_code;          code -= previous - orig_code;
3475          previous = orig_code;          previous = orig_code;
3476          }          }
# Line 3110  for (;; ptr++) Line 3534  for (;; ptr++)
3534      previous_callout = NULL;      previous_callout = NULL;
3535      }      }
3536    
3537    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3538    
3539    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3540      {      {
3541      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
3542      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3543        {        {
3544        while (*(++ptr) != 0)        ptr++;
3545          while (*ptr != 0)
3546          {          {
3547          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3548            ptr++;
3549    #ifdef SUPPORT_UTF8
3550            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3551    #endif
3552          }          }
3553        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3554    
# Line 3142  for (;; ptr++) Line 3571  for (;; ptr++)
3571      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3572      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3573      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
3574      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3575      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
3576      *codeptr = code;      *codeptr = code;
3577      *ptrptr = ptr;      *ptrptr = ptr;
3578      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 3164  for (;; ptr++) Line 3593  for (;; ptr++)
3593      the setting of any following char as a first character. */      the setting of any following char as a first character. */
3594    
3595      case CHAR_CIRCUMFLEX_ACCENT:      case CHAR_CIRCUMFLEX_ACCENT:
3596        previous = NULL;
3597      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3598        {        {
3599        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3600          *code++ = OP_CIRCM;
3601        }        }
3602      previous = NULL;      else *code++ = OP_CIRC;
     *code++ = OP_CIRC;  
3603      break;      break;
3604    
3605      case CHAR_DOLLAR_SIGN:      case CHAR_DOLLAR_SIGN:
3606      previous = NULL;      previous = NULL;
3607      *code++ = OP_DOLL;      *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3608      break;      break;
3609    
3610      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3611      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3612    
3613      case CHAR_DOT:      case CHAR_DOT:
3614      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3615      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
3616      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
3617      previous = code;      previous = code;
3618      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3619      break;      break;
# Line 3238  for (;; ptr++) Line 3668  for (;; ptr++)
3668          {          {
3669          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
3670            ptr++;            ptr++;
3671          else if (strncmp((const char *)ptr+1,          else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
                           STR_Q STR_BACKSLASH STR_E, 3) == 0)  
3672            ptr += 3;            ptr += 3;
3673          else          else
3674            break;            break;
# Line 3258  for (;; ptr++) Line 3687  for (;; ptr++)
3687          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3688        {        {
3689        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3690        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3691        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
3692        break;        break;
3693        }        }
3694    
# Line 3281  for (;; ptr++) Line 3710  for (;; ptr++)
3710      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3711      */      */
3712    
3713      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3714    
3715  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3716      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;                           /* No chars >= 256 */
3717      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3718      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3719  #endif  #endif
3720    
3721      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3295  for (;; ptr++) Line 3724  for (;; ptr++)
3724    
3725      if (c != 0) do      if (c != 0) do
3726        {        {
3727        const uschar *oldptr;        const pcre_uchar *oldptr;
3728    
3729  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3730        if (utf8 && c > 127)        if (utf8 && c > 127)
3731          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3732          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3733          }          }
3734    #endif
3735    
3736        /* In the pre-compile phase, accumulate the length of any UTF-8 extra  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3737          /* In the pre-compile phase, accumulate the length of any extra
3738        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
3739        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
3740        (which is on the stack). */        (which is on the stack). */
3741    
3742        if (lengthptr != NULL)        if (lengthptr != NULL)
3743          {          {
3744          *lengthptr += class_utf8data - class_utf8data_base;          *lengthptr += class_uchardata - class_uchardata_base;
3745          class_utf8data = class_utf8data_base;          class_uchardata = class_uchardata_base;
3746          }          }
   
3747  #endif  #endif
3748    
3749        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 3341  for (;; ptr++) Line 3771  for (;; ptr++)
3771          {          {
3772          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3773          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3774          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3775          uschar pbits[32];          pcre_uint8 pbits[32];
3776    
3777          if (ptr[1] != CHAR_COLON)          if (ptr[1] != CHAR_COLON)
3778            {            {
# Line 3397  for (;; ptr++) Line 3827  for (;; ptr++)
3827          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3828    
3829          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3830            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3831    
3832          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3833    
# Line 3459  for (;; ptr++) Line 3889  for (;; ptr++)
3889    
3890          if (c < 0)          if (c < 0)
3891            {            {
3892            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3893            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
3894    
3895            switch (-c)            switch (-c)
# Line 3494  for (;; ptr++) Line 3924  for (;; ptr++)
3924              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3925              continue;              continue;
3926    
3927                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3928                if it was previously set by something earlier in the character
3929                class. */
3930    
3931              case ESC_s:              case ESC_s:
3932              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
3933              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
3934                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3935              continue;              continue;
3936    
3937              case ESC_S:              case ESC_S:
# Line 3509  for (;; ptr++) Line 3944  for (;; ptr++)
3944              SETBIT(classbits, 0x09); /* VT */              SETBIT(classbits, 0x09); /* VT */
3945              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
3946              SETBIT(classbits, 0xa0); /* NSBP */              SETBIT(classbits, 0xa0); /* NSBP */
3947  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3948              if (utf8)              if (utf8)
3949                {                {
3950                class_utf8 = TRUE;                xclass = TRUE;
3951                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
3952                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x1680, class_uchardata);
3953                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
3954                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x180e, class_uchardata);
3955                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
3956                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x2000, class_uchardata);
3957                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x200A, class_uchardata);
3958                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
3959                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x202f, class_uchardata);
3960                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
3961                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x205f, class_uchardata);
3962                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
3963                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x3000, class_uchardata);
3964                }                }
3965  #endif  #endif
3966              continue;              continue;
# Line 3544  for (;; ptr++) Line 3979  for (;; ptr++)
3979                classbits[c] |= x;                classbits[c] |= x;
3980                }                }
3981    
3982  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3983              if (utf8)              if (utf8)
3984                {                {
3985                class_utf8 = TRUE;                xclass = TRUE;
3986                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
3987                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
3988                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x167f, class_uchardata);
3989                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
3990                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x1681, class_uchardata);
3991                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x180d, class_uchardata);
3992                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
3993                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x180f, class_uchardata);
3994                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x1fff, class_uchardata);
3995                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
3996                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x200B, class_uchardata);
3997                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x202e, class_uchardata);
3998                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
3999                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x2030, class_uchardata);
4000                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x205e, class_uchardata);
4001                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4002                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x2060, class_uchardata);
4003                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x2fff, class_uchardata);
4004                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4005                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x3001, class_uchardata);
4006                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
4007                }                }
4008  #endif  #endif
4009              continue;              continue;
# Line 3579  for (;; ptr++) Line 4014  for (;; ptr++)
4014              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, 0x0c); /* FF */
4015              SETBIT(classbits, 0x0d); /* CR */              SETBIT(classbits, 0x0d); /* CR */
4016              SETBIT(classbits, 0x85); /* NEL */              SETBIT(classbits, 0x85); /* NEL */
4017  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4018              if (utf8)              if (utf8)
4019                {                {
4020                class_utf8 = TRUE;                xclass = TRUE;
4021                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4022                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x2028, class_uchardata);
4023                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
4024                }                }
4025  #endif  #endif
4026              continue;              continue;
# Line 3607  for (;; ptr++) Line 4042  for (;; ptr++)
4042                classbits[c] |= x;                classbits[c] |= x;
4043                }                }
4044    
4045  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4046              if (utf8)              if (utf8)
4047                {                {
4048                class_utf8 = TRUE;                xclass = TRUE;
4049                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4050                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
4051                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x2027, class_uchardata);
4052                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4053                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
4054                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
4055                }                }
4056  #endif  #endif
4057              continue;              continue;
# Line 3629  for (;; ptr++) Line 4064  for (;; ptr++)
4064                int pdata;                int pdata;
4065                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4066                if (ptype < 0) goto FAILED;                if (ptype < 0) goto FAILED;
4067                class_utf8 = TRUE;                xclass = TRUE;
4068                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_uchardata++ = ((-c == ESC_p) != negated)?
4069                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4070                *class_utf8data++ = ptype;                *class_uchardata++ = ptype;
4071                *class_utf8data++ = pdata;                *class_uchardata++ = pdata;
4072                class_charcount -= 2;   /* Not a < 256 character */                class_charcount -= 2;   /* Not a < 256 character */
4073                continue;                continue;
4074                }                }
# Line 3655  for (;; ptr++) Line 4090  for (;; ptr++)
4090            }            }
4091    
4092          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4093          greater than 256 in UTF-8 mode. */          greater than 255 in UTF or 16-bit mode. */
4094    
4095          }   /* End of backslash handling */          }   /* End of backslash handling */
4096    
# Line 3753  for (;; ptr++) Line 4188  for (;; ptr++)
4188          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4189          available. */          available. */
4190    
4191  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4192          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4193    #endif
4194    #ifndef COMPILE_PCRE8
4195            if (d > 255)
4196    #endif
4197    #if defined SUPPORT_UTF || defined COMPILE_PCRE16
4198            {            {
4199            class_utf8 = TRUE;            xclass = TRUE;
4200    
4201            /* With UCP support, we can find the other case equivalents of            /* With UCP support, we can find the other case equivalents of
4202            the relevant characters. There may be several ranges. Optimize how            the relevant characters. There may be several ranges. Optimize how
# Line 3789  for (;; ptr++) Line 4229  for (;; ptr++)
4229    
4230                if (occ == ocd)                if (occ == ocd)
4231                  {                  {
4232                  *class_utf8data++ = XCL_SINGLE;                  *class_uchardata++ = XCL_SINGLE;
4233                  }                  }
4234                else                else
4235                  {                  {
4236                  *class_utf8data++ = XCL_RANGE;                  *class_uchardata++ = XCL_RANGE;
4237                  class_utf8data += _pcre_ord2utf8(occ, class_utf8data);                  class_uchardata += PRIV(ord2utf8)(occ, class_uchardata);
4238                  }                  }
4239                class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);                class_uchardata += PRIV(ord2utf8)(ocd, class_uchardata);
4240                }                }
4241              }              }
4242  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 3804  for (;; ptr++) Line 4244  for (;; ptr++)
4244            /* Now record the original range, possibly modified for UCP caseless            /* Now record the original range, possibly modified for UCP caseless
4245            overlapping ranges. */            overlapping ranges. */
4246    
4247            *class_utf8data++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4248            class_utf8data += _pcre_ord2utf8(c, class_utf8data);  #ifdef SUPPORT_UTF
4249            class_utf8data += _pcre_ord2utf8(d, class_utf8data);            class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
4250              class_uchardata += PRIV(ord2utf8)(d, class_uchardata);
4251    #else
4252              *class_uchardata++ = c;
4253              *class_uchardata++ = d;
4254    #endif
4255    
4256            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4257            caseless matching for UTF-8 characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
4258            for the smaller ones. */            for the smaller ones. In 16-bit mode without UTF, we can still
4259              use the bit map for the part of the range up to 255. */
4260    
4261  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4262            continue;    /* With next character in the class */            continue;    /* With next character in the class */
4263  #else  #else
4264    #ifdef SUPPORT_UTF
4265            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
   
4266            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
   
4267            d = 127;            d = 127;
4268    #else
4269              if (c > 255) continue;
4270              /* Adjust upper limit and fall through to set up the map */
4271              d = 255;
4272    #endif  /* SUPPORT_UTF */
4273  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
4274            }            }
4275  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
4276    
4277          /* We use the bit map for all cases when not in UTF-8 mode; else          /* We use the bit map for 8 bit mode, or when the characters fall
4278          ranges that lie entirely within 0-127 when there is UCP support; else          partially or entirely within the [0-255] ([0-127] for UCP) ranges. */
         for partial ranges without UCP support. */  
4279    
4280          class_charcount += d - c + 1;          class_charcount += d - c + 1;
4281          class_lastchar = d;          class_lastchar = d;
# Line 3855  for (;; ptr++) Line 4303  for (;; ptr++)
4303    
4304        /* Handle a character that cannot go in the bit map */        /* Handle a character that cannot go in the bit map */
4305    
4306  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4307        if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))        if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4308    #endif
4309    #ifndef COMPILE_PCRE8
4310          if (c > 255)
4311    #endif
4312    #if defined SUPPORT_UTF || defined COMPILE_PCRE16
4313          {          {
4314          class_utf8 = TRUE;          xclass = TRUE;
4315          *class_utf8data++ = XCL_SINGLE;          *class_uchardata++ = XCL_SINGLE;
4316          class_utf8data += _pcre_ord2utf8(c, class_utf8data);  #ifdef SUPPORT_UTF
4317            class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
4318    #else
4319            *class_uchardata++ = c;
4320    #endif
4321    
4322  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4323          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
# Line 3868  for (;; ptr++) Line 4325  for (;; ptr++)
4325            unsigned int othercase;            unsigned int othercase;
4326            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((othercase = UCD_OTHERCASE(c)) != c)
4327              {              {
4328              *class_utf8data++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4329              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_uchardata += PRIV(ord2utf8)(othercase, class_uchardata);
4330              }              }
4331            }            }
4332  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 3915  for (;; ptr++) Line 4372  for (;; ptr++)
4372    
4373      In UTF-8 mode, we can optimize the negative case only if there were no      In UTF-8 mode, we can optimize the negative case only if there were no
4374      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4375      operate on single-bytes only. This is an historical hangover. Maybe one day      operate on single-byte characters only. This is an historical hangover.
4376      we can tidy these opcodes to handle multi-byte characters.      Maybe one day we can tidy these opcodes to handle multi-byte characters.
4377    
4378      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
4379      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4380      that OP_NOT does not support multibyte characters. In the positive case, it      Note that OP_NOT[I] does not support multibyte characters. In the positive
4381      can cause firstbyte to be set. Otherwise, there can be no first char if      case, it can cause firstchar to be set. Otherwise, there can be no first
4382      this item is first, whatever repeat count may follow. In the case of      char if this item is first, whatever repeat count may follow. In the case
4383      reqbyte, save the previous value for reinstating. */      of reqchar, save the previous value for reinstating. */
4384    
4385  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4386      if (class_charcount == 1 && !class_utf8 &&      if (class_charcount == 1 && !xclass &&
4387        (!utf8 || !negate_class || class_lastchar < 128))        (!utf8 || !negate_class || class_lastchar < 128))
4388  #else  #elif defined COMPILE_PCRE8
4389      if (class_charcount == 1)      if (class_charcount == 1)
4390    #else
4391        if (class_charcount == 1 && !xclass)
4392  #endif  #endif
4393        {        {
4394        zeroreqbyte = reqbyte;        zeroreqchar = reqchar;
4395    
4396        /* The OP_NOT opcode works on one-byte characters only. */        /* The OP_NOT[I] opcodes work on one-byte characters only. */
4397    
4398        if (negate_class)        if (negate_class)
4399          {          {
4400          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4401          zerofirstbyte = firstbyte;          zerofirstchar = firstchar;
4402          *code++ = OP_NOT;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4403          *code++ = class_lastchar;          *code++ = class_lastchar;
4404          break;          break;
4405          }          }
# Line 3950  for (;; ptr++) Line 4409  for (;; ptr++)
4409    
4410  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4411        if (utf8 && class_lastchar > 127)        if (utf8 && class_lastchar > 127)
4412          mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);          mclength = PRIV(ord2utf8)(class_lastchar, mcbuffer);
4413        else        else
4414  #endif  #endif
4415          {          {
# Line 3962  for (;; ptr++) Line 4421  for (;; ptr++)
4421    
4422      /* The general case - not the one-char optimization. If this is the first      /* The general case - not the one-char optimization. If this is the first
4423      thing in the branch, there can be no first char setting, whatever the      thing in the branch, there can be no first char setting, whatever the
4424      repeat count. Any reqbyte setting must remain unchanged after any kind of      repeat count. Any reqchar setting must remain unchanged after any kind of
4425      repeat. */      repeat. */
4426    
4427      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4428      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
4429      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
4430    
4431      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4432      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
# Line 3977  for (;; ptr++) Line 4436  for (;; ptr++)
4436      be listed) there are no characters < 256, we can omit the bitmap in the      be listed) there are no characters < 256, we can omit the bitmap in the
4437      actual compiled code. */      actual compiled code. */
4438    
4439  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4440      if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))      if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4441    #endif
4442    #ifndef COMPILE_PCRE8
4443        if (xclass && !should_flip_negation)
4444    #endif
4445    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4446        {        {
4447        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4448        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
4449        code += LINK_SIZE;        code += LINK_SIZE;
4450        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT:0;
4451    
4452        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4453        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
# Line 3991  for (;; ptr++) Line 4455  for (;; ptr++)
4455        if (class_charcount > 0)        if (class_charcount > 0)
4456          {          {
4457          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4458          memmove(code + 32, code, class_utf8data - code);          memmove(code + (32 / sizeof(pcre_uchar)), code,
4459              IN_UCHARS(class_uchardata - code));
4460          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
4461          code = class_utf8data + 32;          code = class_uchardata + (32 / sizeof(pcre_uchar));
4462          }          }
4463        else code = class_utf8data;        else code = class_uchardata;
4464    
4465        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4466    
# Line 4011  for (;; ptr++) Line 4476  for (;; ptr++)
4476      negating it if necessary. */      negating it if necessary. */
4477    
4478      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4479      if (negate_class)      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
       {  
       if (lengthptr == NULL)    /* Save time in the pre-compile phase */  
         for (c = 0; c < 32; c++) code[c] = ~classbits[c];  
       }  
     else  
4480        {        {
4481          if (negate_class)
4482            for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4483        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4484        }        }
4485      code += 32;      code += 32 / sizeof(pcre_uchar);
4486      break;      break;
4487    
4488    
# Line 4057  for (;; ptr++) Line 4519  for (;; ptr++)
4519    
4520      if (repeat_min == 0)      if (repeat_min == 0)
4521        {        {
4522        firstbyte = zerofirstbyte;    /* Adjust for zero repeat */        firstchar = zerofirstchar;    /* Adjust for zero repeat */
4523        reqbyte = zeroreqbyte;        /* Ditto */        reqchar = zeroreqchar;        /* Ditto */
4524        }        }
4525    
4526      /* Remember whether this is a variable length repeat */      /* Remember whether this is a variable length repeat */
# Line 4068  for (;; ptr++) Line 4530  for (;; ptr++)
4530      op_type = 0;                    /* Default single-char op codes */      op_type = 0;                    /* Default single-char op codes */
4531      possessive_quantifier = FALSE;  /* Default not possessive quantifier */      possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4532    
4533      /* Save start of previous item, in case we have to move it up to make space      /* Save start of previous item, in case we have to move it up in order to
4534      for an inserted OP_ONCE for the additional '+' extension. */      insert something before it. */
4535    
4536      tempcode = previous;      tempcode = previous;
4537    
# Line 4092  for (;; ptr++) Line 4554  for (;; ptr++)
4554        }        }
4555      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4556    
4557        /* If previous was a recursion call, wrap it in atomic brackets so that
4558        previous becomes the atomic group. All recursions were so wrapped in the
4559        past, but it no longer happens for non-repeated recursions. In fact, the
4560        repeated ones could be re-implemented independently so as not to need this,
4561        but for the moment we rely on the code for repeating groups. */
4562    
4563        if (*previous == OP_RECURSE)
4564          {
4565          memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4566          *previous = OP_ONCE;
4567          PUT(previous, 1, 2 + 2*LINK_SIZE);
4568          previous[2 + 2*LINK_SIZE] = OP_KET;
4569          PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4570          code += 2 + 2 * LINK_SIZE;
4571          length_prevgroup = 3 + 3*LINK_SIZE;
4572    
4573          /* When actually compiling, we need to check whether this was a forward
4574          reference, and if so, adjust the offset. */
4575    
4576          if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4577            {
4578            int offset = GET(cd->hwm, -LINK_SIZE);
4579            if (offset == previous + 1 - cd->start_code)
4580              PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4581            }
4582          }
4583    
4584        /* Now handle repetition for the different types of item. */
4585    
4586      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4587      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
4588      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqchar - it might not be if a sequence such as x{3} is
4589      the first thing in a branch because the x will have gone into firstbyte      the first thing in a branch because the x will have gone into firstchar
4590      instead.  */      instead.  */
4591    
4592      if (*previous == OP_CHAR || *previous == OP_CHARNC)      if (*previous == OP_CHAR || *previous == OP_CHARI)
4593        {        {
4594          op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4595    
4596        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF-8 characters that take up more than one byte. It's
4597        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4598        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus 0x80 to flag that it's a
# Line 4108  for (;; ptr++) Line 4601  for (;; ptr++)
4601  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4602        if (utf8 && (code[-1] & 0x80) != 0)        if (utf8 && (code[-1] & 0x80) != 0)
4603          {          {
4604          uschar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4605          while((*lastchar & 0xc0) == 0x80) lastchar--;          while((*lastchar & 0xc0) == 0x80) lastchar--;
4606          c = code - lastchar;            /* Length of UTF-8 character */          c = code - lastchar;            /* Length of UTF-8 character */
4607          memcpy(utf8_char, lastchar, c); /* Save the char */          memcpy(utf8_char, lastchar, c); /* Save the char */
# Line 4122  for (;; ptr++) Line 4615  for (;; ptr++)
4615    
4616          {          {
4617          c = code[-1];          c = code[-1];
4618          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
4619          }          }
4620    
4621        /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
# Line 4144  for (;; ptr++) Line 4637  for (;; ptr++)
4637      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
4638      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
4639      character repeats by setting op_type to add a suitable offset into      character repeats by setting op_type to add a suitable offset into
4640      repeat_type. We can also test for auto-possessification. OP_NOT is      repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
4641      currently used only for single-byte chars. */      are currently used only for single-byte chars. */
4642    
4643      else if (*previous == OP_NOT)      else if (*previous == OP_NOT || *previous == OP_NOTI)
4644        {        {
4645        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
4646        c = previous[1];        c = previous[1];
4647        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4648            repeat_max < 0 &&            repeat_max < 0 &&
# Line 4170  for (;; ptr++) Line 4663  for (;; ptr++)
4663    
4664      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
4665        {        {
4666        uschar *oldcode;        pcre_uchar *oldcode;
4667        int prop_type, prop_value;        int prop_type, prop_value;
4668        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4669        c = *previous;        c = *previous;
# Line 4343  for (;; ptr++) Line 4836  for (;; ptr++)
4836    
4837      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
4838               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
4839  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8
4840               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
4841  #endif  #endif
4842               *previous == OP_REF)               *previous == OP_REF ||
4843                 *previous == OP_REFI)
4844        {        {
4845        if (repeat_max == 0)        if (repeat_max == 0)
4846          {          {
# Line 4380  for (;; ptr++) Line 4874  for (;; ptr++)
4874        }        }
4875    
4876      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
4877      cases. */      cases. Note that at this point we can encounter only the "basic" bracket
4878        opcodes such as BRA and CBRA, as this is the place where they get converted
4879        into the more special varieties such as BRAPOS and SBRA. A test for >=
4880        OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
4881        ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
4882        repetition of assertions, but now it does, for Perl compatibility. */
4883    
4884      else if (*previous == OP_BRA  || *previous == OP_CBRA ||      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
              *previous == OP_ONCE || *previous == OP_COND)  
4885        {        {
4886        register int i;        register int i;
       int ketoffset = 0;  
4887        int len = (int)(code - previous);        int len = (int)(code - previous);
4888        uschar *bralink = NULL;        pcre_uchar *bralink = NULL;
4889          pcre_uchar *brazeroptr = NULL;
4890    
4891        /* Repeating a DEFINE group is pointless */        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
4892          we just ignore the repeat. */
4893    
4894        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4895          {          goto END_REPEAT;
4896          *errorcodeptr = ERR55;  
4897          goto FAILED;        /* There is no sense in actually repeating assertions. The only potential
4898          }        use of repetition is in cases when the assertion is optional. Therefore,
4899          if the minimum is greater than zero, just ignore the repeat. If the
4900          maximum is not zero or one, set it to 1. */
4901    
4902        /* If the maximum repeat count is unlimited, find the end of the bracket        if (*previous < OP_ONCE)    /* Assertion */
4903        by scanning through from the start, and compute the offset back to it          {
4904        from the current code pointer. There may be an OP_OPT setting following          if (repeat_min > 0) goto END_REPEAT;
4905        the final KET, so we can't find the end just by going back from the code          if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
       pointer. */  
   
       if (repeat_max == -1)  
         {  
         register uschar *ket = previous;  
         do ket += GET(ket, 1); while (*ket != OP_KET);  
         ketoffset = (int)(code - ket);  
4906          }          }
4907    
4908        /* The case of a zero minimum is special because of the need to stick        /* The case of a zero minimum is special because of the need to stick
# Line 4429  for (;; ptr++) Line 4923  for (;; ptr++)
4923          **   goto END_REPEAT;          **   goto END_REPEAT;
4924          **   }          **   }
4925    
4926          However, that fails when a group is referenced as a subroutine from          However, that fails when a group or a subgroup within it is referenced
4927          elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it          as a subroutine from elsewhere in the pattern, so now we stick in
4928          so that it is skipped on execution. As we don't have a list of which          OP_SKIPZERO in front of it so that it is skipped on execution. As we
4929          groups are referenced, we cannot do this selectively.          don't have a list of which groups are referenced, we cannot do this
4930            selectively.
4931    
4932          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4933          and do no more at this point. However, we do need to adjust any          and do no more at this point. However, we do need to adjust any
# Line 4445  for (;; ptr++) Line 4940  for (;; ptr++)
4940            {            {
4941            *code = OP_END;            *code = OP_END;
4942            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
4943            memmove(previous+1, previous, len);            memmove(previous + 1, previous, IN_UCHARS(len));
4944            code++;            code++;
4945            if (repeat_max == 0)            if (repeat_max == 0)
4946              {              {
4947              *previous++ = OP_SKIPZERO;              *previous++ = OP_SKIPZERO;
4948              goto END_REPEAT;              goto END_REPEAT;
4949              }              }
4950              brazeroptr = previous;    /* Save for possessive optimizing */
4951            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
4952            }            }
4953    
# Line 4468  for (;; ptr++) Line 4964  for (;; ptr++)
4964            int offset;            int offset;
4965            *code = OP_END;            *code = OP_END;
4966            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4967            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
4968            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
4969            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
4970            *previous++ = OP_BRA;            *previous++ = OP_BRA;
# Line 4518  for (;; ptr++) Line 5014  for (;; ptr++)
5014    
5015            else            else
5016              {              {
5017              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5018              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5019                {                {
5020                uschar *hc;                pcre_uchar *hc;
5021                uschar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5022                memcpy(code, previous, len);                memcpy(code, previous, IN_UCHARS(len));
5023                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5024                  {                  {
5025                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 4573  for (;; ptr++) Line 5069  for (;; ptr++)
5069    
5070          else for (i = repeat_max - 1; i >= 0; i--)          else for (i = repeat_max - 1; i >= 0; i--)
5071            {            {
5072            uschar *hc;            pcre_uchar *hc;
5073            uschar *this_hwm = cd->hwm;            pcre_uchar *this_hwm = cd->hwm;
5074    
5075            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
5076    
# Line 4590  for (;; ptr++) Line 5086  for (;; ptr++)
5086              PUTINC(code, 0, offset);              PUTINC(code, 0, offset);
5087              }              }
5088    
5089            memcpy(code, previous, len);            memcpy(code, previous, IN_UCHARS(len));
5090            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5091              {              {
5092              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 4607  for (;; ptr++) Line 5103  for (;; ptr++)
5103            {            {
5104            int oldlinkoffset;            int oldlinkoffset;
5105            int offset = (int)(code - bralink + 1);            int offset = (int)(code - bralink + 1);
5106            uschar *bra = code - offset;            pcre_uchar *bra = code - offset;
5107            oldlinkoffset = GET(bra, 1);            oldlinkoffset = GET(bra, 1);
5108            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5109            *code++ = OP_KET;            *code++ = OP_KET;
# Line 4616  for (;; ptr++) Line 5112  for (;; ptr++)
5112            }            }
5113          }          }
5114    
5115        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. For
5116        can't just offset backwards from the current code point, because we        ONCE brackets, that's all we need to do. However, possessively repeated
5117        don't know if there's been an options resetting after the ket. The        ONCE brackets can be converted into non-capturing brackets, as the
5118        correct offset was computed above.        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5119          deal with possessive ONCEs specially.
5120        Then, when we are doing the actual compile phase, check to see whether  
5121        this group is a non-atomic one that could match an empty string. If so,        Otherwise, when we are doing the actual compile phase, check to see
5122          whether this group is one that could match an empty string. If so,
5123        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5124        that runtime checking can be done. [This check is also applied to        that runtime checking can be done. [This check is also applied to ONCE
5125        atomic groups at runtime, but in a different way.] */        groups at runtime, but in a different way.]
5126    
5127          Then, if the quantifier was possessive and the bracket is not a
5128          conditional, we convert the BRA code to the POS form, and the KET code to
5129          KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5130          subpattern at both the start and at the end.) The use of special opcodes
5131          makes it possible to reduce greatly the stack usage in pcre_exec(). If
5132          the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5133    
5134          Then, if the minimum number of matches is 1 or 0, cancel the possessive
5135          flag so that the default action below, of wrapping everything inside
5136          atomic brackets, does not happen. When the minimum is greater than 1,
5137          there will be earlier copies of the group, and so we still have to wrap
5138          the whole thing. */
5139    
5140        else        else
5141          {          {
5142          uschar *ketcode = code - ketoffset;          pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5143          uschar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5144          *ketcode = OP_KETRMAX + repeat_type;  
5145          if (lengthptr == NULL && *bracode != OP_ONCE)          /* Convert possessive ONCE brackets to non-capturing */
5146    
5147            if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5148                possessive_quantifier) *bracode = OP_BRA;
5149    
5150            /* For non-possessive ONCE brackets, all we need to do is to
5151            set the KET. */
5152    
5153            if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5154              *ketcode = OP_KETRMAX + repeat_type;
5155    
5156            /* Handle non-ONCE brackets and possessive ONCEs (which have been
5157            converted to non-capturing above). */
5158    
5159            else
5160            {            {
5161            uschar *scode = bracode;            /* In the compile phase, check for empty string matching. */
5162            do  
5163              if (lengthptr == NULL)
5164              {              {
5165              if (could_be_empty_branch(scode, ketcode, utf8, cd))              pcre_uchar *scode = bracode;
5166                do
5167                {                {
5168                *bracode += OP_SBRA - OP_BRA;                if (could_be_empty_branch(scode, ketcode, utf8, cd))
5169                break;                  {
5170                    *bracode += OP_SBRA - OP_BRA;
5171                    break;
5172                    }
5173                  scode += GET(scode, 1);
5174                }                }
5175              scode += GET(scode, 1);              while (*scode == OP_ALT);
5176              }              }
5177            while (*scode == OP_ALT);  
5178              /* Handle possessive quantifiers. */
5179    
5180              if (possessive_quantifier)
5181                {
5182                /* For COND brackets, we wrap the whole thing in a possessively
5183                repeated non-capturing bracket, because we have not invented POS
5184                versions of the COND opcodes. Because we are moving code along, we
5185                must ensure that any pending recursive references are updated. */
5186    
5187                if (*bracode == OP_COND || *bracode == OP_SCOND)
5188                  {
5189                  int nlen = (int)(code - bracode);
5190                  *code = OP_END;
5191                  adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm);
5192                  memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5193                  code += 1 + LINK_SIZE;
5194                  nlen += 1 + LINK_SIZE;
5195                  *bracode = OP_BRAPOS;
5196                  *code++ = OP_KETRPOS;
5197                  PUTINC(code, 0, nlen);
5198                  PUT(bracode, 1, nlen);
5199                  }
5200    
5201                /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5202    
5203                else
5204                  {
5205                  *bracode += 1;              /* Switch to xxxPOS opcodes */
5206                  *ketcode = OP_KETRPOS;
5207                  }
5208    
5209                /* If the minimum is zero, mark it as possessive, then unset the
5210                possessive flag when the minimum is 0 or 1. */
5211    
5212                if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5213                if (repeat_min < 2) possessive_quantifier = FALSE;
5214                }
5215    
5216              /* Non-possessive quantifier */
5217    
5218              else *ketcode = OP_KETRMAX + repeat_type;
5219            }            }
5220          }          }
5221        }        }
# Line 4665  for (;; ptr++) Line 5236  for (;; ptr++)
5236        }        }
5237    
5238      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', or if certain optimization
5239      tests above succeeded, possessive_quantifier is TRUE. For some of the      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5240      simpler opcodes, there is an special alternative opcode for this. For      there are special alternative opcodes for this case. For anything else, we
5241      anything else, we wrap the entire repeated item inside OP_ONCE brackets.      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5242      The '+' notation is just syntactic sugar, taken from Sun's Java package,      notation is just syntactic sugar, taken from Sun's Java package, but the
5243      but the special opcodes can optimize it a bit. The repeated item starts at      special opcodes can optimize it.
5244      tempcode, not at previous, which might be the first part of a string whose  
5245      (former) last char we repeated.      Some (but not all) possessively repeated subpatterns have already been
5246        completely handled in the code just above. For them, possessive_quantifier
5247        is always FALSE at this stage.
5248    
5249        Note that the repeated item starts at tempcode, not at previous, which
5250        might be the first part of a string whose (former) last char we repeated.
5251    
5252      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5253      an 'upto' may follow. We skip over an 'exact' item, and then test the      an 'upto' may follow. We skip over an 'exact' item, and then test the
# Line 4682  for (;; ptr++) Line 5258  for (;; ptr++)
5258        int len;        int len;
5259    
5260        if (*tempcode == OP_TYPEEXACT)        if (*tempcode == OP_TYPEEXACT)
5261          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
5262            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);            ((tempcode[1 + IMM2_SIZE] == OP_PROP
5263              || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5264    
5265        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5266          {          {
5267          tempcode += _pcre_OP_lengths[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
5268  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5269          if (utf8 && tempcode[-1] >= 0xc0)          if (utf8 && tempcode[-1] >= 0xc0)
5270            tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];            tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f];
5271  #endif  #endif
5272          }          }
5273    
# Line 4702  for (;; ptr++) Line 5279  for (;; ptr++)
5279          case OP_QUERY: *tempcode = OP_POSQUERY; break;          case OP_QUERY: *tempcode = OP_POSQUERY; break;
5280          case OP_UPTO:  *tempcode = OP_POSUPTO; break;          case OP_UPTO:  *tempcode = OP_POSUPTO; break;
5281    
5282          case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;          case OP_STARI:  *tempcode = OP_POSSTARI; break;
5283          case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;          case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
5284          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;          case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5285          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;          case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
5286    
5287          case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;          case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
5288          case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;          case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
5289          case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;          case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5290          case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;          case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
5291    
5292            case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
5293            case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
5294            case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5295            case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
5296    
5297            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
5298            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
5299            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5300            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
5301    
5302          /* Because we are moving code along, we must ensure that any          /* Because we are moving code along, we must ensure that any
5303          pending recursive references are updated. */          pending recursive references are updated. */
5304    
5305          default:          default:
5306          *code = OP_END;          *code = OP_END;
5307          adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);          adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
5308          memmove(tempcode + 1+LINK_SIZE, tempcode, len);          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5309          code += 1 + LINK_SIZE;          code += 1 + LINK_SIZE;
5310          len += 1 + LINK_SIZE;          len += 1 + LINK_SIZE;
5311          tempcode[0] = OP_ONCE;          tempcode[0] = OP_ONCE;
# Line 4730  for (;; ptr++) Line 5317  for (;; ptr++)
5317        }        }
5318    
5319      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
5320      "follows varying string" flag for subsequently encountered reqbytes if      "follows varying string" flag for subsequently encountered reqchars if
5321      it isn't already set and we have just passed a varying length item. */      it isn't already set and we have just passed a varying length item. */
5322    
5323      END_REPEAT:      END_REPEAT:
# Line 4759  for (;; ptr++) Line 5346  for (;; ptr++)
5346        int i, namelen;        int i, namelen;
5347        int arglen = 0;        int arglen = 0;
5348        const char *vn = verbnames;        const char *vn = verbnames;
5349        const uschar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5350        const uschar *arg = NULL;        const pcre_uchar *arg = NULL;
5351        previous = NULL;        previous = NULL;
5352        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
5353        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5354    
5355          /* It appears that Perl allows any characters whatsoever, other than
5356          a closing parenthesis, to appear in arguments, so we no longer insist on
5357          letters, digits, and underscores. */
5358    
5359        if (*ptr == CHAR_COLON)        if (*ptr == CHAR_COLON)
5360          {          {
5361          arg = ++ptr;          arg = ++ptr;
5362          while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
           || *ptr == '_') ptr++;  
5363          arglen = (int)(ptr - arg);          arglen = (int)(ptr - arg);
5364          }          }
5365    
# Line 4784  for (;; ptr++) Line 5374  for (;; ptr++)
5374        for (i = 0; i < verbcount; i++)        for (i = 0; i < verbcount; i++)
5375          {          {
5376          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
5377              strncmp((char *)name, vn, namelen) == 0)              STRNCMP_UC_C8(name, vn, namelen) == 0)
5378            {            {
5379            /* Check for open captures before ACCEPT */            /* Check for open captures before ACCEPT and convert it to
5380              ASSERT_ACCEPT if in an assertion. */
5381    
5382            if (verbs[i].op == OP_ACCEPT)            if (verbs[i].op == OP_ACCEPT)
5383              {              {
5384              open_capitem *oc;              open_capitem *oc;
5385                if (arglen != 0)
5386                  {
5387                  *errorcodeptr = ERR59;
5388                  goto FAILED;
5389                  }
5390              cd->had_accept = TRUE;              cd->had_accept = TRUE;
5391              for (oc = cd->open_caps; oc != NULL; oc = oc->next)              for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5392                {                {
5393                *code++ = OP_CLOSE;                *code++ = OP_CLOSE;
5394                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
5395                }                }
5396                *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5397    
5398                /* Do not set firstchar after *ACCEPT */
5399                if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5400              }              }
5401    
5402            /* Handle the cases with/without an argument */            /* Handle other cases with/without an argument */
5403    
5404            if (arglen == 0)            else if (arglen == 0)
5405              {              {
5406              if (verbs[i].op < 0)   /* Argument is mandatory */              if (verbs[i].op < 0)   /* Argument is mandatory */
5407                {                {
5408                *errorcodeptr = ERR66;                *errorcodeptr = ERR66;
5409                goto FAILED;                goto FAILED;
5410                }                }
5411              *code++ = verbs[i].op;              *code = verbs[i].op;
5412                if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
5413              }              }
5414    
5415            else            else
# Line 4818  for (;; ptr++) Line 5419  for (;; ptr++)
5419                *errorcodeptr = ERR59;                *errorcodeptr = ERR59;
5420                goto FAILED;                goto FAILED;
5421                }                }
5422              *code++ = verbs[i].op_arg;              *code = verbs[i].op_arg;
5423                if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
5424              *code++ = arglen;              *code++ = arglen;
5425              memcpy(code, arg, arglen);              memcpy(code, arg, IN_UCHARS(arglen));
5426              code += arglen;              code += arglen;
5427              *code++ = 0;              *code++ = 0;
5428              }              }
# Line 4843  for (;; ptr++) Line 5445  for (;; ptr++)
5445        {        {
5446        int i, set, unset, namelen;        int i, set, unset, namelen;
5447        int *optset;        int *optset;
5448        const uschar *name;        const pcre_uchar *name;
5449        uschar *slot;        pcre_uchar *slot;
5450    
5451        switch (*(++ptr))        switch (*(++ptr))
5452          {          {
# Line 4897  for (;; ptr++) Line 5499  for (;; ptr++)
5499            break;            break;
5500    
5501          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Most other conditions use OP_CREF (a couple change to OP_RREF
5502          below), and all need to skip 3 bytes at the start of the group. */          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5503    
5504          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
5505          skipbytes = 3;          skipbytes = 1+IMM2_SIZE;
5506          refsign = -1;          refsign = -1;
5507    
5508          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
# Line 4947  for (;; ptr++) Line 5549  for (;; ptr++)
5549          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while ((cd->ctypes[*ptr] & ctype_word) != 0)
5550            {            {
5551            if (recno >= 0)            if (recno >= 0)
5552              recno = ((digitab[*ptr] & ctype_digit) != 0)?              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
               recno * 10 + *ptr - CHAR_0 : -1;  
5553            ptr++;            ptr++;
5554            }            }
5555          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
# Line 4996  for (;; ptr++) Line 5597  for (;; ptr++)
5597          slot = cd->name_table;          slot = cd->name_table;
5598          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
5599            {            {
5600            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5601            slot += cd->name_entry_size;            slot += cd->name_entry_size;
5602            }            }
5603    
# Line 5012  for (;; ptr++) Line 5613  for (;; ptr++)
5613          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
5614    
5615          else if ((i = find_parens(cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
5616                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0, utf8)) > 0)
5617            {            {
5618            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
5619            code[1+LINK_SIZE]++;            code[1+LINK_SIZE]++;
# Line 5038  for (;; ptr++) Line 5639  for (;; ptr++)
5639            recno = 0;            recno = 0;
5640            for (i = 1; i < namelen; i++)            for (i = 1; i < namelen; i++)
5641              {              {
5642              if ((digitab[name[i]] & ctype_digit) == 0)              if (!IS_DIGIT(name[i]))
5643                {                {
5644                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
5645                goto FAILED;                goto FAILED;
# Line 5053  for (;; ptr++) Line 5654  for (;; ptr++)
5654          /* Similarly, check for the (?(DEFINE) "condition", which is always          /* Similarly, check for the (?(DEFINE) "condition", which is always
5655          false. */          false. */
5656    
5657          else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)          else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5658            {            {
5659            code[1+LINK_SIZE] = OP_DEF;            code[1+LINK_SIZE] = OP_DEF;
5660            skipbytes = 1;            skipbytes = 1;
# Line 5080  for (;; ptr++) Line 5681  for (;; ptr++)
5681          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5682          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5683          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
5684            cd->assert_depth += 1;
5685          ptr++;          ptr++;
5686          break;          break;
5687    
# Line 5094  for (;; ptr++) Line 5696  for (;; ptr++)
5696            continue;            continue;
5697            }            }
5698          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
5699            cd->assert_depth += 1;
5700          break;          break;
5701    
5702    
# Line 5103  for (;; ptr++) Line 5706  for (;; ptr++)
5706            {            {
5707            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5708            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
5709              cd->assert_depth += 1;
5710            ptr += 2;            ptr += 2;
5711            break;            break;
5712    
5713            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5714            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
5715              cd->assert_depth += 1;
5716            ptr += 2;            ptr += 2;
5717            break;            break;
5718    
# Line 5129  for (;; ptr++) Line 5734  for (;; ptr++)
5734    
5735          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5736          case CHAR_C:                 /* Callout - may be followed by digits; */          case CHAR_C:                 /* Callout - may be followed by digits; */
5737          previous_callout = code;  /* Save for later completion */          previous_callout = code;     /* Save for later completion */
5738          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1;    /* Skip one item before completing */
5739          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
5740            {            {
5741            int n = 0;            int n = 0;
5742            while ((digitab[*(++ptr)] & ctype_digit) != 0)            ptr++;
5743              n = n * 10 + *ptr - CHAR_0;            while(IS_DIGIT(*ptr))
5744                n = n * 10 + *ptr++ - CHAR_0;
5745            if (*ptr != CHAR_RIGHT_PARENTHESIS)            if (*ptr != CHAR_RIGHT_PARENTHESIS)
5746              {              {
5747              *errorcodeptr = ERR39;              *errorcodeptr = ERR39;
# Line 5252  for (;; ptr++) Line 5858  for (;; ptr++)
5858                if (crc < 0)                if (crc < 0)
5859                  {                  {
5860                  memmove(slot + cd->name_entry_size, slot,                  memmove(slot + cd->name_entry_size, slot,
5861                    (cd->names_found - i) * cd->name_entry_size);                    IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
5862                  break;                  break;
5863                  }                  }
5864    
# Line 5266  for (;; ptr++) Line 5872  for (;; ptr++)
5872    
5873              if (!dupname)              if (!dupname)
5874                {                {
5875                uschar *cslot = cd->name_table;                pcre_uchar *cslot = cd->name_table;
5876                for (i = 0; i < cd->names_found; i++)                for (i = 0; i < cd->names_found; i++)
5877                  {                  {
5878                  if (cslot != slot)                  if (cslot != slot)
# Line 5283  for (;; ptr++) Line 5889  for (;; ptr++)
5889                }                }
5890    
5891              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
5892              memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, IN_UCHARS(namelen));
5893              slot[2+namelen] = 0;              slot[2 + namelen] = 0;
5894              }              }
5895            }            }
5896    
# Line 5313  for (;; ptr++) Line 5919  for (;; ptr++)
5919          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5920          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
5921    
5922          /* In the pre-compile phase, do a syntax check and set a dummy          /* In the pre-compile phase, do a syntax check. We used to just set
5923          reference number. */          a dummy reference number, because it was not used in the first pass.
5924            However, with the change of recursive back references to be atomic,
5925            we have to look for the number so that this state can be identified, as
5926            otherwise the incorrect length is computed. If it's not a backwards
5927            reference, the dummy number will do. */
5928    
5929          if (lengthptr != NULL)          if (lengthptr != NULL)
5930            {            {
5931              const pcre_uchar *temp;
5932    
5933            if (namelen == 0)            if (namelen == 0)
5934              {              {
5935              *errorcodeptr = ERR62;              *errorcodeptr = ERR62;
# Line 5333  for (;; ptr++) Line 5945  for (;; ptr++)
5945              *errorcodeptr = ERR48;              *errorcodeptr = ERR48;
5946              goto FAILED;              goto FAILED;
5947              }              }
5948            recno = 0;  
5949              /* The name table does not exist in the first pass, so we cannot
5950              do a simple search as in the code below. Instead, we have to scan the
5951              pattern to find the number. It is important that we scan it only as
5952              far as we have got because the syntax of named subpatterns has not
5953              been checked for the rest of the pattern, and find_parens() assumes
5954              correct syntax. In any case, it's a waste of resources to scan
5955              further. We stop the scan at the current point by temporarily
5956              adjusting the value of cd->end_pattern. */
5957    
5958              temp = cd->end_pattern;
5959              cd->end_pattern = ptr;
5960              recno = find_parens(cd, name, namelen,
5961                (options & PCRE_EXTENDED) != 0, utf8);
5962              cd->end_pattern = temp;
5963              if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
5964            }            }
5965    
5966          /* In the real compile, seek the name in the table. We check the name          /* In the real compile, seek the name in the table. We check the name
# Line 5346  for (;; ptr++) Line 5973  for (;; ptr++)
5973            slot = cd->name_table;            slot = cd->name_table;
5974            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
5975              {              {
5976              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
5977                  slot[2+namelen] == 0)                  slot[2+namelen] == 0)
5978                break;                break;
5979              slot += cd->name_entry_size;              slot += cd->name_entry_size;
# Line 5358  for (;; ptr++) Line 5985  for (;; ptr++)
5985              }              }
5986            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
5987                      find_parens(cd, name, namelen,                      find_parens(cd, name, namelen,
5988                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
5989              {              {
5990              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
5991              goto FAILED;              goto FAILED;
# Line 5383  for (;; ptr++) Line 6010  for (;; ptr++)
6010          case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:          case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6011          case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:          case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6012            {            {
6013            const uschar *called;            const pcre_uchar *called;
6014            terminator = CHAR_RIGHT_PARENTHESIS;            terminator = CHAR_RIGHT_PARENTHESIS;
6015    
6016            /* Come here from the \g<...> and \g'...' code (Oniguruma            /* Come here from the \g<...> and \g'...' code (Oniguruma
# Line 5397  for (;; ptr++) Line 6024  for (;; ptr++)
6024            if ((refsign = *ptr) == CHAR_PLUS)            if ((refsign = *ptr) == CHAR_PLUS)
6025              {              {
6026              ptr++;              ptr++;
6027              if ((digitab[*ptr] & ctype_digit) == 0)              if (!IS_DIGIT(*ptr))
6028                {                {
6029                *errorcodeptr = ERR63;                *errorcodeptr = ERR63;
6030                goto FAILED;                goto FAILED;
# Line 5405  for (;; ptr++) Line 6032  for (;; ptr++)
6032              }              }
6033            else if (refsign == CHAR_MINUS)            else if (refsign == CHAR_MINUS)
6034              {              {
6035              if ((digitab[ptr[1]] & ctype_digit) == 0)              if (!IS_DIGIT(ptr[1]))
6036                goto OTHER_CHAR_AFTER_QUERY;                goto OTHER_CHAR_AFTER_QUERY;
6037              ptr++;              ptr++;
6038              }              }
6039    
6040            recno = 0;            recno = 0;
6041            while((digitab[*ptr] & ctype_digit) != 0)            while(IS_DIGIT(*ptr))
6042              recno = recno * 10 + *ptr++ - CHAR_0;              recno = recno * 10 + *ptr++ - CHAR_0;
6043    
6044            if (*ptr != terminator)            if (*ptr != terminator)
# Line 5462  for (;; ptr++) Line 6089  for (;; ptr++)
6089              {              {
6090              *code = OP_END;              *code = OP_END;
6091              if (recno != 0)              if (recno != 0)
6092                called = _pcre_find_bracket(cd->start_code, utf8, recno);                called = PRIV(find_bracket)(cd->start_code, utf8, recno);
6093    
6094              /* Forward reference */              /* Forward reference */
6095    
6096              if (called == NULL)              if (called == NULL)
6097                {                {
6098                if (find_parens(cd, NULL, recno,                if (find_parens(cd, NULL, recno,
6099                      (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0, utf8) < 0)
6100                  {                  {
6101                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
6102                  goto FAILED;                  goto FAILED;
# Line 5477  for (;; ptr++) Line 6104  for (;; ptr++)
6104    
6105                /* Fudge the value of "called" so that when it is inserted as an                /* Fudge the value of "called" so that when it is inserted as an
6106                offset below, what is actually inserted is the reference number                offset below, what is actually inserted is the reference number
6107                of the group. */                of the group. Then remember the forward reference. */
6108    
6109                called = cd->start_code + recno;                called = cd->start_code + recno;
6110                PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6111                }                }
6112    
6113              /* If not a forward reference, and the subpattern is still open,              /* If not a forward reference, and the subpattern is still open,
6114              this is a recursive call. We check to see if this is a left              this is a recursive call. We check to see if this is a left
6115              recursion that could loop for ever, and diagnose that case. */              recursion that could loop for ever, and diagnose that case. We
6116                must not, however, do this check if we are in a conditional
6117                subpattern because the condition might be testing for recursion in
6118                a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
6119                Forever loops are also detected at runtime, so those that occur in
6120                conditional subpatterns will be picked up then. */
6121    
6122              else if (GET(called, 1) == 0 &&              else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6123                       could_be_empty(called, code, bcptr, utf8, cd))                       could_be_empty(called, code, bcptr, utf8, cd))
6124                {                {
6125                *errorcodeptr = ERR40;                *errorcodeptr = ERR40;
# Line 5495  for (;; ptr++) Line 6127  for (;; ptr++)
6127                }                }
6128              }              }
6129    
6130            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item. */
           "once" brackets. Set up a "previous group" length so that a  
           subsequent quantifier will work. */  
   
           *code = OP_ONCE;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
6131    
6132            *code = OP_RECURSE;            *code = OP_RECURSE;
6133            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
6134            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
   
           *code = OP_KET;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
           length_prevgroup = 3 + 3*LINK_SIZE;  
6135            }            }
6136    
6137          /* Can't determine a first byte now */          /* Can't determine a first byte now */
6138    
6139          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6140          continue;          continue;
6141    
6142    
# Line 5572  for (;; ptr++) Line 6192  for (;; ptr++)
6192          is necessary to ensure we correctly detect the start of the pattern in          is necessary to ensure we correctly detect the start of the pattern in
6193          both phases.          both phases.
6194    
6195          If we are not at the pattern start, compile code to change the ims          If we are not at the pattern start, reset the greedy defaults and the
6196          options if this setting actually changes any of them, and reset the          case value for firstchar and reqchar. */
         greedy defaults and the case value for firstbyte and reqbyte. */  
6197    
6198          if (*ptr == CHAR_RIGHT_PARENTHESIS)          if (*ptr == CHAR_RIGHT_PARENTHESIS)
6199            {            {
# Line 5585  for (;; ptr++) Line 6204  for (;; ptr++)
6204              }              }
6205            else            else
6206              {              {
             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))  
               {  
               *code++ = OP_OPT;  
               *code++ = newoptions & PCRE_IMS;  
               }  
6207              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6208              greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
6209              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6210              }              }
6211    
6212            /* Change options at this level, and pass them back for use            /* Change options at this level, and pass them back for use
6213            in subsequent branches. When not at the start of the pattern, this            in subsequent branches. */
           information is also necessary so that a resetting item can be  
           compiled at the end of a group (if we are in a group). */  
6214    
6215            *optionsptr = options = newoptions;            *optionsptr = options = newoptions;
6216            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
# Line 5631  for (;; ptr++) Line 6243  for (;; ptr++)
6243        NUMBERED_GROUP:        NUMBERED_GROUP:
6244        cd->bracount += 1;        cd->bracount += 1;
6245        PUT2(code, 1+LINK_SIZE, cd->bracount);        PUT2(code, 1+LINK_SIZE, cd->bracount);
6246        skipbytes = 2;        skipbytes = IMM2_SIZE;
6247        }        }
6248    
6249      /* Process nested bracketed regex. Assertions may not be repeated, but      /* Process nested bracketed regex. Assertions used not to be repeatable,
6250      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a      but this was changed for Perl compatibility, so all kinds can now be
6251      non-register variable in order to be able to pass its address because some      repeated. We copy code into a non-register variable (tempcode) in order to
6252      compilers complain otherwise. Pass in a new setting for the ims options if      be able to pass its address because some compilers complain otherwise. */
     they have changed. */  
6253    
6254      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = code;                      /* For handling repetition */
6255      *code = bravalue;      *code = bravalue;
6256      tempcode = code;      tempcode = code;
6257      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;        /* Save value before bracket */
6258      length_prevgroup = 0;              /* Initialize for pre-compile phase */      tempbracount = cd->bracount;          /* Save value before bracket */
6259        length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6260    
6261      if (!compile_regex(      if (!compile_regex(
6262           newoptions,                   /* The complete new option state */           newoptions,                      /* The complete new option state */
6263           options & PCRE_IMS,           /* The previous ims option state */           &tempcode,                       /* Where to put code (updated) */
6264           &tempcode,                    /* Where to put code (updated) */           &ptr,                            /* Input pointer (updated) */
6265           &ptr,                         /* Input pointer (updated) */           errorcodeptr,                    /* Where to put an error message */
          errorcodeptr,                 /* Where to put an error message */  
6266           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
6267            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
6268           reset_bracount,               /* True if (?| group */           reset_bracount,                  /* True if (?| group */
6269           skipbytes,                    /* Skip over bracket number */           skipbytes,                       /* Skip over bracket number */
6270           &subfirstbyte,                /* For possible first char */           cond_depth +
6271           &subreqbyte,                  /* For possible last char */             ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
6272           bcptr,                        /* Current branch chain */           &subfirstchar,                   /* For possible first char */
6273           cd,                           /* Tables block */           &subreqchar,                     /* For possible last char */
6274           (lengthptr == NULL)? NULL :   /* Actual compile phase */           bcptr,                           /* Current branch chain */
6275             &length_prevgroup           /* Pre-compile phase */           cd,                              /* Tables block */
6276             (lengthptr == NULL)? NULL :      /* Actual compile phase */
6277               &length_prevgroup              /* Pre-compile phase */
6278           ))           ))
6279        goto FAILED;        goto FAILED;
6280    
6281        /* If this was an atomic group and there are no capturing groups within it,
6282        generate OP_ONCE_NC instead of OP_ONCE. */
6283    
6284        if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
6285          *code = OP_ONCE_NC;