/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

code/trunk/pcre_compile.c revision 545 by ph10, Wed Jun 16 10:51:15 2010 UTC code/branches/pcre16/pcre_compile.c revision 763 by zherczeg, Tue Nov 22 21:46:22 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2011 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 231  static const char posix_names[] = Line 231  static const char posix_names[] =
231    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
233    
234  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
235    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 266  substitutes must be in the order of the Line 266  substitutes must be in the order of the
266  both positive and negative cases. NULL means no substitute. */  both positive and negative cases. NULL means no substitute. */
267    
268  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
269  static const uschar *substitutes[] = {  static const pcre_uchar string_PNd[]  = {
270    (uschar *)"\\P{Nd}",    /* \D */    CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
271    (uschar *)"\\p{Nd}",    /* \d */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
272    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */  static const pcre_uchar string_pNd[]  = {
273    (uschar *)"\\p{Xsp}",   /* \s */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
274    (uschar *)"\\P{Xwd}",   /* \W */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
275    (uschar *)"\\p{Xwd}"    /* \w */  static const pcre_uchar string_PXsp[] = {
276      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
277      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
278    static const pcre_uchar string_pXsp[] = {
279      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
280      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
281    static const pcre_uchar string_PXwd[] = {
282      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
283      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
284    static const pcre_uchar string_pXwd[] = {
285      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
286      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
287    
288    static const pcre_uchar *substitutes[] = {
289      string_PNd,           /* \D */
290      string_pNd,           /* \d */
291      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
292      string_pXsp,          /* \s */
293      string_PXwd,          /* \W */
294      string_pXwd           /* \w */
295  };  };
296    
297  static const uschar *posix_substitutes[] = {  static const pcre_uchar string_pL[] =   {
298    (uschar *)"\\p{L}",     /* alpha */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299    (uschar *)"\\p{Ll}",    /* lower */    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300    (uschar *)"\\p{Lu}",    /* upper */  static const pcre_uchar string_pLl[] =  {
301    (uschar *)"\\p{Xan}",   /* alnum */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
302    NULL,                   /* ascii */    CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303    (uschar *)"\\h",        /* blank */  static const pcre_uchar string_pLu[] =  {
304    NULL,                   /* cntrl */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305    (uschar *)"\\p{Nd}",    /* digit */    CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    NULL,                   /* graph */  static const pcre_uchar string_pXan[] = {
307    NULL,                   /* print */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
308    NULL,                   /* punct */    CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
309    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */  static const pcre_uchar string_h[] =    {
310    (uschar *)"\\p{Xwd}",   /* word */    CHAR_BACKSLASH, CHAR_h, '\0' };
311    NULL,                   /* xdigit */  static const pcre_uchar string_pXps[] = {
312      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
313      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
314    static const pcre_uchar string_PL[] =   {
315      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
316      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
317    static const pcre_uchar string_PLl[] =  {
318      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
319      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
320    static const pcre_uchar string_PLu[] =  {
321      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
322      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
323    static const pcre_uchar string_PXan[] = {
324      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
325      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
326    static const pcre_uchar string_H[] =    {
327      CHAR_BACKSLASH, CHAR_H, '\0' };
328    static const pcre_uchar string_PXps[] = {
329      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
330      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
331    
332    static const pcre_uchar *posix_substitutes[] = {
333      string_pL,            /* alpha */
334      string_pLl,           /* lower */
335      string_pLu,           /* upper */
336      string_pXan,          /* alnum */
337      NULL,                 /* ascii */
338      string_h,             /* blank */
339      NULL,                 /* cntrl */
340      string_pNd,           /* digit */
341      NULL,                 /* graph */
342      NULL,                 /* print */
343      NULL,                 /* punct */
344      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
345      string_pXwd,          /* word */
346      NULL,                 /* xdigit */
347    /* Negated cases */    /* Negated cases */
348    (uschar *)"\\P{L}",     /* ^alpha */    string_PL,            /* ^alpha */
349    (uschar *)"\\P{Ll}",    /* ^lower */    string_PLl,           /* ^lower */
350    (uschar *)"\\P{Lu}",    /* ^upper */    string_PLu,           /* ^upper */
351    (uschar *)"\\P{Xan}",   /* ^alnum */    string_PXan,          /* ^alnum */
352    NULL,                   /* ^ascii */    NULL,                 /* ^ascii */
353    (uschar *)"\\H",        /* ^blank */    string_H,             /* ^blank */
354    NULL,                   /* ^cntrl */    NULL,                 /* ^cntrl */
355    (uschar *)"\\P{Nd}",    /* ^digit */    string_PNd,           /* ^digit */
356    NULL,                   /* ^graph */    NULL,                 /* ^graph */
357    NULL,                   /* ^print */    NULL,                 /* ^print */
358    NULL,                   /* ^punct */    NULL,                 /* ^punct */
359    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
360    (uschar *)"\\P{Xwd}",   /* ^word */    string_PXwd,          /* ^word */
361    NULL                    /* ^xdigit */    NULL                  /* ^xdigit */
362  };  };
363  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
364  #endif  #endif
365    
366  #define STRING(a)  # a  #define STRING(a)  # a
# Line 393  static const char error_texts[] = Line 447  static const char error_texts[] =
447    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
448    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
449    /* 55 */    /* 55 */
450    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
451    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
452    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
453    "a numbered reference must not be zero\0"    "a numbered reference must not be zero\0"
# Line 408  static const char error_texts[] = Line 462  static const char error_texts[] =
462    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
463    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
464    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with PCRE_UCP support\0"
465      "\\c must be followed by an ASCII character\0"
466      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
467      /* 70 */
468      "internal error: unknown opcode in find_fixedlength()\0"
469    ;    ;
470    
471  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 544  static const unsigned char ebcdic_charta Line 602  static const unsigned char ebcdic_charta
602  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
603    
604  static BOOL  static BOOL
605    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
606      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
607    
608    
# Line 576  return s; Line 634  return s;
634    
635    
636  /*************************************************  /*************************************************
637    *            Check for counted repeat            *
638    *************************************************/
639    
640    /* This function is called when a '{' is encountered in a place where it might
641    start a quantifier. It looks ahead to see if it really is a quantifier or not.
642    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
643    where the ddds are digits.
644    
645    Arguments:
646      p         pointer to the first char after '{'
647    
648    Returns:    TRUE or FALSE
649    */
650    
651    static BOOL
652    is_counted_repeat(const pcre_uchar *p)
653    {
654    if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
655    while ((digitab[*p] & ctype_digit) != 0) p++;
656    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
657    
658    if (*p++ != CHAR_COMMA) return FALSE;
659    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
660    
661    if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
662    while ((digitab[*p] & ctype_digit) != 0) p++;
663    
664    return (*p == CHAR_RIGHT_CURLY_BRACKET);
665    }
666    
667    
668    
669    /*************************************************
670  *            Handle escapes                      *  *            Handle escapes                      *
671  *************************************************/  *************************************************/
672    
# Line 600  Returns:         zero or positive => a d Line 691  Returns:         zero or positive => a d
691  */  */
692    
693  static int  static int
694  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
695    int options, BOOL isclass)    int options, BOOL isclass)
696  {  {
697  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
698  const uschar *ptr = *ptrptr + 1;  const pcre_uchar *ptr = *ptrptr + 1;
699  int c, i;  int c, i;
700    
701  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
# Line 631  else if ((i = escapes[c - 0x48]) != 0) Line 722  else if ((i = escapes[c - 0x48]) != 0)
722    
723  else  else
724    {    {
725    const uschar *oldptr;    const pcre_uchar *oldptr;
726    BOOL braced, negated;    BOOL braced, negated;
727    
728    switch (c)    switch (c)
# Line 641  else Line 732  else
732    
733      case CHAR_l:      case CHAR_l:
734      case CHAR_L:      case CHAR_L:
735        *errorcodeptr = ERR37;
736        break;
737    
738      case CHAR_u:      case CHAR_u:
739        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
740          {
741          /* In JavaScript, \u must be followed by four hexadecimal numbers.
742          Otherwise it is a lowercase u letter. */
743          if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
744               && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
745            {
746            c = 0;
747            for (i = 0; i < 4; ++i)
748              {
749              register int cc = *(++ptr);
750    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
751              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
752              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
753    #else           /* EBCDIC coding */
754              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
755              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
756    #endif
757              }
758            }
759          }
760        else
761          *errorcodeptr = ERR37;
762        break;
763    
764      case CHAR_U:      case CHAR_U:
765      *errorcodeptr = ERR37;      /* In JavaScript, \U is an uppercase U letter. */
766        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
767      break;      break;
768    
769      /* \g must be followed by one of a number of specific things:      /* In a character class, \g is just a literal "g". Outside a character
770        class, \g must be followed by one of a number of specific things:
771    
772      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
773      backreference. If negative, it is a relative backreference. This is a Perl      backreference. If negative, it is a relative backreference. This is a Perl
# Line 663  else Line 784  else
784      the -ESC_g code (cf \k). */      the -ESC_g code (cf \k). */
785    
786      case CHAR_g:      case CHAR_g:
787        if (isclass) break;
788      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
789        {        {
790        c = -ESC_g;        c = -ESC_g;
# Line 673  else Line 795  else
795    
796      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
797        {        {
798        const uschar *p;        const pcre_uchar *p;
799        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
800          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
801        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
# Line 791  else Line 913  else
913      treated as a data character. */      treated as a data character. */
914    
915      case CHAR_x:      case CHAR_x:
916        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
917          {
918          /* In JavaScript, \x must be followed by two hexadecimal numbers.
919          Otherwise it is a lowercase x letter. */
920          if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
921            {
922            c = 0;
923            for (i = 0; i < 2; ++i)
924              {
925              register int cc = *(++ptr);
926    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
927              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
928              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
929    #else           /* EBCDIC coding */
930              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
931              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
932    #endif
933              }
934            }
935          break;
936          }
937    
938      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
939        {        {
940        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
941        int count = 0;        int count = 0;
942    
943        c = 0;        c = 0;
# Line 841  else Line 985  else
985      break;      break;
986    
987      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
988      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
989        coding is ASCII-specific, but then the whole concept of \cx is
990      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
991    
992      case CHAR_c:      case CHAR_c:
# Line 851  else Line 996  else
996        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
997        break;        break;
998        }        }
999    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1000  #ifndef EBCDIC  /* ASCII/UTF-8 coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
1001          {
1002          *errorcodeptr = ERR68;
1003          break;
1004          }
1005      if (c >= CHAR_a && c <= CHAR_z) c -= 32;      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1006      c ^= 0x40;      c ^= 0x40;
1007  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
1008      if (c >= CHAR_a && c <= CHAR_z) c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1009      c ^= 0xC0;      c ^= 0xC0;
1010  #endif  #endif
# Line 879  else Line 1028  else
1028    }    }
1029    
1030  /* Perl supports \N{name} for character names, as well as plain \N for "not  /* Perl supports \N{name} for character names, as well as plain \N for "not
1031  newline". PCRE does not support \N{name}. */  newline". PCRE does not support \N{name}. However, it does support
1032    quantification such as \N{2,3}. */
1033    
1034  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1035         !is_counted_repeat(ptr+2))
1036    *errorcodeptr = ERR37;    *errorcodeptr = ERR37;
1037    
1038  /* If PCRE_UCP is set, we change the values for \d etc. */  /* If PCRE_UCP is set, we change the values for \d etc. */
# Line 917  Returns:         type value from ucp_typ Line 1068  Returns:         type value from ucp_typ
1068  */  */
1069    
1070  static int  static int
1071  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1072  {  {
1073  int c, i, bot, top;  int c, i, bot, top;
1074  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1075  char name[32];  pcre_uchar name[32];
1076    
1077  c = *(++ptr);  c = *(++ptr);
1078  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 967  top = _pcre_utt_size; Line 1118  top = _pcre_utt_size;
1118  while (bot < top)  while (bot < top)
1119    {    {
1120    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1121    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    c = STRCMP_UC_C8(name, _pcre_utt_names + _pcre_utt[i].name_offset);
1122    if (c == 0)    if (c == 0)
1123      {      {
1124      *dptr = _pcre_utt[i].value;      *dptr = _pcre_utt[i].value;
# Line 991  return -1; Line 1142  return -1;
1142    
1143    
1144  /*************************************************  /*************************************************
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if (*p++ != CHAR_COMMA) return FALSE;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
   
 return (*p == CHAR_RIGHT_CURLY_BRACKET);  
 }  
   
   
   
 /*************************************************  
1145  *         Read repeat counts                     *  *         Read repeat counts                     *
1146  *************************************************/  *************************************************/
1147    
# Line 1042  Returns:         pointer to '}' on succe Line 1160  Returns:         pointer to '}' on succe
1160                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1161  */  */
1162    
1163  static const uschar *  static const pcre_uchar *
1164  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1165  {  {
1166  int min = 0;  int min = 0;
1167  int max = -1;  int max = -1;
# Line 1099  top-level call starts at the beginning o Line 1217  top-level call starts at the beginning o
1217  start at a parenthesis. It scans along a pattern's text looking for capturing  start at a parenthesis. It scans along a pattern's text looking for capturing
1218  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1219  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1220  returns when it reaches a given numbered subpattern. We know that if (?P< is  returns when it reaches a given numbered subpattern. Recursion is used to keep
1221  encountered, the name will be terminated by '>' because that is checked in the  track of subpatterns that reset the capturing group numbers - the (?| feature.
1222  first pass. Recursion is used to keep track of subpatterns that reset the  
1223  capturing group numbers - the (?| feature.  This function was originally called only from the second pass, in which we know
1224    that if (?< or (?' or (?P< is encountered, the name will be correctly
1225    terminated because that is checked in the first pass. There is now one call to
1226    this function in the first pass, to check for a recursive back reference by
1227    name (so that we can make the whole group atomic). In this case, we need check
1228    only up to the current position in the pattern, and that is still OK because
1229    any previous occurrences will have been checked. To make this work, the test
1230    for "end of pattern" is a check against cd->end_pattern in the main loop,
1231    instead of looking for a binary zero. This means that the special first-pass
1232    call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1233    processing items within the loop are OK, because afterwards the main loop will
1234    terminate.)
1235    
1236  Arguments:  Arguments:
1237    ptrptr       address of the current character pointer (updated)    ptrptr       address of the current character pointer (updated)
# Line 1110  Arguments: Line 1239  Arguments:
1239    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1240    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1241    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1242      utf8         TRUE if we are in UTF-8 mode
1243    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1244    
1245  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1246  */  */
1247    
1248  static int  static int
1249  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1250    BOOL xmode, int *count)    BOOL xmode, BOOL utf8, int *count)
1251  {  {
1252  uschar *ptr = *ptrptr;  pcre_uchar *ptr = *ptrptr;
1253  int start_count = *count;  int start_count = *count;
1254  int hwm_count = start_count;  int hwm_count = start_count;
1255  BOOL dup_parens = FALSE;  BOOL dup_parens = FALSE;
# Line 1186  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1316  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1316          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1317        {        {
1318        int term;        int term;
1319        const uschar *thisname;        const pcre_uchar *thisname;
1320        *count += 1;        *count += 1;
1321        if (name == NULL && *count == lorn) return *count;        if (name == NULL && *count == lorn) return *count;
1322        term = *ptr++;        term = *ptr++;
# Line 1194  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1324  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1324        thisname = ptr;        thisname = ptr;
1325        while (*ptr != term) ptr++;        while (*ptr != term) ptr++;
1326        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == ptr - thisname &&
1327            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            STRNCMP_UC_UC(name, thisname, lorn) == 0)
1328          return *count;          return *count;
1329        term++;        term++;
1330        }        }
# Line 1202  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1332  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1332    }    }
1333    
1334  /* Past any initial parenthesis handling, scan for parentheses or vertical  /* Past any initial parenthesis handling, scan for parentheses or vertical
1335  bars. */  bars. Stop if we get to cd->end_pattern. Note that this is important for the
1336    first-pass call when this value is temporarily adjusted to stop at the current
1337    position. So DO NOT change this to a test for binary zero. */
1338    
1339  for (; *ptr != 0; ptr++)  for (; ptr < cd->end_pattern; ptr++)
1340    {    {
1341    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1342    
# Line 1235  for (; *ptr != 0; ptr++) Line 1367  for (; *ptr != 0; ptr++)
1367          {          {
1368          if (ptr[2] == CHAR_E)          if (ptr[2] == CHAR_E)
1369            ptr+= 2;            ptr+= 2;
1370          else if (strncmp((const char *)ptr+2,          else if (STRNCMP_UC_C8(ptr + 2,
1371                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1372            ptr += 4;            ptr += 4;
1373          else          else
# Line 1278  for (; *ptr != 0; ptr++) Line 1410  for (; *ptr != 0; ptr++)
1410    
1411    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1412      {      {
1413      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      ptr++;
1414        while (*ptr != 0)
1415          {
1416          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1417          ptr++;
1418    #ifdef SUPPORT_UTF8
1419          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1420    #endif
1421          }
1422      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1423      continue;      continue;
1424      }      }
# Line 1287  for (; *ptr != 0; ptr++) Line 1427  for (; *ptr != 0; ptr++)
1427    
1428    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1429      {      {
1430      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1431      if (rc > 0) return rc;      if (rc > 0) return rc;
1432      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1433      }      }
# Line 1333  Arguments: Line 1473  Arguments:
1473    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1474    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1475    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1476      utf8         TRUE if we are in UTF-8 mode
1477    
1478  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1479  */  */
1480    
1481  static int  static int
1482  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)  find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1483      BOOL utf8)
1484  {  {
1485  uschar *ptr = (uschar *)cd->start_pattern;  pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1486  int count = 0;  int count = 0;
1487  int rc;  int rc;
1488    
# Line 1351  matching closing parens. That is why we Line 1493  matching closing parens. That is why we
1493    
1494  for (;;)  for (;;)
1495    {    {
1496    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1497    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1498    }    }
1499    
# Line 1367  return rc; Line 1509  return rc;
1509    
1510  /* This is called by several functions that scan a compiled expression looking  /* This is called by several functions that scan a compiled expression looking
1511  for a fixed first character, or an anchoring op code etc. It skips over things  for a fixed first character, or an anchoring op code etc. It skips over things
1512  that do not influence this. For some calls, a change of option is important.  that do not influence this. For some calls, it makes sense to skip negative
1513  For some calls, it makes sense to skip negative forward and all backward  forward and all backward assertions, and also the \b assertion; for others it
1514  assertions, and also the \b assertion; for others it does not.  does not.
1515    
1516  Arguments:  Arguments:
1517    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1518    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1519    
1520  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1521  */  */
1522    
1523  static const uschar*  static const pcre_uchar*
1524  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const pcre_uchar *code, BOOL skipassert)
   BOOL skipassert)  
1525  {  {
1526  for (;;)  for (;;)
1527    {    {
1528    switch ((int)*code)    switch ((int)*code)
1529      {      {
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
       *options = (int)code[1];  
     code += 2;  
     break;  
   
1530      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1531      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1532      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
# Line 1444  and doing the check at the end; a flag s Line 1576  and doing the check at the end; a flag s
1576    
1577  Arguments:  Arguments:
1578    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1579    options  the compiling options    utf8     TRUE in UTF-8 mode
1580    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1581    cd       the "compile data" structure    cd       the "compile data" structure
1582    
1583  Returns:   the fixed length,  Returns:   the fixed length,
1584               or -1 if there is no fixed length,               or -1 if there is no fixed length,
1585               or -2 if \C was encountered               or -2 if \C was encountered (in UTF-8 mode only)
1586               or -3 if an OP_RECURSE item was encountered and atend is FALSE               or -3 if an OP_RECURSE item was encountered and atend is FALSE
1587                 or -4 if an unknown opcode was encountered (internal error)
1588  */  */
1589    
1590  static int  static int
1591  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf8, BOOL atend, compile_data *cd)
1592  {  {
1593  int length = -1;  int length = -1;
1594    
1595  register int branchlength = 0;  register int branchlength = 0;
1596  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1597    
1598  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1599  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1468  branch, check the length against that of Line 1601  branch, check the length against that of
1601  for (;;)  for (;;)
1602    {    {
1603    int d;    int d;
1604    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1605    register int op = *cc;    register int op = *cc;
1606    switch (op)    switch (op)
1607      {      {
1608        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1609        OP_BRA (normal non-capturing bracket) because the other variants of these
1610        opcodes are all concerned with unlimited repeated groups, which of course
1611        are not of fixed length. */
1612    
1613      case OP_CBRA:      case OP_CBRA:
1614      case OP_BRA:      case OP_BRA:
1615      case OP_ONCE:      case OP_ONCE:
1616        case OP_ONCE_NC:
1617      case OP_COND:      case OP_COND:
1618      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1619      if (d < 0) return d;      if (d < 0) return d;
1620      branchlength += d;      branchlength += d;
1621      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1622      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1623      break;      break;
1624    
1625      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1626      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1627      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1628        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1629        because they all imply an unlimited repeat. */
1630    
1631      case OP_ALT:      case OP_ALT:
1632      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1633      case OP_END:      case OP_END:
1634        case OP_ACCEPT:
1635        case OP_ASSERT_ACCEPT:
1636      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1637        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1638      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1505  for (;;) Line 1646  for (;;)
1646    
1647      case OP_RECURSE:      case OP_RECURSE:
1648      if (!atend) return -3;      if (!atend) return -3;
1649      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1650      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1651      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1652      d = find_fixedlength(cs + 2, options, atend, cd);      d = find_fixedlength(cs + 2, utf8, atend, cd);
1653      if (d < 0) return d;      if (d < 0) return d;
1654      branchlength += d;      branchlength += d;
1655      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1525  for (;;) Line 1666  for (;;)
1666    
1667      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1668    
1669      case OP_REVERSE:      case OP_MARK:
1670        case OP_PRUNE_ARG:
1671        case OP_SKIP_ARG:
1672        case OP_THEN_ARG:
1673        cc += cc[1] + _pcre_OP_lengths[*cc];
1674        break;
1675    
1676        case OP_CALLOUT:
1677        case OP_CIRC:
1678        case OP_CIRCM:
1679        case OP_CLOSE:
1680        case OP_COMMIT:
1681      case OP_CREF:      case OP_CREF:
     case OP_NCREF:  
     case OP_RREF:  
     case OP_NRREF:  
1682      case OP_DEF:      case OP_DEF:
1683      case OP_OPT:      case OP_DOLL:
1684      case OP_CALLOUT:      case OP_DOLLM:
     case OP_SOD:  
     case OP_SOM:  
     case OP_SET_SOM:  
1685      case OP_EOD:      case OP_EOD:
1686      case OP_EODN:      case OP_EODN:
1687      case OP_CIRC:      case OP_FAIL:
1688      case OP_DOLL:      case OP_NCREF:
1689        case OP_NRREF:
1690      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1691        case OP_PRUNE:
1692        case OP_REVERSE:
1693        case OP_RREF:
1694        case OP_SET_SOM:
1695        case OP_SKIP:
1696        case OP_SOD:
1697        case OP_SOM:
1698        case OP_THEN:
1699      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1700      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
1701      break;      break;
# Line 1548  for (;;) Line 1703  for (;;)
1703      /* Handle literal characters */      /* Handle literal characters */
1704    
1705      case OP_CHAR:      case OP_CHAR:
1706      case OP_CHARNC:      case OP_CHARI:
1707      case OP_NOT:      case OP_NOT:
1708        case OP_NOTI:
1709      branchlength++;      branchlength++;
1710      cc += 2;      cc += 2;
1711  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1712      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1713  #endif  #endif
1714      break;      break;
1715    
# Line 1562  for (;;) Line 1717  for (;;)
1717      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1718    
1719      case OP_EXACT:      case OP_EXACT:
1720        case OP_EXACTI:
1721        case OP_NOTEXACT:
1722        case OP_NOTEXACTI:
1723      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1724      cc += 4;      cc += 4;
1725  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1726      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1727  #endif  #endif
1728      break;      break;
1729    
# Line 1583  for (;;) Line 1740  for (;;)
1740      cc += 2;      cc += 2;
1741      /* Fall through */      /* Fall through */
1742    
1743        case OP_HSPACE:
1744        case OP_VSPACE:
1745        case OP_NOT_HSPACE:
1746        case OP_NOT_VSPACE:
1747      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1748      case OP_DIGIT:      case OP_DIGIT:
1749      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1595  for (;;) Line 1756  for (;;)
1756      cc++;      cc++;
1757      break;      break;
1758    
1759      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1760        otherwise \C is coded as OP_ALLANY. */
1761    
1762      case OP_ANYBYTE:      case OP_ANYBYTE:
1763      return -2;      return -2;
# Line 1614  for (;;) Line 1776  for (;;)
1776    
1777      switch (*cc)      switch (*cc)
1778        {        {
1779          case OP_CRPLUS:
1780          case OP_CRMINPLUS:
1781        case OP_CRSTAR:        case OP_CRSTAR:
1782        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1783        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1634  for (;;) Line 1798  for (;;)
1798    
1799      /* Anything else is variable length */      /* Anything else is variable length */
1800    
1801      default:      case OP_ANYNL:
1802        case OP_BRAMINZERO:
1803        case OP_BRAPOS:
1804        case OP_BRAPOSZERO:
1805        case OP_BRAZERO:
1806        case OP_CBRAPOS:
1807        case OP_EXTUNI:
1808        case OP_KETRMAX:
1809        case OP_KETRMIN:
1810        case OP_KETRPOS:
1811        case OP_MINPLUS:
1812        case OP_MINPLUSI:
1813        case OP_MINQUERY:
1814        case OP_MINQUERYI:
1815        case OP_MINSTAR:
1816        case OP_MINSTARI:
1817        case OP_MINUPTO:
1818        case OP_MINUPTOI:
1819        case OP_NOTMINPLUS:
1820        case OP_NOTMINPLUSI:
1821        case OP_NOTMINQUERY:
1822        case OP_NOTMINQUERYI:
1823        case OP_NOTMINSTAR:
1824        case OP_NOTMINSTARI:
1825        case OP_NOTMINUPTO:
1826        case OP_NOTMINUPTOI:
1827        case OP_NOTPLUS:
1828        case OP_NOTPLUSI:
1829        case OP_NOTPOSPLUS:
1830        case OP_NOTPOSPLUSI:
1831        case OP_NOTPOSQUERY:
1832        case OP_NOTPOSQUERYI:
1833        case OP_NOTPOSSTAR:
1834        case OP_NOTPOSSTARI:
1835        case OP_NOTPOSUPTO:
1836        case OP_NOTPOSUPTOI:
1837        case OP_NOTQUERY:
1838        case OP_NOTQUERYI:
1839        case OP_NOTSTAR:
1840        case OP_NOTSTARI:
1841        case OP_NOTUPTO:
1842        case OP_NOTUPTOI:
1843        case OP_PLUS:
1844        case OP_PLUSI:
1845        case OP_POSPLUS:
1846        case OP_POSPLUSI:
1847        case OP_POSQUERY:
1848        case OP_POSQUERYI:
1849        case OP_POSSTAR:
1850        case OP_POSSTARI:
1851        case OP_POSUPTO:
1852        case OP_POSUPTOI:
1853        case OP_QUERY:
1854        case OP_QUERYI:
1855        case OP_REF:
1856        case OP_REFI:
1857        case OP_SBRA:
1858        case OP_SBRAPOS:
1859        case OP_SCBRA:
1860        case OP_SCBRAPOS:
1861        case OP_SCOND:
1862        case OP_SKIPZERO:
1863        case OP_STAR:
1864        case OP_STARI:
1865        case OP_TYPEMINPLUS:
1866        case OP_TYPEMINQUERY:
1867        case OP_TYPEMINSTAR:
1868        case OP_TYPEMINUPTO:
1869        case OP_TYPEPLUS:
1870        case OP_TYPEPOSPLUS:
1871        case OP_TYPEPOSQUERY:
1872        case OP_TYPEPOSSTAR:
1873        case OP_TYPEPOSUPTO:
1874        case OP_TYPEQUERY:
1875        case OP_TYPESTAR:
1876        case OP_TYPEUPTO:
1877        case OP_UPTO:
1878        case OP_UPTOI:
1879      return -1;      return -1;
1880    
1881        /* Catch unrecognized opcodes so that when new ones are added they
1882        are not forgotten, as has happened in the past. */
1883    
1884        default:
1885        return -4;
1886      }      }
1887    }    }
1888  /* Control never gets here */  /* Control never gets here */
# Line 1662  Arguments: Line 1909  Arguments:
1909  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1910  */  */
1911    
1912  const uschar *  const pcre_uchar *
1913  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  _pcre_find_bracket(const pcre_uchar *code, BOOL utf8, int number)
1914  {  {
1915  for (;;)  for (;;)
1916    {    {
1917    register int c = *code;    register int c = *code;
1918    
1919    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1920    
1921    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1680  for (;;) Line 1928  for (;;)
1928    
1929    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
1930      {      {
1931      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
1932      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1933      }      }
1934    
1935    /* Handle capturing bracket */    /* Handle capturing bracket */
1936    
1937    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
1938               c == OP_CBRAPOS || c == OP_SCBRAPOS)
1939      {      {
1940      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
1941      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
1942      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1943      }      }
1944    
# Line 1724  for (;;) Line 1973  for (;;)
1973        case OP_MARK:        case OP_MARK:
1974        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
1975        case OP_SKIP_ARG:        case OP_SKIP_ARG:
1976          code += code[1];
1977          break;
1978    
1979        case OP_THEN_ARG:        case OP_THEN_ARG:
1980        code += code[1];        code += code[1];
1981        break;        break;
# Line 1741  for (;;) Line 1993  for (;;)
1993      if (utf8) switch(c)      if (utf8) switch(c)
1994        {        {
1995        case OP_CHAR:        case OP_CHAR:
1996        case OP_CHARNC:        case OP_CHARI:
1997        case OP_EXACT:        case OP_EXACT:
1998          case OP_EXACTI:
1999        case OP_UPTO:        case OP_UPTO:
2000          case OP_UPTOI:
2001        case OP_MINUPTO:        case OP_MINUPTO:
2002          case OP_MINUPTOI:
2003        case OP_POSUPTO:        case OP_POSUPTO:
2004          case OP_POSUPTOI:
2005        case OP_STAR:        case OP_STAR:
2006          case OP_STARI:
2007        case OP_MINSTAR:        case OP_MINSTAR:
2008          case OP_MINSTARI:
2009        case OP_POSSTAR:        case OP_POSSTAR:
2010          case OP_POSSTARI:
2011        case OP_PLUS:        case OP_PLUS:
2012          case OP_PLUSI:
2013        case OP_MINPLUS:        case OP_MINPLUS:
2014          case OP_MINPLUSI:
2015        case OP_POSPLUS:        case OP_POSPLUS:
2016          case OP_POSPLUSI:
2017        case OP_QUERY:        case OP_QUERY:
2018          case OP_QUERYI:
2019        case OP_MINQUERY:        case OP_MINQUERY:
2020          case OP_MINQUERYI:
2021        case OP_POSQUERY:        case OP_POSQUERY:
2022          case OP_POSQUERYI:
2023        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2024        break;        break;
2025        }        }
# Line 1781  Arguments: Line 2046  Arguments:
2046  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2047  */  */
2048    
2049  static const uschar *  static const pcre_uchar *
2050  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf8)
2051  {  {
2052  for (;;)  for (;;)
2053    {    {
# Line 1827  for (;;) Line 2092  for (;;)
2092        case OP_MARK:        case OP_MARK:
2093        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
2094        case OP_SKIP_ARG:        case OP_SKIP_ARG:
2095          code += code[1];
2096          break;
2097    
2098        case OP_THEN_ARG:        case OP_THEN_ARG:
2099        code += code[1];        code += code[1];
2100        break;        break;
# Line 1844  for (;;) Line 2112  for (;;)
2112      if (utf8) switch(c)      if (utf8) switch(c)
2113        {        {
2114        case OP_CHAR:        case OP_CHAR:
2115        case OP_CHARNC:        case OP_CHARI:
2116        case OP_EXACT:        case OP_EXACT:
2117          case OP_EXACTI:
2118        case OP_UPTO:        case OP_UPTO:
2119          case OP_UPTOI:
2120        case OP_MINUPTO:        case OP_MINUPTO:
2121          case OP_MINUPTOI:
2122        case OP_POSUPTO:        case OP_POSUPTO:
2123          case OP_POSUPTOI:
2124        case OP_STAR:        case OP_STAR:
2125          case OP_STARI:
2126        case OP_MINSTAR:        case OP_MINSTAR:
2127          case OP_MINSTARI:
2128        case OP_POSSTAR:        case OP_POSSTAR:
2129          case OP_POSSTARI:
2130        case OP_PLUS:        case OP_PLUS:
2131          case OP_PLUSI:
2132        case OP_MINPLUS:        case OP_MINPLUS:
2133          case OP_MINPLUSI:
2134        case OP_POSPLUS:        case OP_POSPLUS:
2135          case OP_POSPLUSI:
2136        case OP_QUERY:        case OP_QUERY:
2137          case OP_QUERYI:
2138        case OP_MINQUERY:        case OP_MINQUERY:
2139          case OP_MINQUERYI:
2140        case OP_POSQUERY:        case OP_POSQUERY:
2141          case OP_POSQUERYI:
2142        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2143        break;        break;
2144        }        }
# Line 1892  Returns:      TRUE if what is matched co Line 2173  Returns:      TRUE if what is matched co
2173  */  */
2174    
2175  static BOOL  static BOOL
2176  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2177    compile_data *cd)    BOOL utf8, compile_data *cd)
2178  {  {
2179  register int c;  register int c;
2180  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
2181       code < endcode;       code < endcode;
2182       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
2183    {    {
2184    const uschar *ccode;    const pcre_uchar *ccode;
2185    
2186    c = *code;    c = *code;
2187    
# Line 1914  for (code = first_significant_code(code Line 2195  for (code = first_significant_code(code
2195      continue;      continue;
2196      }      }
2197    
   /* Groups with zero repeats can of course be empty; skip them. */  
   
   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)  
     {  
     code += _pcre_OP_lengths[c];  
     do code += GET(code, 1); while (*code == OP_ALT);  
     c = *code;  
     continue;  
     }  
   
2198    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
2199    implies a subroutine call, we can scan it. */    implies a backward reference subroutine call, we can scan it. If it's a
2200      forward reference subroutine call, we can't. To detect forward reference
2201      we have to scan up the list that is kept in the workspace. This function is
2202      called only when doing the real compile, not during the pre-compile that
2203      measures the size of the compiled pattern. */
2204    
2205    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2206      {      {
2207      BOOL empty_branch = FALSE;      const pcre_uchar *scode;
2208      const uschar *scode = cd->start_code + GET(code, 1);      BOOL empty_branch;
2209    
2210        /* Test for forward reference */
2211    
2212        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2213          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2214    
2215        /* Not a forward reference, test for completed backward reference */
2216    
2217        empty_branch = FALSE;
2218        scode = cd->start_code + GET(code, 1);
2219      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2220    
2221        /* Completed backwards reference */
2222    
2223      do      do
2224        {        {
2225        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf8, cd))
# Line 1942  for (code = first_significant_code(code Line 2230  for (code = first_significant_code(code
2230        scode += GET(scode, 1);        scode += GET(scode, 1);
2231        }        }
2232      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2233    
2234      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2235      continue;      continue;
2236      }      }
2237    
2238      /* Groups with zero repeats can of course be empty; skip them. */
2239    
2240      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2241          c == OP_BRAPOSZERO)
2242        {
2243        code += _pcre_OP_lengths[c];
2244        do code += GET(code, 1); while (*code == OP_ALT);
2245        c = *code;
2246        continue;
2247        }
2248    
2249      /* A nested group that is already marked as "could be empty" can just be
2250      skipped. */
2251    
2252      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2253          c == OP_SCBRA || c == OP_SCBRAPOS)
2254        {
2255        do code += GET(code, 1); while (*code == OP_ALT);
2256        c = *code;
2257        continue;
2258        }
2259    
2260    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2261    
2262    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2263          c == OP_CBRA || c == OP_CBRAPOS ||
2264          c == OP_ONCE || c == OP_ONCE_NC ||
2265          c == OP_COND)
2266      {      {
2267      BOOL empty_branch;      BOOL empty_branch;
2268      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 2035  for (code = first_significant_code(code Line 2349  for (code = first_significant_code(code
2349      case OP_ALLANY:      case OP_ALLANY:
2350      case OP_ANYBYTE:      case OP_ANYBYTE:
2351      case OP_CHAR:      case OP_CHAR:
2352      case OP_CHARNC:      case OP_CHARI:
2353      case OP_NOT:      case OP_NOT:
2354        case OP_NOTI:
2355      case OP_PLUS:      case OP_PLUS:
2356      case OP_MINPLUS:      case OP_MINPLUS:
2357      case OP_POSPLUS:      case OP_POSPLUS:
# Line 2076  for (code = first_significant_code(code Line 2391  for (code = first_significant_code(code
2391      case OP_KET:      case OP_KET:
2392      case OP_KETRMAX:      case OP_KETRMAX:
2393      case OP_KETRMIN:      case OP_KETRMIN:
2394        case OP_KETRPOS:
2395      case OP_ALT:      case OP_ALT:
2396      return TRUE;      return TRUE;
2397    
# Line 2084  for (code = first_significant_code(code Line 2400  for (code = first_significant_code(code
2400    
2401  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2402      case OP_STAR:      case OP_STAR:
2403        case OP_STARI:
2404      case OP_MINSTAR:      case OP_MINSTAR:
2405        case OP_MINSTARI:
2406      case OP_POSSTAR:      case OP_POSSTAR:
2407        case OP_POSSTARI:
2408      case OP_QUERY:      case OP_QUERY:
2409        case OP_QUERYI:
2410      case OP_MINQUERY:      case OP_MINQUERY:
2411        case OP_MINQUERYI:
2412      case OP_POSQUERY:      case OP_POSQUERY:
2413        case OP_POSQUERYI:
2414      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2415      break;      break;
2416    
2417      case OP_UPTO:      case OP_UPTO:
2418        case OP_UPTOI:
2419      case OP_MINUPTO:      case OP_MINUPTO:
2420        case OP_MINUPTOI:
2421      case OP_POSUPTO:      case OP_POSUPTO:
2422        case OP_POSUPTOI:
2423      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2424      break;      break;
2425  #endif  #endif
# Line 2105  for (code = first_significant_code(code Line 2430  for (code = first_significant_code(code
2430      case OP_MARK:      case OP_MARK:
2431      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
2432      case OP_SKIP_ARG:      case OP_SKIP_ARG:
2433        code += code[1];
2434        break;
2435    
2436      case OP_THEN_ARG:      case OP_THEN_ARG:
2437      code += code[1];      code += code[1];
2438      break;      break;
# Line 2129  return TRUE; Line 2457  return TRUE;
2457  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2458  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2459  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2460    This function is called only during the real compile, not during the
2461    pre-compile.
2462    
2463  Arguments:  Arguments:
2464    code        points to start of the recursion    code        points to start of the recursion
# Line 2141  Returns:      TRUE if what is matched co Line 2471  Returns:      TRUE if what is matched co
2471  */  */
2472    
2473  static BOOL  static BOOL
2474  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2475    BOOL utf8, compile_data *cd)    branch_chain *bcptr, BOOL utf8, compile_data *cd)
2476  {  {
2477  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2478    {    {
# Line 2179  where Perl recognizes it as the POSIX cl Line 2509  where Perl recognizes it as the POSIX cl
2509  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2510  I think.  I think.
2511    
2512    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2513    It seems that the appearance of a nested POSIX class supersedes an apparent
2514    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2515    a digit.
2516    
2517    In Perl, unescaped square brackets may also appear as part of class names. For
2518    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2519    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2520    seem right at all. PCRE does not allow closing square brackets in POSIX class
2521    names.
2522    
2523  Arguments:  Arguments:
2524    ptr      pointer to the initial [    ptr      pointer to the initial [
2525    endptr   where to return the end pointer    endptr   where to return the end pointer
# Line 2187  Returns:   TRUE or FALSE Line 2528  Returns:   TRUE or FALSE
2528  */  */
2529    
2530  static BOOL  static BOOL
2531  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2532  {  {
2533  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2534  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2535  for (++ptr; *ptr != 0; ptr++)  for (++ptr; *ptr != 0; ptr++)
2536    {    {
2537    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2538        ptr++;
2539      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2540      else
2541      {      {
     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;  
2542      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2543        {        {
2544        *endptr = ptr;        *endptr = ptr;
2545        return TRUE;        return TRUE;
2546        }        }
2547        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2548             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2549              ptr[1] == CHAR_EQUALS_SIGN) &&
2550            check_posix_syntax(ptr, endptr))
2551          return FALSE;
2552      }      }
2553    }    }
2554  return FALSE;  return FALSE;
# Line 2224  Returns:     a value representing the na Line 2572  Returns:     a value representing the na
2572  */  */
2573    
2574  static int  static int
2575  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2576  {  {
2577  const char *pn = posix_names;  const char *pn = posix_names;
2578  register int yield = 0;  register int yield = 0;
2579  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2580    {    {
2581    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2582      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2583    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2584    yield++;    yield++;
2585    }    }
# Line 2271  Returns:     nothing Line 2619  Returns:     nothing
2619  */  */
2620    
2621  static void  static void
2622  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf8, compile_data *cd,
2623    uschar *save_hwm)    pcre_uchar *save_hwm)
2624  {  {
2625  uschar *ptr = group;  pcre_uchar *ptr = group;
2626    
2627  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf8)) != NULL)
2628    {    {
2629    int offset;    int offset;
2630    uschar *hc;    pcre_uchar *hc;
2631    
2632    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2633    reference. */    reference. */
# Line 2324  Arguments: Line 2672  Arguments:
2672  Returns:         new code pointer  Returns:         new code pointer
2673  */  */
2674    
2675  static uschar *  static pcre_uchar *
2676  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2677  {  {
2678  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2679  *code++ = 255;  *code++ = 255;
2680  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2681  PUT(code, LINK_SIZE, 0);                       /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2682  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2683  }  }
2684    
2685    
# Line 2353  Returns:             nothing Line 2701  Returns:             nothing
2701  */  */
2702    
2703  static void  static void
2704  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2705  {  {
2706  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2707  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
# Line 2491  Returns:        TRUE if possessifying is Line 2839  Returns:        TRUE if possessifying is
2839  */  */
2840    
2841  static BOOL  static BOOL
2842  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,  check_auto_possessive(const pcre_uchar *previous, BOOL utf8,
2843    int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2844  {  {
2845  int c, next;  int c, next;
2846  int op_code = *previous++;  int op_code = *previous++;
# Line 2506  if ((options & PCRE_EXTENDED) != 0) Line 2854  if ((options & PCRE_EXTENDED) != 0)
2854      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2855      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2856        {        {
2857        while (*(++ptr) != 0)        ptr++;
2858          while (*ptr != 0)
2859            {
2860          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2861            ptr++;
2862    #ifdef SUPPORT_UTF8
2863            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2864    #endif
2865            }
2866        }        }
2867      else break;      else break;
2868      }      }
# Line 2543  if ((options & PCRE_EXTENDED) != 0) Line 2898  if ((options & PCRE_EXTENDED) != 0)
2898      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2899      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2900        {        {
2901        while (*(++ptr) != 0)        ptr++;
2902          while (*ptr != 0)
2903            {
2904          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2905            ptr++;
2906    #ifdef SUPPORT_UTF8
2907            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2908    #endif
2909            }
2910        }        }
2911      else break;      else break;
2912      }      }
# Line 2553  if ((options & PCRE_EXTENDED) != 0) Line 2915  if ((options & PCRE_EXTENDED) != 0)
2915  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
2916    
2917  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2918    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2919      return FALSE;      return FALSE;
2920    
2921  /* Now compare the next item with the previous opcode. First, handle cases when  /* Now compare the next item with the previous opcode. First, handle cases when
# Line 2569  if (next >= 0) switch(op_code) Line 2931  if (next >= 0) switch(op_code)
2931  #endif  #endif
2932    return c != next;    return c != next;
2933    
2934    /* For CHARNC (caseless character) we must check the other case. If we have    /* For CHARI (caseless character) we must check the other case. If we have
2935    Unicode property support, we can use it to test the other case of    Unicode property support, we can use it to test the other case of
2936    high-valued characters. */    high-valued characters. */
2937    
2938    case OP_CHARNC:    case OP_CHARI:
2939  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2940    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
2941  #else  #else
# Line 2596  if (next >= 0) switch(op_code) Line 2958  if (next >= 0) switch(op_code)
2958  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
2959    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
2960    
2961    /* For OP_NOT, its data is always a single-byte character. */    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2962      opcodes are not used for multi-byte characters, because they are coded using
2963      an XCLASS instead. */
2964    
2965    case OP_NOT:    case OP_NOT:
2966      return (c = *previous) == next;
2967    
2968      case OP_NOTI:
2969    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
   if ((options & PCRE_CASELESS) == 0) return FALSE;  
2970  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2971    if (utf8)    if (utf8)
2972      {      {
# Line 2705  replaced by OP_PROP codes when PCRE_UCP Line 3071  replaced by OP_PROP codes when PCRE_UCP
3071  switch(op_code)  switch(op_code)
3072    {    {
3073    case OP_CHAR:    case OP_CHAR:
3074    case OP_CHARNC:    case OP_CHARI:
3075  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3076    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3077  #else  #else
# Line 2811  switch(op_code) Line 3177  switch(op_code)
3177        to the original \d etc. At this point, ptr will point to a zero byte. */        to the original \d etc. At this point, ptr will point to a zero byte. */
3178    
3179        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3180          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3181            return FALSE;            return FALSE;
3182    
3183        /* Do the property check. */        /* Do the property check. */
# Line 2892  Arguments: Line 3258  Arguments:
3258    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3259    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
3260    bcptr          points to current branch chain    bcptr          points to current branch chain
3261      cond_depth     conditional nesting depth
3262    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3263    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3264                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 2901  Returns:         TRUE on success Line 3268  Returns:         TRUE on success
3268  */  */
3269    
3270  static BOOL  static BOOL
3271  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3272    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, int *firstbyteptr,
3273    compile_data *cd, int *lengthptr)    int *reqbyteptr, branch_chain *bcptr, int cond_depth, compile_data *cd,
3274      int *lengthptr)
3275  {  {
3276  int repeat_type, op_type;  int repeat_type, op_type;
3277  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 2912  int greedy_default, greedy_non_default; Line 3280  int greedy_default, greedy_non_default;
3280  int firstbyte, reqbyte;  int firstbyte, reqbyte;
3281  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
3282  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
3283  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3284  int after_manual_callout = 0;  int after_manual_callout = 0;
3285  int length_prevgroup = 0;  int length_prevgroup = 0;
3286  register int c;  register int c;
3287  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3288  uschar *last_code = code;  pcre_uchar *last_code = code;
3289  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3290  uschar *tempcode;  pcre_uchar *tempcode;
3291  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3292  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
3293  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3294  const uschar *tempptr;  const pcre_uchar *tempptr;
3295  const uschar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
3296  uschar *previous = NULL;  pcre_uchar *previous = NULL;
3297  uschar *previous_callout = NULL;  pcre_uchar *previous_callout = NULL;
3298  uschar *save_hwm = NULL;  pcre_uchar *save_hwm = NULL;
3299  uschar classbits[32];  pcre_uchar classbits[32];
3300    
3301    /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3302    must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3303    dynamically as we process the pattern. */
3304    
3305  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3306  BOOL class_utf8;  BOOL class_utf8;
3307  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
3308  uschar *class_utf8data;  pcre_uint8 *class_utf8data;
3309  uschar *class_utf8data_base;  pcre_uint8 *class_utf8data_base;
3310  uschar utf8_char[6];  pcre_uint8 utf8_char[6];
3311  #else  #else
3312  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
 uschar *utf8_char = NULL;  
3313  #endif  #endif
3314    
3315  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 2989  for (;; ptr++) Line 3360  for (;; ptr++)
3360    int subfirstbyte;    int subfirstbyte;
3361    int terminator;    int terminator;
3362    int mclength;    int mclength;
3363    uschar mcbuffer[8];    int tempbracount;
3364      pcre_uchar mcbuffer[8];
3365    
3366    /* Get next byte in the pattern */    /* Get next byte in the pattern */
3367    
# Line 3036  for (;; ptr++) Line 3408  for (;; ptr++)
3408        }        }
3409    
3410      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3411      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3412          c));
3413    
3414      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3415      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 3110  for (;; ptr++) Line 3483  for (;; ptr++)
3483      previous_callout = NULL;      previous_callout = NULL;
3484      }      }
3485    
3486    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3487    
3488    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3489      {      {
3490      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
3491      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3492        {        {
3493        while (*(++ptr) != 0)        ptr++;
3494          while (*ptr != 0)
3495          {          {
3496          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3497            ptr++;
3498    #ifdef SUPPORT_UTF8
3499            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3500    #endif
3501          }          }
3502        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3503    
# Line 3164  for (;; ptr++) Line 3542  for (;; ptr++)
3542      the setting of any following char as a first character. */      the setting of any following char as a first character. */
3543    
3544      case CHAR_CIRCUMFLEX_ACCENT:      case CHAR_CIRCUMFLEX_ACCENT:
3545        previous = NULL;
3546      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3547        {        {
3548        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3549          *code++ = OP_CIRCM;
3550        }        }
3551      previous = NULL;      else *code++ = OP_CIRC;
     *code++ = OP_CIRC;  
3552      break;      break;
3553    
3554      case CHAR_DOLLAR_SIGN:      case CHAR_DOLLAR_SIGN:
3555      previous = NULL;      previous = NULL;
3556      *code++ = OP_DOLL;      *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3557      break;      break;
3558    
3559      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
# Line 3238  for (;; ptr++) Line 3617  for (;; ptr++)
3617          {          {
3618          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
3619            ptr++;            ptr++;
3620          else if (strncmp((const char *)ptr+1,          else if (STRNCMP_UC_C8(ptr + 1,
3621                            STR_Q STR_BACKSLASH STR_E, 3) == 0)                            STR_Q STR_BACKSLASH STR_E, 3) == 0)
3622            ptr += 3;            ptr += 3;
3623          else          else
# Line 3281  for (;; ptr++) Line 3660  for (;; ptr++)
3660      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3661      */      */
3662    
3663      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3664    
3665  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3666      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
# Line 3295  for (;; ptr++) Line 3674  for (;; ptr++)
3674    
3675      if (c != 0) do      if (c != 0) do
3676        {        {
3677        const uschar *oldptr;        const pcre_uchar *oldptr;
3678    
3679  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3680        if (utf8 && c > 127)        if (utf8 && c > 127)
# Line 3341  for (;; ptr++) Line 3720  for (;; ptr++)
3720          {          {
3721          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3722          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3723          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3724          uschar pbits[32];          pcre_uint8 pbits[32];
3725    
3726          if (ptr[1] != CHAR_COLON)          if (ptr[1] != CHAR_COLON)
3727            {            {
# Line 3397  for (;; ptr++) Line 3776  for (;; ptr++)
3776          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3777    
3778          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3779            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3780    
3781          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3782    
# Line 3459  for (;; ptr++) Line 3838  for (;; ptr++)
3838    
3839          if (c < 0)          if (c < 0)
3840            {            {
3841            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3842            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
3843    
3844            switch (-c)            switch (-c)
# Line 3494  for (;; ptr++) Line 3873  for (;; ptr++)
3873              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3874              continue;              continue;
3875    
3876                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3877                if it was previously set by something earlier in the character
3878                class. */
3879    
3880              case ESC_s:              case ESC_s:
3881              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
3882              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
3883                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3884              continue;              continue;
3885    
3886              case ESC_S:              case ESC_S:
# Line 3915  for (;; ptr++) Line 4299  for (;; ptr++)
4299    
4300      In UTF-8 mode, we can optimize the negative case only if there were no      In UTF-8 mode, we can optimize the negative case only if there were no
4301      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4302      operate on single-bytes only. This is an historical hangover. Maybe one day      operate on single-byte characters only. This is an historical hangover.
4303      we can tidy these opcodes to handle multi-byte characters.      Maybe one day we can tidy these opcodes to handle multi-byte characters.
4304    
4305      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
4306      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4307      that OP_NOT does not support multibyte characters. In the positive case, it      Note that OP_NOT[I] does not support multibyte characters. In the positive
4308      can cause firstbyte to be set. Otherwise, there can be no first char if      case, it can cause firstbyte to be set. Otherwise, there can be no first
4309      this item is first, whatever repeat count may follow. In the case of      char if this item is first, whatever repeat count may follow. In the case
4310      reqbyte, save the previous value for reinstating. */      of reqbyte, save the previous value for reinstating. */
4311    
4312  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4313      if (class_charcount == 1 && !class_utf8 &&      if (class_charcount == 1 && !class_utf8 &&
# Line 3934  for (;; ptr++) Line 4318  for (;; ptr++)
4318        {        {
4319        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4320    
4321        /* The OP_NOT opcode works on one-byte characters only. */        /* The OP_NOT[I] opcodes work on one-byte characters only. */
4322    
4323        if (negate_class)        if (negate_class)
4324          {          {
4325          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4326          zerofirstbyte = firstbyte;          zerofirstbyte = firstbyte;
4327          *code++ = OP_NOT;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4328          *code++ = class_lastchar;          *code++ = class_lastchar;
4329          break;          break;
4330          }          }
# Line 4068  for (;; ptr++) Line 4452  for (;; ptr++)
4452      op_type = 0;                    /* Default single-char op codes */      op_type = 0;                    /* Default single-char op codes */
4453      possessive_quantifier = FALSE;  /* Default not possessive quantifier */      possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4454    
4455      /* Save start of previous item, in case we have to move it up to make space      /* Save start of previous item, in case we have to move it up in order to
4456      for an inserted OP_ONCE for the additional '+' extension. */      insert something before it. */
4457    
4458      tempcode = previous;      tempcode = previous;
4459    
# Line 4092  for (;; ptr++) Line 4476  for (;; ptr++)
4476        }        }
4477      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4478    
4479        /* If previous was a recursion call, wrap it in atomic brackets so that
4480        previous becomes the atomic group. All recursions were so wrapped in the
4481        past, but it no longer happens for non-repeated recursions. In fact, the
4482        repeated ones could be re-implemented independently so as not to need this,
4483        but for the moment we rely on the code for repeating groups. */
4484    
4485        if (*previous == OP_RECURSE)
4486          {
4487          memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
4488          *previous = OP_ONCE;
4489          PUT(previous, 1, 2 + 2*LINK_SIZE);
4490          previous[2 + 2*LINK_SIZE] = OP_KET;
4491          PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4492          code += 2 + 2 * LINK_SIZE;
4493          length_prevgroup = 3 + 3*LINK_SIZE;
4494    
4495          /* When actually compiling, we need to check whether this was a forward
4496          reference, and if so, adjust the offset. */
4497    
4498          if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4499            {
4500            int offset = GET(cd->hwm, -LINK_SIZE);
4501            if (offset == previous + 1 - cd->start_code)
4502              PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4503            }
4504          }
4505    
4506        /* Now handle repetition for the different types of item. */
4507    
4508      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4509      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
4510      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
4511      the first thing in a branch because the x will have gone into firstbyte      the first thing in a branch because the x will have gone into firstbyte
4512      instead.  */      instead.  */
4513    
4514      if (*previous == OP_CHAR || *previous == OP_CHARNC)      if (*previous == OP_CHAR || *previous == OP_CHARI)
4515        {        {
4516          op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4517    
4518        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF-8 characters that take up more than one byte. It's
4519        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4520        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus 0x80 to flag that it's a
# Line 4108  for (;; ptr++) Line 4523  for (;; ptr++)
4523  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4524        if (utf8 && (code[-1] & 0x80) != 0)        if (utf8 && (code[-1] & 0x80) != 0)
4525          {          {
4526          uschar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4527          while((*lastchar & 0xc0) == 0x80) lastchar--;          while((*lastchar & 0xc0) == 0x80) lastchar--;
4528          c = code - lastchar;            /* Length of UTF-8 character */          c = code - lastchar;            /* Length of UTF-8 character */
4529          memcpy(utf8_char, lastchar, c); /* Save the char */          memcpy(utf8_char, lastchar, c); /* Save the char */
# Line 4144  for (;; ptr++) Line 4559  for (;; ptr++)
4559      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
4560      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
4561      character repeats by setting op_type to add a suitable offset into      character repeats by setting op_type to add a suitable offset into
4562      repeat_type. We can also test for auto-possessification. OP_NOT is      repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
4563      currently used only for single-byte chars. */      are currently used only for single-byte chars. */
4564    
4565      else if (*previous == OP_NOT)      else if (*previous == OP_NOT || *previous == OP_NOTI)
4566        {        {
4567        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
4568        c = previous[1];        c = previous[1];
4569        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4570            repeat_max < 0 &&            repeat_max < 0 &&
# Line 4170  for (;; ptr++) Line 4585  for (;; ptr++)
4585    
4586      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
4587        {        {
4588        uschar *oldcode;        pcre_uchar *oldcode;
4589        int prop_type, prop_value;        int prop_type, prop_value;
4590        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4591        c = *previous;        c = *previous;
# Line 4346  for (;; ptr++) Line 4761  for (;; ptr++)
4761  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4762               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
4763  #endif  #endif
4764               *previous == OP_REF)               *previous == OP_REF ||
4765                 *previous == OP_REFI)
4766        {        {
4767        if (repeat_max == 0)        if (repeat_max == 0)
4768          {          {
# Line 4380  for (;; ptr++) Line 4796  for (;; ptr++)
4796        }        }
4797    
4798      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
4799      cases. */      cases. Note that at this point we can encounter only the "basic" bracket
4800        opcodes such as BRA and CBRA, as this is the place where they get converted
4801        into the more special varieties such as BRAPOS and SBRA. A test for >=
4802        OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
4803        ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
4804        repetition of assertions, but now it does, for Perl compatibility. */
4805    
4806      else if (*previous == OP_BRA  || *previous == OP_CBRA ||      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
              *previous == OP_ONCE || *previous == OP_COND)  
4807        {        {
4808        register int i;        register int i;
       int ketoffset = 0;  
4809        int len = (int)(code - previous);        int len = (int)(code - previous);
4810        uschar *bralink = NULL;        pcre_uchar *bralink = NULL;
4811          pcre_uchar *brazeroptr = NULL;
4812    
4813        /* Repeating a DEFINE group is pointless */        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
4814          we just ignore the repeat. */
4815    
4816        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4817          {          goto END_REPEAT;
         *errorcodeptr = ERR55;  
         goto FAILED;  
         }  
4818    
4819        /* If the maximum repeat count is unlimited, find the end of the bracket        /* There is no sense in actually repeating assertions. The only potential
4820        by scanning through from the start, and compute the offset back to it        use of repetition is in cases when the assertion is optional. Therefore,
4821        from the current code pointer. There may be an OP_OPT setting following        if the minimum is greater than zero, just ignore the repeat. If the
4822        the final KET, so we can't find the end just by going back from the code        maximum is not zero or one, set it to 1. */
4823        pointer. */  
4824          if (*previous < OP_ONCE)    /* Assertion */
4825        if (repeat_max == -1)          {
4826          {          if (repeat_min > 0) goto END_REPEAT;
4827          register uschar *ket = previous;          if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
         do ket += GET(ket, 1); while (*ket != OP_KET);  
         ketoffset = (int)(code - ket);  
4828          }          }
4829    
4830        /* The case of a zero minimum is special because of the need to stick        /* The case of a zero minimum is special because of the need to stick
# Line 4429  for (;; ptr++) Line 4845  for (;; ptr++)
4845          **   goto END_REPEAT;          **   goto END_REPEAT;
4846          **   }          **   }
4847    
4848          However, that fails when a group is referenced as a subroutine from          However, that fails when a group or a subgroup within it is referenced
4849          elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it          as a subroutine from elsewhere in the pattern, so now we stick in
4850          so that it is skipped on execution. As we don't have a list of which          OP_SKIPZERO in front of it so that it is skipped on execution. As we
4851          groups are referenced, we cannot do this selectively.          don't have a list of which groups are referenced, we cannot do this
4852            selectively.
4853    
4854          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4855          and do no more at this point. However, we do need to adjust any          and do no more at this point. However, we do need to adjust any
# Line 4452  for (;; ptr++) Line 4869  for (;; ptr++)
4869              *previous++ = OP_SKIPZERO;              *previous++ = OP_SKIPZERO;
4870              goto END_REPEAT;              goto END_REPEAT;
4871              }              }
4872              brazeroptr = previous;    /* Save for possessive optimizing */
4873            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
4874            }            }
4875    
# Line 4521  for (;; ptr++) Line 4939  for (;; ptr++)
4939              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4940              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
4941                {                {
4942                uschar *hc;                pcre_uchar *hc;
4943                uschar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
4944                memcpy(code, previous, len);                memcpy(code, previous, len);
4945                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4946                  {                  {
# Line 4573  for (;; ptr++) Line 4991  for (;; ptr++)
4991    
4992          else for (i = repeat_max - 1; i >= 0; i--)          else for (i = repeat_max - 1; i >= 0; i--)
4993            {            {
4994            uschar *hc;            pcre_uchar *hc;
4995            uschar *this_hwm = cd->hwm;            pcre_uchar *this_hwm = cd->hwm;
4996    
4997            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
4998    
# Line 4607  for (;; ptr++) Line 5025  for (;; ptr++)
5025            {            {
5026            int oldlinkoffset;            int oldlinkoffset;
5027            int offset = (int)(code - bralink + 1);            int offset = (int)(code - bralink + 1);
5028            uschar *bra = code - offset;            pcre_uchar *bra = code - offset;
5029            oldlinkoffset = GET(bra, 1);            oldlinkoffset = GET(bra, 1);
5030            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5031            *code++ = OP_KET;            *code++ = OP_KET;
# Line 4616  for (;; ptr++) Line 5034  for (;; ptr++)
5034            }            }
5035          }          }
5036    
5037        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. For
5038        can't just offset backwards from the current code point, because we        ONCE brackets, that's all we need to do. However, possessively repeated
5039        don't know if there's been an options resetting after the ket. The        ONCE brackets can be converted into non-capturing brackets, as the
5040        correct offset was computed above.        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5041          deal with possessive ONCEs specially.
5042        Then, when we are doing the actual compile phase, check to see whether  
5043        this group is a non-atomic one that could match an empty string. If so,        Otherwise, when we are doing the actual compile phase, check to see
5044          whether this group is one that could match an empty string. If so,
5045        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5046        that runtime checking can be done. [This check is also applied to        that runtime checking can be done. [This check is also applied to ONCE
5047        atomic groups at runtime, but in a different way.] */        groups at runtime, but in a different way.]
5048    
5049          Then, if the quantifier was possessive and the bracket is not a
5050          conditional, we convert the BRA code to the POS form, and the KET code to
5051          KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5052          subpattern at both the start and at the end.) The use of special opcodes
5053          makes it possible to reduce greatly the stack usage in pcre_exec(). If
5054          the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5055    
5056          Then, if the minimum number of matches is 1 or 0, cancel the possessive
5057          flag so that the default action below, of wrapping everything inside
5058          atomic brackets, does not happen. When the minimum is greater than 1,
5059          there will be earlier copies of the group, and so we still have to wrap
5060          the whole thing. */
5061    
5062        else        else
5063          {          {
5064          uschar *ketcode = code - ketoffset;          pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5065          uschar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5066          *ketcode = OP_KETRMAX + repeat_type;  
5067          if (lengthptr == NULL && *bracode != OP_ONCE)          /* Convert possessive ONCE brackets to non-capturing */
5068    
5069            if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5070                possessive_quantifier) *bracode = OP_BRA;
5071    
5072            /* For non-possessive ONCE brackets, all we need to do is to
5073            set the KET. */
5074    
5075            if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5076              *ketcode = OP_KETRMAX + repeat_type;
5077    
5078            /* Handle non-ONCE brackets and possessive ONCEs (which have been
5079            converted to non-capturing above). */
5080    
5081            else
5082            {            {
5083            uschar *scode = bracode;            /* In the compile phase, check for empty string matching. */
5084            do  
5085              if (lengthptr == NULL)
5086              {              {
5087              if (could_be_empty_branch(scode, ketcode, utf8, cd))              pcre_uchar *scode = bracode;
5088                do
5089                {                {
5090                *bracode += OP_SBRA - OP_BRA;                if (could_be_empty_branch(scode, ketcode, utf8, cd))
5091                break;                  {
5092                    *bracode += OP_SBRA - OP_BRA;
5093                    break;
5094                    }
5095                  scode += GET(scode, 1);
5096                }                }
5097              scode += GET(scode, 1);              while (*scode == OP_ALT);
5098              }              }
5099            while (*scode == OP_ALT);  
5100              /* Handle possessive quantifiers. */
5101    
5102              if (possessive_quantifier)
5103                {
5104                /* For COND brackets, we wrap the whole thing in a possessively
5105                repeated non-capturing bracket, because we have not invented POS
5106                versions of the COND opcodes. Because we are moving code along, we
5107                must ensure that any pending recursive references are updated. */
5108    
5109                if (*bracode == OP_COND || *bracode == OP_SCOND)
5110                  {
5111                  int nlen = (int)(code - bracode);
5112                  *code = OP_END;
5113                  adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm);
5114                  memmove(bracode + 1+LINK_SIZE, bracode, nlen);
5115                  code += 1 + LINK_SIZE;
5116                  nlen += 1 + LINK_SIZE;
5117                  *bracode = OP_BRAPOS;
5118                  *code++ = OP_KETRPOS;
5119                  PUTINC(code, 0, nlen);
5120                  PUT(bracode, 1, nlen);
5121                  }
5122    
5123                /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5124    
5125                else
5126                  {
5127                  *bracode += 1;              /* Switch to xxxPOS opcodes */
5128                  *ketcode = OP_KETRPOS;
5129                  }
5130    
5131                /* If the minimum is zero, mark it as possessive, then unset the
5132                possessive flag when the minimum is 0 or 1. */
5133    
5134                if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5135                if (repeat_min < 2) possessive_quantifier = FALSE;
5136                }
5137    
5138              /* Non-possessive quantifier */
5139    
5140              else *ketcode = OP_KETRMAX + repeat_type;
5141            }            }
5142          }          }
5143        }        }
# Line 4665  for (;; ptr++) Line 5158  for (;; ptr++)
5158        }        }
5159    
5160      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', or if certain optimization
5161      tests above succeeded, possessive_quantifier is TRUE. For some of the      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5162      simpler opcodes, there is an special alternative opcode for this. For      there are special alternative opcodes for this case. For anything else, we
5163      anything else, we wrap the entire repeated item inside OP_ONCE brackets.      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5164      The '+' notation is just syntactic sugar, taken from Sun's Java package,      notation is just syntactic sugar, taken from Sun's Java package, but the
5165      but the special opcodes can optimize it a bit. The repeated item starts at      special opcodes can optimize it.
5166      tempcode, not at previous, which might be the first part of a string whose  
5167      (former) last char we repeated.      Some (but not all) possessively repeated subpatterns have already been
5168        completely handled in the code just above. For them, possessive_quantifier
5169        is always FALSE at this stage.
5170    
5171        Note that the repeated item starts at tempcode, not at previous, which
5172        might be the first part of a string whose (former) last char we repeated.
5173    
5174      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5175      an 'upto' may follow. We skip over an 'exact' item, and then test the      an 'upto' may follow. We skip over an 'exact' item, and then test the
# Line 4702  for (;; ptr++) Line 5200  for (;; ptr++)
5200          case OP_QUERY: *tempcode = OP_POSQUERY; break;          case OP_QUERY: *tempcode = OP_POSQUERY; break;
5201          case OP_UPTO:  *tempcode = OP_POSUPTO; break;          case OP_UPTO:  *tempcode = OP_POSUPTO; break;
5202    
5203          case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;          case OP_STARI:  *tempcode = OP_POSSTARI; break;
5204          case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;          case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
5205          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;          case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5206          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;          case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
5207    
5208          case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;          case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
5209          case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;          case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
5210          case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;          case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5211          case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;          case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
5212    
5213            case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
5214            case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
5215            case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5216            case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
5217    
5218            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
5219            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
5220            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5221            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
5222    
5223          /* Because we are moving code along, we must ensure that any          /* Because we are moving code along, we must ensure that any
5224          pending recursive references are updated. */          pending recursive references are updated. */
5225    
# Line 4759  for (;; ptr++) Line 5267  for (;; ptr++)
5267        int i, namelen;        int i, namelen;
5268        int arglen = 0;        int arglen = 0;
5269        const char *vn = verbnames;        const char *vn = verbnames;
5270        const uschar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5271        const uschar *arg = NULL;        const pcre_uchar *arg = NULL;
5272        previous = NULL;        previous = NULL;
5273        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
5274        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5275    
5276          /* It appears that Perl allows any characters whatsoever, other than
5277          a closing parenthesis, to appear in arguments, so we no longer insist on
5278          letters, digits, and underscores. */
5279    
5280        if (*ptr == CHAR_COLON)        if (*ptr == CHAR_COLON)
5281          {          {
5282          arg = ++ptr;          arg = ++ptr;
5283          while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
           || *ptr == '_') ptr++;  
5284          arglen = (int)(ptr - arg);          arglen = (int)(ptr - arg);
5285          }          }
5286    
# Line 4784  for (;; ptr++) Line 5295  for (;; ptr++)
5295        for (i = 0; i < verbcount; i++)        for (i = 0; i < verbcount; i++)
5296          {          {
5297          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
5298              strncmp((char *)name, vn, namelen) == 0)              STRNCMP_UC_C8(name, vn, namelen) == 0)
5299            {            {
5300            /* Check for open captures before ACCEPT */            /* Check for open captures before ACCEPT and convert it to
5301              ASSERT_ACCEPT if in an assertion. */
5302    
5303            if (verbs[i].op == OP_ACCEPT)            if (verbs[i].op == OP_ACCEPT)
5304              {              {
5305              open_capitem *oc;              open_capitem *oc;
5306                if (arglen != 0)
5307                  {
5308                  *errorcodeptr = ERR59;
5309                  goto FAILED;
5310                  }
5311              cd->had_accept = TRUE;              cd->had_accept = TRUE;
5312              for (oc = cd->open_caps; oc != NULL; oc = oc->next)              for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5313                {                {
5314                *code++ = OP_CLOSE;                *code++ = OP_CLOSE;
5315                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
5316                }                }
5317                *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5318    
5319                /* Do not set firstbyte after *ACCEPT */
5320                if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5321              }              }
5322    
5323            /* Handle the cases with/without an argument */            /* Handle other cases with/without an argument */
5324    
5325            if (arglen == 0)            else if (arglen == 0)
5326              {              {
5327              if (verbs[i].op < 0)   /* Argument is mandatory */              if (verbs[i].op < 0)   /* Argument is mandatory */
5328                {                {
5329                *errorcodeptr = ERR66;                *errorcodeptr = ERR66;
5330                goto FAILED;                goto FAILED;
5331                }                }
5332              *code++ = verbs[i].op;              *code = verbs[i].op;
5333                if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
5334              }              }
5335    
5336            else            else
# Line 4818  for (;; ptr++) Line 5340  for (;; ptr++)
5340                *errorcodeptr = ERR59;                *errorcodeptr = ERR59;
5341                goto FAILED;                goto FAILED;
5342                }                }
5343              *code++ = verbs[i].op_arg;              *code = verbs[i].op_arg;
5344                if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
5345              *code++ = arglen;              *code++ = arglen;
5346              memcpy(code, arg, arglen);              memcpy(code, arg, arglen);
5347              code += arglen;              code += arglen;
# Line 4843  for (;; ptr++) Line 5366  for (;; ptr++)
5366        {        {
5367        int i, set, unset, namelen;        int i, set, unset, namelen;
5368        int *optset;        int *optset;
5369        const uschar *name;        const pcre_uchar *name;
5370        uschar *slot;        pcre_uchar *slot;
5371    
5372        switch (*(++ptr))        switch (*(++ptr))
5373          {          {
# Line 4996  for (;; ptr++) Line 5519  for (;; ptr++)
5519          slot = cd->name_table;          slot = cd->name_table;
5520          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
5521            {            {
5522            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5523            slot += cd->name_entry_size;            slot += cd->name_entry_size;
5524            }            }
5525    
# Line 5012  for (;; ptr++) Line 5535  for (;; ptr++)
5535          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
5536    
5537          else if ((i = find_parens(cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
5538                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0, utf8)) > 0)
5539            {            {
5540            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
5541            code[1+LINK_SIZE]++;            code[1+LINK_SIZE]++;
# Line 5053  for (;; ptr++) Line 5576  for (;; ptr++)
5576          /* Similarly, check for the (?(DEFINE) "condition", which is always          /* Similarly, check for the (?(DEFINE) "condition", which is always
5577          false. */          false. */
5578    
5579          else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)          else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5580            {            {
5581            code[1+LINK_SIZE] = OP_DEF;            code[1+LINK_SIZE] = OP_DEF;
5582            skipbytes = 1;            skipbytes = 1;
# Line 5080  for (;; ptr++) Line 5603  for (;; ptr++)
5603          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5604          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5605          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
5606            cd->assert_depth += 1;
5607          ptr++;          ptr++;
5608          break;          break;
5609    
# Line 5094  for (;; ptr++) Line 5618  for (;; ptr++)
5618            continue;            continue;
5619            }            }
5620          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
5621            cd->assert_depth += 1;
5622          break;          break;
5623    
5624    
# Line 5103  for (;; ptr++) Line 5628  for (;; ptr++)
5628            {            {
5629            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5630            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
5631              cd->assert_depth += 1;
5632            ptr += 2;            ptr += 2;
5633            break;            break;
5634    
5635            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5636            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
5637              cd->assert_depth += 1;
5638            ptr += 2;            ptr += 2;
5639            break;            break;
5640    
# Line 5129  for (;; ptr++) Line 5656  for (;; ptr++)
5656    
5657          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5658          case CHAR_C:                 /* Callout - may be followed by digits; */          case CHAR_C:                 /* Callout - may be followed by digits; */
5659          previous_callout = code;  /* Save for later completion */          previous_callout = code;     /* Save for later completion */
5660          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1;    /* Skip one item before completing */
5661          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
5662            {            {
5663            int n = 0;            int n = 0;
# Line 5266  for (;; ptr++) Line 5793  for (;; ptr++)
5793    
5794              if (!dupname)              if (!dupname)
5795                {                {
5796                uschar *cslot = cd->name_table;                pcre_uchar *cslot = cd->name_table;
5797                for (i = 0; i < cd->names_found; i++)                for (i = 0; i < cd->names_found; i++)
5798                  {                  {
5799                  if (cslot != slot)                  if (cslot != slot)
# Line 5313  for (;; ptr++) Line 5840  for (;; ptr++)
5840          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5841          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
5842    
5843          /* In the pre-compile phase, do a syntax check and set a dummy          /* In the pre-compile phase, do a syntax check. We used to just set
5844          reference number. */          a dummy reference number, because it was not used in the first pass.
5845            However, with the change of recursive back references to be atomic,
5846            we have to look for the number so that this state can be identified, as
5847            otherwise the incorrect length is computed. If it's not a backwards
5848            reference, the dummy number will do. */
5849    
5850          if (lengthptr != NULL)          if (lengthptr != NULL)
5851            {            {
5852              const pcre_uchar *temp;
5853    
5854            if (namelen == 0)            if (namelen == 0)
5855              {              {
5856              *errorcodeptr = ERR62;              *errorcodeptr = ERR62;
# Line 5333  for (;; ptr++) Line 5866  for (;; ptr++)
5866              *errorcodeptr = ERR48;              *errorcodeptr = ERR48;
5867              goto FAILED;              goto FAILED;
5868              }              }
5869            recno = 0;  
5870              /* The name table does not exist in the first pass, so we cannot
5871              do a simple search as in the code below. Instead, we have to scan the
5872              pattern to find the number. It is important that we scan it only as
5873              far as we have got because the syntax of named subpatterns has not
5874              been checked for the rest of the pattern, and find_parens() assumes
5875              correct syntax. In any case, it's a waste of resources to scan
5876              further. We stop the scan at the current point by temporarily
5877              adjusting the value of cd->end_pattern. */
5878    
5879              temp = cd->end_pattern;
5880              cd->end_pattern = ptr;
5881              recno = find_parens(cd, name, namelen,
5882                (options & PCRE_EXTENDED) != 0, utf8);
5883              cd->end_pattern = temp;
5884              if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
5885            }            }
5886    
5887          /* In the real compile, seek the name in the table. We check the name          /* In the real compile, seek the name in the table. We check the name
# Line 5346  for (;; ptr++) Line 5894  for (;; ptr++)
5894            slot = cd->name_table;            slot = cd->name_table;
5895            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
5896              {              {
5897              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
5898                  slot[2+namelen] == 0)                  slot[2+namelen] == 0)
5899                break;                break;
5900              slot += cd->name_entry_size;              slot += cd->name_entry_size;
# Line 5358  for (;; ptr++) Line 5906  for (;; ptr++)
5906              }              }
5907            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
5908                      find_parens(cd, name, namelen,                      find_parens(cd, name, namelen,
5909                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
5910              {              {
5911              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
5912              goto FAILED;              goto FAILED;
# Line 5383  for (;; ptr++) Line 5931  for (;; ptr++)
5931          case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:          case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5932          case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:          case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5933            {            {
5934            const uschar *called;            const pcre_uchar *called;
5935            terminator = CHAR_RIGHT_PARENTHESIS;            terminator = CHAR_RIGHT_PARENTHESIS;
5936    
5937            /* Come here from the \g<...> and \g'...' code (Oniguruma            /* Come here from the \g<...> and \g'...' code (Oniguruma
# Line 5469  for (;; ptr++) Line 6017  for (;; ptr++)
6017              if (called == NULL)              if (called == NULL)
6018                {                {
6019                if (find_parens(cd, NULL, recno,                if (find_parens(cd, NULL, recno,
6020                      (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0, utf8) < 0)
6021                  {                  {
6022                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
6023                  goto FAILED;                  goto FAILED;
# Line 5477  for (;; ptr++) Line 6025  for (;; ptr++)
6025    
6026                /* Fudge the value of "called" so that when it is inserted as an                /* Fudge the value of "called" so that when it is inserted as an
6027                offset below, what is actually inserted is the reference number                offset below, what is actually inserted is the reference number
6028                of the group. */                of the group. Then remember the forward reference. */
6029    
6030                called = cd->start_code + recno;                called = cd->start_code + recno;
6031                PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6032                }                }
6033    
6034              /* If not a forward reference, and the subpattern is still open,              /* If not a forward reference, and the subpattern is still open,
6035              this is a recursive call. We check to see if this is a left              this is a recursive call. We check to see if this is a left
6036              recursion that could loop for ever, and diagnose that case. */              recursion that could loop for ever, and diagnose that case. We
6037                must not, however, do this check if we are in a conditional
6038                subpattern because the condition might be testing for recursion in
6039                a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
6040                Forever loops are also detected at runtime, so those that occur in
6041                conditional subpatterns will be picked up then. */
6042    
6043              else if (GET(called, 1) == 0 &&              else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6044                       could_be_empty(called, code, bcptr, utf8, cd))                       could_be_empty(called, code, bcptr, utf8, cd))
6045                {                {
6046                *errorcodeptr = ERR40;                *errorcodeptr = ERR40;
# Line 5495  for (;; ptr++) Line 6048  for (;; ptr++)
6048                }                }
6049              }              }
6050    
6051            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item. */
           "once" brackets. Set up a "previous group" length so that a  
           subsequent quantifier will work. */  
   
           *code = OP_ONCE;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
6052    
6053            *code = OP_RECURSE;            *code = OP_RECURSE;
6054            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
6055            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
   
           *code = OP_KET;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
           length_prevgroup = 3 + 3*LINK_SIZE;  
6056            }            }
6057    
6058          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 5572  for (;; ptr++) Line 6113  for (;; ptr++)
6113          is necessary to ensure we correctly detect the start of the pattern in          is necessary to ensure we correctly detect the start of the pattern in
6114          both phases.          both phases.
6115    
6116          If we are not at the pattern start, compile code to change the ims          If we are not at the pattern start, reset the greedy defaults and the
6117          options if this setting actually changes any of them, and reset the          case value for firstbyte and reqbyte. */
         greedy defaults and the case value for firstbyte and reqbyte. */  
6118    
6119          if (*ptr == CHAR_RIGHT_PARENTHESIS)          if (*ptr == CHAR_RIGHT_PARENTHESIS)
6120            {            {
# Line 5585  for (;; ptr++) Line 6125  for (;; ptr++)
6125              }              }
6126            else            else
6127              {              {
             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))  
               {  
               *code++ = OP_OPT;  
               *code++ = newoptions & PCRE_IMS;  
               }  
6128              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6129              greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
6130              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
6131              }              }
6132    
6133            /* Change options at this level, and pass them back for use            /* Change options at this level, and pass them back for use
6134            in subsequent branches. When not at the start of the pattern, this            in subsequent branches. */
           information is also necessary so that a resetting item can be  
           compiled at the end of a group (if we are in a group). */  
6135    
6136            *optionsptr = options = newoptions;            *optionsptr = options = newoptions;
6137            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
# Line 5634  for (;; ptr++) Line 6167  for (;; ptr++)
6167        skipbytes = 2;        skipbytes = 2;
6168        }        }
6169    
6170      /* Process nested bracketed regex. Assertions may not be repeated, but      /* Process nested bracketed regex. Assertions used not to be repeatable,
6171      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a      but this was changed for Perl compatibility, so all kinds can now be
6172      non-register variable in order to be able to pass its address because some      repeated. We copy code into a non-register variable (tempcode) in order to
6173      compilers complain otherwise. Pass in a new setting for the ims options if      be able to pass its address because some compilers complain otherwise. */
     they have changed. */  
6174    
6175      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = code;                      /* For handling repetition */
6176      *code = bravalue;      *code = bravalue;
6177      tempcode = code;      tempcode = code;
6178      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;        /* Save value before bracket */
6179      length_prevgroup = 0;              /* Initialize for pre-compile phase */      tempbracount = cd->bracount;          /* Save value before bracket */
6180        length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6181    
6182      if (!compile_regex(      if (!compile_regex(
6183           newoptions,                   /* The complete new option state */           newoptions,                      /* The complete new option state */
6184           options & PCRE_IMS,           /* The previous ims option state */           &tempcode,                       /* Where to put code (updated) */
6185           &tempcode,                    /* Where to put code (updated) */           &ptr,                            /* Input pointer (updated) */
6186           &ptr,                         /* Input pointer (updated) */           errorcodeptr,                    /* Where to put an error message */
          errorcodeptr,                 /* Where to put an error message */  
6187           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
6188            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
6189           reset_bracount,               /* True if (?| group */           reset_bracount,                  /* True if (?| group */
6190           skipbytes,                    /* Skip over bracket number */           skipbytes,                       /* Skip over bracket number */
6191           &subfirstbyte,                /* For possible first char */           cond_depth +
6192           &subreqbyte,                  /* For possible last char */             ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
6193           bcptr,                        /* Current branch chain */           &subfirstbyte,                   /* For possible first char */
6194           cd,                           /* Tables block */           &subreqbyte,                     /* For possible last char */
6195           (lengthptr == NULL)? NULL :   /* Actual compile phase */           bcptr,                           /* Current branch chain */
6196             &length_prevgroup           /* Pre-compile phase */           cd,                              /* Tables block */
6197             (lengthptr == NULL)? NULL :      /* Actual compile phase */
6198               &length_prevgroup              /* Pre-compile phase */
6199           ))           ))
6200        goto FAILED;        goto FAILED;
6201    
6202        /* If this was an atomic group and there are no capturing groups within it,
6203        generate OP_ONCE_NC instead of OP_ONCE. */
6204    
6205        if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
6206          *code = OP_ONCE_NC;
6207    
6208        if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6209          cd->assert_depth -= 1;
6210    
6211      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
6212      group, while tempcode has been updated to point past the end of the group      group, while tempcode has been updated to point past the end of the group.
6213      and any option resetting that may follow it. The pattern pointer (ptr)      The pattern pointer (ptr) is on the bracket.
     is on the bracket. */  
6214    
6215      /* If this is a conditional bracket, check that there are no more than      If this is a conditional bracket, check that there are no more than
6216      two branches in the group, or just one if it's a DEFINE group. We do this      two branches in the group, or just one if it's a DEFINE group. We do this
6217      in the real compile phase, not in the pre-pass, where the whole group may      in the real compile phase, not in the pre-pass, where the whole group may
6218      not be available. */      not be available. */
6219    
6220      if (bravalue == OP_COND && lengthptr == NULL)      if (bravalue == OP_COND && lengthptr == NULL)
6221        {        {
6222        uschar *tc = code;        pcre_uchar *tc = code;
6223        int condcount = 0;        int condcount = 0;
6224    
6225        do {        do {
# Line 5735  for (;; ptr++) Line 6277  for (;; ptr++)
6277          goto FAILED;          goto FAILED;
6278          }          }
6279        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6280        *code++ = OP_BRA;        code++;   /* This already contains bravalue */
6281        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
6282        *code++ = OP_KET;        *code++ = OP_KET;
6283        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
# Line 5852  for (;; ptr++) Line 6394  for (;; ptr++)
6394    
6395        if (-c == ESC_g)        if (-c == ESC_g)
6396          {          {
6397          const uschar *p;          const pcre_uchar *p;
6398          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
6399          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6400            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
# Line 5903  for (;; ptr++) Line 6445  for (;; ptr++)
6445          }          }
6446    
6447        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6448        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax).  */
6449    
6450        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||        if (-c == ESC_k)
           ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))  
6451          {          {
6452            if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
6453              ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
6454              {
6455              *errorcodeptr = ERR69;
6456              break;
6457              }
6458          is_recurse = FALSE;          is_recurse = FALSE;
6459          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6460            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
# Line 5927  for (;; ptr++) Line 6474  for (;; ptr++)
6474          HANDLE_REFERENCE:    /* Come here from named backref handling */          HANDLE_REFERENCE:    /* Come here from named backref handling */
6475          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
6476          previous = code;          previous = code;
6477          *code++ = OP_REF;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
6478          PUT2INC(code, 0, recno);          PUT2INC(code, 0, recno);
6479          cd->backref_map |= (recno < 32)? (1 << recno) : 1;          cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6480          if (recno > cd->top_backref) cd->top_backref = recno;          if (recno > cd->top_backref) cd->top_backref = recno;
# Line 5987  for (;; ptr++) Line 6534  for (;; ptr++)
6534            }            }
6535          else          else
6536  #endif  #endif
6537            {          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6538            so that it works in DFA mode and in lookbehinds. */
6539    
6540              {
6541            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6542            *code++ = -c;            *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;
6543            }            }
6544          }          }
6545        continue;        continue;
# Line 6035  for (;; ptr++) Line 6585  for (;; ptr++)
6585    
6586      ONE_CHAR:      ONE_CHAR:
6587      previous = code;      previous = code;
6588      *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;      *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
6589      for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];      for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6590    
6591      /* Remember if \r or \n were seen */      /* Remember if \r or \n were seen */
# Line 6064  for (;; ptr++) Line 6614  for (;; ptr++)
6614        else firstbyte = reqbyte = REQ_NONE;        else firstbyte = reqbyte = REQ_NONE;
6615        }        }
6616    
6617      /* firstbyte was previously set; we can set reqbyte only the length is      /* firstbyte was previously set; we can set reqbyte only if the length is
6618      1 or the matching is caseful. */      1 or the matching is caseful. */
6619    
6620      else      else
# Line 6099  return FALSE; Line 6649  return FALSE;
6649  /* On entry, ptr is pointing past the bracket character, but on return it  /* On entry, ptr is pointing past the bracket character, but on return it
6650  points to the closing bracket, or vertical bar, or end of string. The code  points to the closing bracket, or vertical bar, or end of string. The code
6651  variable is pointing at the byte into which the BRA operator has been stored.  variable is pointing at the byte into which the BRA operator has been stored.
 If the ims options are changed at the start (for a (?ims: group) or during any  
 branch, we need to insert an OP_OPT item at the start of every following branch  
 to ensure they get set correctly at run time, and also pass the new options  
 into every subsequent branch compile.  
   
6652  This function is used during the pre-compile phase when we are trying to find  This function is used during the pre-compile phase when we are trying to find
6653  out the amount of memory needed, as well as during the real compile phase. The  out the amount of memory needed, as well as during the real compile phase. The
6654  value of lengthptr distinguishes the two phases.  value of lengthptr distinguishes the two phases.
6655    
6656  Arguments:  Arguments:
6657    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
   oldims         previous settings of ims option bits  
6658    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
6659    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
6660    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
6661    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
6662    reset_bracount TRUE to reset the count for each branch    reset_bracount TRUE to reset the count for each branch
6663    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
6664      cond_depth     depth of nesting for conditional subpatterns
6665    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
6666    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
6667    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
# Line 6128  Returns:         TRUE on success Line 6673  Returns:         TRUE on success
6673  */  */
6674    
6675  static BOOL  static BOOL
6676  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,  compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
6677    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
6678    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,    int cond_depth, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
6679    int *lengthptr)    compile_data *cd, int *lengthptr)
6680  {  {
6681  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
6682  uschar *code = *codeptr;  pcre_uchar *code = *codeptr;
6683  uschar *last_branch = code;  pcre_uchar *last_branch = code;
6684  uschar *start_bracket = code;  pcre_uchar *start_bracket = code;
6685  uschar *reverse_count = NULL;  pcre_uchar *reverse_count = NULL;
6686  open_capitem capitem;  open_capitem capitem;
6687  int capnumber = 0;  int capnumber = 0;
6688  int firstbyte, reqbyte;  int firstbyte, reqbyte;
# Line 6145  int branchfirstbyte, branchreqbyte; Line 6690  int branchfirstbyte, branchreqbyte;
6690  int length;  int length;
6691  int orig_bracount;  int orig_bracount;
6692  int max_bracount;  int max_bracount;
 int old_external_options = cd->external_options;  
6693  branch_chain bc;  branch_chain bc;
6694    
6695  bc.outer = bcptr;  bc.outer = bcptr;
# Line 6169  pre-compile phase to find out whether an Line 6713  pre-compile phase to find out whether an
6713    
6714  /* If this is a capturing subpattern, add to the chain of open capturing items  /* If this is a capturing subpattern, add to the chain of open capturing items
6715  so that we can detect them if (*ACCEPT) is encountered. This is also used to  so that we can detect them if (*ACCEPT) is encountered. This is also used to
6716  detect groups that contain recursive back references to themselves. */  detect groups that contain recursive back references to themselves. Note that
6717    only OP_CBRA need be tested here; changing this opcode to one of its variants,
6718    e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
6719    
6720  if (*code == OP_CBRA)  if (*code == OP_CBRA)
6721    {    {
# Line 6195  for (;;) Line 6741  for (;;)
6741    
6742    if (reset_bracount) cd->bracount = orig_bracount;    if (reset_bracount) cd->bracount = orig_bracount;
6743    
   /* Handle a change of ims options at the start of the branch */  
   
   if ((options & PCRE_IMS) != oldims)  
     {  
     *code++ = OP_OPT;  
     *code++ = options & PCRE_IMS;  
     length += 2;  
     }  
   
6744    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
6745    
6746    if (lookbehind)    if (lookbehind)
# Line 6218  for (;;) Line 6755  for (;;)
6755    into the length. */    into the length. */
6756    
6757    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
6758          &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))          &branchreqbyte, &bc, cond_depth, cd,
6759            (lengthptr == NULL)? NULL : &length))
6760      {      {
6761      *ptrptr = ptr;      *ptrptr = ptr;
6762      return FALSE;      return FALSE;
6763      }      }
6764    
   /* If the external options have changed during this branch, it means that we  
   are at the top level, and a leading option setting has been encountered. We  
   need to re-set the original option values to take account of this so that,  
   during the pre-compile phase, we know to allow for a re-set at the start of  
   subsequent branches. */  
   
   if (old_external_options != cd->external_options)  
     oldims = cd->external_options & PCRE_IMS;  
   
6765    /* Keep the highest bracket count in case (?| was used and some branch    /* Keep the highest bracket count in case (?| was used and some branch
6766    has fewer than the rest. */    has fewer than the rest. */
6767    
# Line 6293  for (;;) Line 6822  for (;;)
6822        {        {
6823        int fixed_length;        int fixed_length;
6824        *code = OP_END;        *code = OP_END;
6825        fixed_length = find_fixedlength(last_branch, options, FALSE, cd);        fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
6826            FALSE, cd);
6827        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
6828        if (fixed_length == -3)        if (fixed_length == -3)
6829          {          {
# Line 6301  for (;;) Line 6831  for (;;)
6831          }          }
6832        else if (fixed_length < 0)        else if (fixed_length < 0)
6833          {          {
6834          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;          *errorcodeptr = (fixed_length == -2)? ERR36 :
6835                            (fixed_length == -4)? ERR70: ERR25;
6836          *ptrptr = ptr;          *ptrptr = ptr;
6837          return FALSE;          return FALSE;
6838          }          }
# Line 6314  for (;;) Line 6845  for (;;)
6845    of offsets, with the field in the BRA item now becoming an offset to the    of offsets, with the field in the BRA item now becoming an offset to the
6846    first alternative. If there are no alternatives, it points to the end of the    first alternative. If there are no alternatives, it points to the end of the
6847    group. The length in the terminating ket is always the length of the whole    group. The length in the terminating ket is always the length of the whole
6848    bracketed item. If any of the ims options were changed inside the group,    bracketed item. Return leaving the pointer at the terminating char. */
   compile a resetting op-code following, except at the very end of the pattern.  
   Return leaving the pointer at the terminating char. */  
6849    
6850    if (*ptr != CHAR_VERTICAL_LINE)    if (*ptr != CHAR_VERTICAL_LINE)
6851      {      {
# Line 6360  for (;;) Line 6889  for (;;)
6889        cd->open_caps = cd->open_caps->next;        cd->open_caps = cd->open_caps->next;
6890        }        }
6891    
     /* Reset options if needed. */  
   
     if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)  
       {  
       *code++ = OP_OPT;  
       *code++ = oldims;  
       length += 2;  
       }  
   
6892      /* Retain the highest bracket number, in case resetting was used. */      /* Retain the highest bracket number, in case resetting was used. */
6893    
6894      cd->bracount = max_bracount;      cd->bracount = max_bracount;
# Line 6428  for (;;) Line 6948  for (;;)
6948  /* Try to find out if this is an anchored regular expression. Consider each  /* Try to find out if this is an anchored regular expression. Consider each
6949  alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket  alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6950  all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then  all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6951  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD will
6952  counts, since OP_CIRC can match in the middle.  be found, because ^ generates OP_CIRCM in that mode.
6953    
6954  We can also consider a regex to be anchored if OP_SOM starts all its branches.  We can also consider a regex to be anchored if OP_SOM starts all its branches.
6955  This is the code for \G, which means "match at start of match position, taking  This is the code for \G, which means "match at start of match position, taking
# Line 6450  of the more common cases more precisely. Line 6970  of the more common cases more precisely.
6970    
6971  Arguments:  Arguments:
6972    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
   options        points to the options setting  
6973    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
6974                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
6975                    the less precise approach                    the less precise approach
# Line 6460  Returns:     TRUE or FALSE Line 6979  Returns:     TRUE or FALSE
6979  */  */
6980    
6981  static BOOL  static BOOL
6982  is_anchored(register const uschar *code, int *options, unsigned int bracket_map,  is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
6983    unsigned int backref_map)    unsigned int backref_map)
6984  {  {
6985  do {  do {
6986     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const pcre_uchar *scode = first_significant_code(
6987       options, PCRE_MULTILINE, FALSE);       code + _pcre_OP_lengths[*code], FALSE);
6988     register int op = *scode;     register int op = *scode;
6989    
6990     /* Non-capturing brackets */     /* Non-capturing brackets */
6991    
6992     if (op == OP_BRA)     if (op == OP_BRA  || op == OP_BRAPOS ||
6993           op == OP_SBRA || op == OP_SBRAPOS)
6994       {       {
6995       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
6996       }       }
6997    
6998     /* Capturing brackets */     /* Capturing brackets */
6999    
7000     else if (op == OP_CBRA)     else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7001                op == OP_SCBRA || op == OP_SCBRAPOS)
7002       {       {
7003       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
7004       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7005       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, new_map, backref_map)) return FALSE;
7006       }       }
7007    
7008     /* Other brackets */     /* Other brackets */
7009    
7010     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||
7011                op == OP_COND)
7012       {       {
7013       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
7014       }       }
7015    
7016     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
# Line 6503  do { Line 7025  do {
7025    
7026     /* Check for explicit anchoring */     /* Check for explicit anchoring */
7027    
7028     else if (op != OP_SOD && op != OP_SOM &&     else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))  
      return FALSE;  
7029     code += GET(code, 1);     code += GET(code, 1);
7030     }     }
7031  while (*code == OP_ALT);   /* Loop for each alternative */  while (*code == OP_ALT);   /* Loop for each alternative */
# Line 6536  Returns:         TRUE or FALSE Line 7056  Returns:         TRUE or FALSE
7056  */  */
7057    
7058  static BOOL  static BOOL
7059  is_startline(const uschar *code, unsigned int bracket_map,  is_startline(const pcre_uchar *code, unsigned int bracket_map,
7060    unsigned int backref_map)    unsigned int backref_map)
7061  {  {
7062  do {  do {
7063     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const pcre_uchar *scode = first_significant_code(
7064       NULL, 0, FALSE);       code + _pcre_OP_lengths[*code], FALSE);
7065     register int op = *scode;     register int op = *scode;
7066    
7067     /* If we are at the start of a conditional assertion group, *both* the     /* If we are at the start of a conditional assertion group, *both* the
# Line 6568  do { Line 7088  do {
7088         scode += 1 + LINK_SIZE;         scode += 1 + LINK_SIZE;
7089         break;         break;
7090         }         }
7091       scode = first_significant_code(scode, NULL, 0, FALSE);       scode = first_significant_code(scode, FALSE);
7092       op = *scode;       op = *scode;
7093       }       }
7094    
7095     /* Non-capturing brackets */     /* Non-capturing brackets */
7096    
7097     if (op == OP_BRA)     if (op == OP_BRA  || op == OP_BRAPOS ||
7098           op == OP_SBRA || op == OP_SBRAPOS)
7099       {       {
7100       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7101       }       }
7102    
7103     /* Capturing brackets */     /* Capturing brackets */
7104    
7105     else if (op == OP_CBRA)     else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7106                op == OP_SCBRA || op == OP_SCBRAPOS)
7107       {       {
7108       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
7109       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
# Line 6590  do { Line 7112  do {
7112    
7113     /* Other brackets */     /* Other brackets */
7114    
7115     else if (op == OP_ASSERT || op == OP_ONCE)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)
7116       {       {
7117       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7118       }       }
# Line 6605  do { Line 7127  do {
7127    
7128     /* Check for explicit circumflex */     /* Check for explicit circumflex */
7129    
7130     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
7131    
7132     /* Move on to the next alternative */     /* Move on to the next alternative */
7133    
# Line 6631  we return that char, otherwise -1. Line 7153  we return that char, otherwise -1.
7153    
7154  Arguments:  Arguments:
7155    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
   options    pointer to the options (used to check casing changes)  
7156    inassert   TRUE if in an assertion    inassert   TRUE if in an assertion
7157    
7158  Returns:     -1 or the fixed first char  Returns:     -1 or the fixed first char
7159  */  */
7160    
7161  static int  static int
7162  find_firstassertedchar(const uschar *code, int *options, BOOL inassert)  find_firstassertedchar(const pcre_uchar *code, BOOL inassert)
7163  {  {
7164  register int c = -1;  register int c = -1;
7165  do {  do {
7166     int d;     int d;
7167     const uschar *scode =     int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
7168       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);               *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? 2:0;
7169       const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
7170         TRUE);
7171     register int op = *scode;     register int op = *scode;
7172    
7173     switch(op)     switch(op)
# Line 6653  do { Line 7176  do {
7176       return -1;       return -1;
7177    
7178       case OP_BRA:       case OP_BRA:
7179         case OP_BRAPOS:
7180       case OP_CBRA:       case OP_CBRA:
7181         case OP_SCBRA:
7182         case OP_CBRAPOS:
7183         case OP_SCBRAPOS:
7184       case OP_ASSERT:       case OP_ASSERT:
7185       case OP_ONCE:       case OP_ONCE:
7186         case OP_ONCE_NC:
7187       case OP_COND:       case OP_COND:
7188       if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)       if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
7189         return -1;         return -1;
7190       if (c < 0) c = d; else if (c != d) return -1;       if (c < 0) c = d; else if (c != d) return -1;
7191       break;       break;
7192    
7193       case OP_EXACT:       /* Fall through */       case OP_EXACT:
7194       scode += 2;       scode += 2;
7195         /* Fall through */
7196    
7197       case OP_CHAR:       case OP_CHAR:
      case OP_CHARNC:  
7198       case OP_PLUS:       case OP_PLUS:
7199       case OP_MINPLUS:       case OP_MINPLUS:
7200       case OP_POSPLUS:       case OP_POSPLUS:
7201       if (!inassert) return -1;       if (!inassert) return -1;
7202       if (c < 0)       if (c < 0) c = scode[1];
7203         {         else if (c != scode[1]) return -1;
7204         c = scode[1];       break;
7205         if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;  
7206         }       case OP_EXACTI:
7207       else if (c != scode[1]) return -1;       scode += 2;
7208         /* Fall through */
7209    
7210         case OP_CHARI:
7211         case OP_PLUSI:
7212         case OP_MINPLUSI:
7213         case OP_POSPLUSI:
7214         if (!inassert) return -1;
7215         if (c < 0) c = scode[1] | REQ_CASELESS;
7216           else if (c != scode[1]) return -1;
7217       break;       break;
7218       }       }
7219    
# Line 6710  Returns:        pointer to compiled data Line 7247  Returns:        pointer to compiled data
7247                  with errorptr and erroroffset set                  with errorptr and erroroffset set
7248  */  */
7249    
7250    #ifndef COMPILE_PCRE16
7251  PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION  PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7252  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
7253    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
7254    #else
7255    PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7256    pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
7257      int *erroroffset, const unsigned char *tables)
7258    #endif
7259  {  {
7260    #ifndef COMPILE_PCRE16
7261  return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);  return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
7262    #else
7263    return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
7264    #endif
7265  }  }
7266    
7267    
7268    #ifndef COMPILE_PCRE16
7269  PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION  PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7270  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
7271    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
7272    #else
7273    PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7274    pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
7275      const char **errorptr, int *erroroffset, const unsigned char *tables)
7276    #endif
7277  {  {
7278  real_pcre *re;  real_pcre *re;
7279  int length = 1;  /* For final END opcode */  int length = 1;  /* For final END opcode */
# Line 6729  int errorcode = 0; Line 7282  int errorcode = 0;
7282  int skipatstart = 0;  int skipatstart = 0;
7283  BOOL utf8;  BOOL utf8;
7284  size_t size;  size_t size;
7285  uschar *code;  pcre_uchar *code;
7286  const uschar *codestart;  const pcre_uchar *codestart;
7287  const uschar *ptr;  const pcre_uchar *ptr;
7288  compile_data compile_block;  compile_data compile_block;
7289  compile_data *cd = &compile_block;  compile_data *cd = &compile_block;
7290    
# Line 6741  as soon as possible, so that a fairly la Line 7294  as soon as possible, so that a fairly la
7294  this purpose. The same space is used in the second phase for remembering where  this purpose. The same space is used in the second phase for remembering where
7295  to fill in forward references to subpatterns. */  to fill in forward references to subpatterns. */
7296    
7297  uschar cworkspace[COMPILE_WORK_SIZE];  pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7298    
7299  /* Set this early so that early errors get offset 0. */  /* Set this early so that early errors get offset 0. */
7300    
7301  ptr = (const uschar *)pattern;  ptr = (const pcre_uchar *)pattern;
7302    
7303  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
7304  can do is just return NULL, but we can set a code value if there is a code  can do is just return NULL, but we can set a code value if there is a code
# Line 6795  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 7348  while (ptr[skipatstart] == CHAR_LEFT_PAR
7348    int newnl = 0;    int newnl = 0;
7349    int newbsr = 0;    int newbsr = 0;
7350    
7351    if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)    if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
7352      { skipatstart += 7; options |= PCRE_UTF8; continue; }      { skipatstart += 7; options |= PCRE_UTF8; continue; }
7353    else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
7354      { skipatstart += 6; options |= PCRE_UCP; continue; }      { skipatstart += 6; options |= PCRE_UCP; continue; }
7355      else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
7356        { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
7357    
7358    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)    if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
7359      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
7360    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
7361      { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }      { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
7362    else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
7363      { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }      { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
7364    else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
7365      { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }      { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
7366    else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
7367      { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }      { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
7368    
7369    else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
7370      { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }      { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
7371    else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
7372      { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }      { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
7373    
7374    if (newnl != 0)    if (newnl != 0)
# Line 6825  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 7380  while (ptr[skipatstart] == CHAR_LEFT_PAR
7380    
7381  utf8 = (options & PCRE_UTF8) != 0;  utf8 = (options & PCRE_UTF8) != 0;
7382    
7383  /* Can't support UTF8 unless PCRE has been compiled to include the code. */  /* Can't support UTF8 unless PCRE has been compiled to include the code. The
7384    return of an error code from _pcre_valid_utf8() is a new feature, introduced in
7385    release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7386    not used here. */
7387    
7388  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
7389  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7390       (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)       (errorcode = _pcre_valid_utf8((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
7391    {    {
7392    errorcode = ERR44;    errorcode = ERR44;
7393    goto PCRE_EARLY_ERROR_RETURN2;    goto PCRE_EARLY_ERROR_RETURN2;
# Line 6854  if ((options & PCRE_UCP) != 0) Line 7412  if ((options & PCRE_UCP) != 0)
7412    
7413  /* Check validity of \R options. */  /* Check validity of \R options. */
7414    
7415  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))  if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
7416         (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
7417    {    {
7418    case 0:    errorcode = ERR56;
7419    case PCRE_BSR_ANYCRLF:    goto PCRE_EARLY_ERROR_RETURN;
   case PCRE_BSR_UNICODE:  
   break;  
   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;  
7420    }    }
7421    
7422  /* Handle different types of newline. The three bits give seven cases. The  /* Handle different types of newline. The three bits give seven cases. The
# Line 6929  cd->name_table = NULL; Line 7485  cd->name_table = NULL;
7485  cd->start_workspace = cworkspace;  cd->start_workspace = cworkspace;
7486  cd->start_code = cworkspace;  cd->start_code = cworkspace;
7487  cd->hwm = cworkspace;  cd->hwm = cworkspace;
7488  cd->start_pattern = (const uschar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
7489  cd->end_pattern = (const uschar *)(pattern + strlen(pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC(pattern));
7490  cd->req_varyopt = 0;  cd->req_varyopt = 0;
7491  cd->external_options = options;  cd->external_options = options;
7492  cd->external_flags = 0;  cd->external_flags = 0;
# Line 6945  outside can help speed up starting point Line 7501  outside can help speed up starting point
7501  ptr += skipatstart;  ptr += skipatstart;
7502  code = cworkspace;  code = cworkspace;
7503  *code = OP_BRA;  *code = OP_BRA;
7504  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
7505    &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,    FALSE, 0, 0, &firstbyte, &reqbyte, NULL, cd, &length);
   &length);  
7506  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
7507    
7508  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
# Line 7001  field; this time it's used for rememberi Line 7556  field; this time it's used for rememberi
7556  */  */
7557    
7558  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7559    cd->assert_depth = 0;
7560  cd->bracount = 0;  cd->bracount = 0;
7561  cd->names_found = 0;  cd->names_found = 0;
7562  cd->name_table = (uschar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7563  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
7564  cd->start_code = codestart;  cd->start_code = codestart;
7565  cd->hwm = cworkspace;  cd->hwm = cworkspace;
# Line 7016  cd->open_caps = NULL; Line 7572  cd->open_caps = NULL;
7572  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
7573  of the function here. */  of the function here. */
7574    
7575  ptr = (const uschar *)pattern + skipatstart;  ptr = (const pcre_uchar *)pattern + skipatstart;
7576  code = (uschar *)codestart;  code = (pcre_uchar *)codestart;
7577  *code = OP_BRA;  *code = OP_BRA;
7578  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,  (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
7579    &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);    &firstbyte, &reqbyte, NULL, cd, NULL);
7580  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
7581  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
7582  re->flags = cd->external_flags;  re->flags = cd->external_flags;
7583    
7584  if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqbyte = REQ_NONE;   /* Must disable after (*ACCEPT) */
7585    
7586  /* If not reached end of pattern on success, there's an excess bracket. */  /* If not reached end of pattern on success, there's an excess bracket. */
7587    
# Line 7045  if (code - codestart > length) errorcode Line 7601  if (code - codestart > length) errorcode
7601  while (errorcode == 0 && cd->hwm > cworkspace)  while (errorcode == 0 && cd->hwm > cworkspace)
7602    {    {
7603    int offset, recno;    int offset, recno;
7604    const uschar *groupptr;    const pcre_uchar *groupptr;
7605    cd->hwm -= LINK_SIZE;    cd->hwm -= LINK_SIZE;
7606    offset = GET(cd->hwm, 0);    offset = GET(cd->hwm, 0);
7607    recno = GET(codestart, offset);    recno = GET(codestart, offset);
7608    groupptr = _pcre_find_bracket(codestart, utf8, recno);    groupptr = _pcre_find_bracket(codestart, utf8, recno);
7609    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
7610      else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));      else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
7611    }    }
7612    
7613  /* Give an error if there's back reference to a non-existent capturing  /* Give an error if there's back reference to a non-existent capturing
# Line 7069  length, and set their lengths. */ Line 7625  length, and set their lengths. */
7625    
7626  if (cd->check_lookbehind)  if (cd->check_lookbehind)
7627    {    {
7628    uschar *cc = (uschar *)codestart;    pcre_uchar *cc = (pcre_uchar *)codestart;
7629    
7630    /* Loop, searching for OP_REVERSE items, and process those that do not have    /* Loop, searching for OP_REVERSE items, and process those that do not have
7631    their length set. (Actually, it will also re-process any that have a length    their length set. (Actually, it will also re-process any that have a length
7632    of zero, but that is a pathological case, and it does no harm.) When we find    of zero, but that is a pathological case, and it does no harm.) When we find
7633    one, we temporarily terminate the branch it is in while we scan it. */    one, we temporarily terminate the branch it is in while we scan it. */
7634    
7635    for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);    for (cc = (pcre_uchar *)_pcre_find_bracket(codestart, utf8, -1);
7636         cc != NULL;         cc != NULL;
7637         cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))         cc = (pcre_uchar *)_pcre_find_bracket(cc, utf8, -1))
7638      {      {
7639      if (GET(cc, 1) == 0)      if (GET(cc, 1) == 0)
7640        {        {
7641        int fixed_length;        int fixed_length;
7642        uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);        pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
7643        int end_op = *be;        int end_op = *be;
7644        *be = OP_END;        *be = OP_END;
7645        fixed_length = find_fixedlength(cc, re->options, TRUE, cd);        fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
7646            cd);
7647        *be = end_op;        *be = end_op;
7648        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
7649        if (fixed_length < 0)        if (fixed_length < 0)
7650          {          {
7651          errorcode = (fixed_length == -2)? ERR36 : ERR25;          errorcode = (fixed_length == -2)? ERR36 :
7652                        (fixed_length == -4)? ERR70 : ERR25;
7653          break;          break;
7654          }          }
7655        PUT(cc, 1, fixed_length);        PUT(cc, 1, fixed_length);
# Line 7106  if (errorcode != 0) Line 7664  if (errorcode != 0)
7664    {    {
7665    (pcre_free)(re);    (pcre_free)(re);
7666    PCRE_EARLY_ERROR_RETURN:    PCRE_EARLY_ERROR_RETURN:
7667    *erroroffset = (int)(ptr - (const uschar *)pattern);    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
7668    PCRE_EARLY_ERROR_RETURN2:    PCRE_EARLY_ERROR_RETURN2:
7669    *errorptr = find_error_text(errorcode);    *errorptr = find_error_text(errorcode);
7670    if (errorcodeptr != NULL) *errorcodeptr = errorcode;    if (errorcodeptr != NULL) *errorcodeptr = errorcode;
# Line 7125  start with ^. and also when all branches Line 7683  start with ^. and also when all branches
7683    
7684  if ((re->options & PCRE_ANCHORED) == 0)  if ((re->options & PCRE_ANCHORED) == 0)
7685    {    {
7686    int temp_options = re->options;   /* May get changed during these scans */    if (is_anchored(codestart, 0, cd->backref_map))
   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))  
7687      re->options |= PCRE_ANCHORED;      re->options |= PCRE_ANCHORED;
7688    else    else
7689      {      {
7690      if (firstbyte < 0)      if (firstbyte < 0)
7691        firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);        firstbyte = find_firstassertedchar(codestart, FALSE);
7692      if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */      if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
7693        {        {
7694        int ch = firstbyte & 255;        int ch = firstbyte & 255;
# Line 7193  if (code - codestart > length) Line 7750  if (code - codestart > length)
7750    {    {
7751    (pcre_free)(re);    (pcre_free)(re);
7752    *errorptr = find_error_text(ERR23);    *errorptr = find_error_text(ERR23);
7753    *erroroffset = ptr - (uschar *)pattern;    *erroroffset = ptr - (pcre_uchar *)pattern;
7754    if (errorcodeptr != NULL) *errorcodeptr = ERR23;    if (errorcodeptr != NULL) *errorcodeptr = ERR23;
7755    return NULL;    return NULL;
7756    }    }

Legend:
Removed from v.545  
changed lines
  Added in v.763

  ViewVC Help
Powered by ViewVC 1.1.5