/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 1364 by ph10, Sat Oct 5 15:45:11 2013 UTC | revision 1611 by ph10, Thu Nov 26 20:29:13 2015 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2013 University of Cambridge             Copyright (c) 1997-2014 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 47  supporting internal functions that are n Line 47  supporting internal functions that are n
47  #endif  #endif
48    
49  #define NLBLOCK cd             /* Block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
50  #define PSSTART start_pattern  /* Field containing processed string start */  #define PSSTART start_pattern  /* Field containing pattern start */
51  #define PSEND   end_pattern    /* Field containing processed string end */  #define PSEND   end_pattern    /* Field containing pattern end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
# Line 174  static const short int escapes[] = { Line 174  static const short int escapes[] = {
174       -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,       -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
175       CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,       CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
176       CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,       CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
177       CHAR_GRAVE_ACCENT,       7,       CHAR_GRAVE_ACCENT,       ESC_a,
178       -ESC_b,                  0,       -ESC_b,                  0,
179       -ESC_d,                  ESC_e,       -ESC_d,                  ESC_e,
180       ESC_f,                   0,       ESC_f,                   0,
# Line 202  static const short int escapes[] = { Line 202  static const short int escapes[] = {
202  /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',  /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
203  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
204  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
205  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
206  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
207  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
208  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
209  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
210  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
# Line 219  static const short int escapes[] = { Line 219  static const short int escapes[] = {
219  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
220  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
221  };  };
222    
223    /* We also need a table of characters that may follow \c in an EBCDIC
224    environment for characters 0-31. */
225    
226    static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
227    
228  #endif  #endif
229    
230    
# Line 260  static const verbitem verbs[] = { Line 266  static const verbitem verbs[] = {
266  static const int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
267    
268    
269    /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
270    another regex library. */
271    
272    static const pcre_uchar sub_start_of_word[] = {
273      CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
274      CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
275    
276    static const pcre_uchar sub_end_of_word[] = {
277      CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
278      CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
279      CHAR_RIGHT_PARENTHESIS, '\0' };
280    
281    
282  /* Tables of names of POSIX character classes and their lengths. The names are  /* Tables of names of POSIX character classes and their lengths. The names are
283  now all in a single string, to reduce the number of relocations when a shared  now all in a single string, to reduce the number of relocations when a shared
284  library is dynamically loaded. The list of lengths is terminated by a zero  library is dynamically loaded. The list of lengths is terminated by a zero
285  length entry. The first three must be alpha, lower, upper, as this is assumed  length entry. The first three must be alpha, lower, upper, as this is assumed
286  for handling case independence. */  for handling case independence. The indices for graph, print, and punct are
287    needed, so identify them. */
288    
289  static const char posix_names[] =  static const char posix_names[] =
290    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
# Line 275  static const char posix_names[] = Line 295  static const char posix_names[] =
295  static const pcre_uint8 posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
296    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
297    
298    #define PC_GRAPH  8
299    #define PC_PRINT  9
300    #define PC_PUNCT 10
301    
302    
303  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
304  base map, with an optional addition or removal of another map. Then, for some  base map, with an optional addition or removal of another map. Then, for some
305  classes, there is some additional tweaking: for [:blank:] the vertical space  classes, there is some additional tweaking: for [:blank:] the vertical space
# Line 302  static const int posix_class_maps[] = { Line 327  static const int posix_class_maps[] = {
327    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
328  };  };
329    
330  /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class  /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
331  substitutes must be in the order of the names, defined above, and there are  Unicode property escapes. */
 both positive and negative cases. NULL means no substitute. */  
332    
333  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
334  static const pcre_uchar string_PNd[]  = {  static const pcre_uchar string_PNd[]  = {
# Line 329  static const pcre_uchar string_pXwd[] = Line 353  static const pcre_uchar string_pXwd[] =
353  static const pcre_uchar *substitutes[] = {  static const pcre_uchar *substitutes[] = {
354    string_PNd,           /* \D */    string_PNd,           /* \D */
355    string_pNd,           /* \d */    string_pNd,           /* \d */
356    string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */    string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
357    string_pXsp,          /* \s */    string_pXsp,          /* \s */   /* space and POSIX space are the same. */
358    string_PXwd,          /* \W */    string_PXwd,          /* \W */
359    string_pXwd           /* \w */    string_pXwd           /* \w */
360  };  };
361    
362    /* The POSIX class substitutes must be in the order of the POSIX class names,
363    defined above, and there are both positive and negative cases. NULL means no
364    general substitute of a Unicode property escape (\p or \P). However, for some
365    POSIX classes (e.g. graph, print, punct) a special property code is compiled
366    directly. */
367    
368  static const pcre_uchar string_pL[] =   {  static const pcre_uchar string_pL[] =   {
369    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
370    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
# Line 382  static const pcre_uchar *posix_substitut Line 412  static const pcre_uchar *posix_substitut
412    NULL,                 /* graph */    NULL,                 /* graph */
413    NULL,                 /* print */    NULL,                 /* print */
414    NULL,                 /* punct */    NULL,                 /* punct */
415    string_pXps,          /* space */    /* NOTE: Xps is POSIX space */    string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
416    string_pXwd,          /* word */    string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
417    NULL,                 /* xdigit */    NULL,                 /* xdigit */
418    /* Negated cases */    /* Negated cases */
419    string_PL,            /* ^alpha */    string_PL,            /* ^alpha */
# Line 397  static const pcre_uchar *posix_substitut Line 427  static const pcre_uchar *posix_substitut
427    NULL,                 /* ^graph */    NULL,                 /* ^graph */
428    NULL,                 /* ^print */    NULL,                 /* ^print */
429    NULL,                 /* ^punct */    NULL,                 /* ^punct */
430    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
431    string_PXwd,          /* ^word */    string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
432    NULL                  /* ^xdigit */    NULL                  /* ^xdigit */
433  };  };
434  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
# Line 434  static const char error_texts[] = Line 464  static const char error_texts[] =
464    "range out of order in character class\0"    "range out of order in character class\0"
465    "nothing to repeat\0"    "nothing to repeat\0"
466    /* 10 */    /* 10 */
467    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/    "internal error: invalid forward reference offset\0"
468    "internal error: unexpected repeat\0"    "internal error: unexpected repeat\0"
469    "unrecognized character after (? or (?-\0"    "unrecognized character after (? or (?-\0"
470    "POSIX named classes are supported only within a class\0"    "POSIX named classes are supported only within a class\0"
# Line 462  static const char error_texts[] = Line 492  static const char error_texts[] =
492    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
493    "this version of PCRE is compiled without UTF support\0"    "this version of PCRE is compiled without UTF support\0"
494    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
495    "character value in \\x{...} sequence is too large\0"    "character value in \\x{} or \\o{} is too large\0"
496    /* 35 */    /* 35 */
497    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
498    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
# Line 503  static const char error_texts[] = Line 533  static const char error_texts[] =
533    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
534    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
535    "this version of PCRE is not compiled with Unicode property support\0"    "this version of PCRE is not compiled with Unicode property support\0"
536    #ifndef EBCDIC
537    "\\c must be followed by an ASCII character\0"    "\\c must be followed by an ASCII character\0"
538    #else
539      "\\c must be followed by a letter or one of [\\]^_?\0"
540    #endif
541    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
542    /* 70 */    /* 70 */
543    "internal error: unknown opcode in find_fixedlength()\0"    "internal error: unknown opcode in find_fixedlength()\0"
# Line 516  static const char error_texts[] = Line 550  static const char error_texts[] =
550    "character value in \\u.... sequence is too large\0"    "character value in \\u.... sequence is too large\0"
551    "invalid UTF-32 string\0"    "invalid UTF-32 string\0"
552    "setting UTF is disabled by the application\0"    "setting UTF is disabled by the application\0"
553      "non-hex character in \\x{} (closing brace missing?)\0"
554      /* 80 */
555      "non-octal character in \\o{} (closing brace missing?)\0"
556      "missing opening brace after \\o\0"
557      "parentheses are too deeply nested\0"
558      "invalid range in character class\0"
559      "group name must start with a non-digit\0"
560      /* 85 */
561      "parentheses are too deeply nested (stack check)\0"
562      "digits missing in \\x{} or \\o{}\0"
563    ;    ;
564    
565  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 773  static const pcre_uint8 posspropstab[3][ Line 817  static const pcre_uint8 posspropstab[3][
817    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
818  };  };
819    
820    /* This table is used when converting repeating opcodes into possessified
821    versions as a result of an explicit possessive quantifier such as ++. A zero
822    value means there is no possessified version - in those cases the item in
823    question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
824    because all relevant opcodes are less than that. */
825    
826    static const pcre_uint8 opcode_possessify[] = {
827      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
828      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
829    
830      0,                       /* NOTI */
831      OP_POSSTAR, 0,           /* STAR, MINSTAR */
832      OP_POSPLUS, 0,           /* PLUS, MINPLUS */
833      OP_POSQUERY, 0,          /* QUERY, MINQUERY */
834      OP_POSUPTO, 0,           /* UPTO, MINUPTO */
835      0,                       /* EXACT */
836      0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
837    
838      OP_POSSTARI, 0,          /* STARI, MINSTARI */
839      OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
840      OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
841      OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
842      0,                       /* EXACTI */
843      0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
844    
845      OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
846      OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
847      OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
848      OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
849      0,                       /* NOTEXACT */
850      0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
851    
852      OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
853      OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
854      OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
855      OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
856      0,                       /* NOTEXACTI */
857      0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
858    
859      OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
860      OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
861      OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
862      OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
863      0,                       /* TYPEEXACT */
864      0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
865    
866      OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
867      OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
868      OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
869      OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
870      0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
871    
872      0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
873      0, 0,                    /* REF, REFI */
874      0, 0,                    /* DNREF, DNREFI */
875      0, 0                     /* RECURSE, CALLOUT */
876    };
877    
878    
879    
880  /*************************************************  /*************************************************
# Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 981  return (*p == CHAR_RIGHT_CURLY_BRACKET);
981  *************************************************/  *************************************************/
982    
983  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
984  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
985  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
986  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
987  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
988  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
989    
990  Arguments:  Arguments:
991    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
992    chptr          points to the data character    chptr          points to a returned data character
993    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
994    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
995    options        the options bits    options        the options bits
# Line 1092  else Line 1193  else
1193      break;      break;
1194    
1195      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1196      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1197      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1198        recommended to avoid the ambiguities in the old syntax.
1199    
1200      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1201      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1202      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1203      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1204      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1205      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1206      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1207    
1208        Inside a character class, \ followed by a digit is always either a literal
1209        8 or 9 or an octal number. */
1210    
1211      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1212      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1128  else Line 1233  else
1233          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1234          break;          break;
1235          }          }
1236        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1237          {          {
1238          escape = -s;          escape = -s;
1239          break;          break;
# Line 1136  else Line 1241  else
1241        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1242        }        }
1243    
1244      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1245      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1246      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1247        changed so as not to insert the binary zero. */
1248    
1249      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1250        {  
1251        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1252    
1253      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1254      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1162  else Line 1265  else
1265  #endif  #endif
1266      break;      break;
1267    
1268      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1269      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.      specifying character codes in octal. The only supported form is \o{ddd}. */
1270      If not, { is treated as a data character. */  
1271        case CHAR_o:
1272        if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1273        if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1274          {
1275          ptr += 2;
1276          c = 0;
1277          overflow = FALSE;
1278          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1279            {
1280            register pcre_uint32 cc = *ptr++;
1281            if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1282    #ifdef COMPILE_PCRE32
1283            if (c >= 0x20000000l) { overflow = TRUE; break; }
1284    #endif
1285            c = (c << 3) + cc - CHAR_0 ;
1286    #if defined COMPILE_PCRE8
1287            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1288    #elif defined COMPILE_PCRE16
1289            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1290    #elif defined COMPILE_PCRE32
1291            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1292    #endif
1293            }
1294          if (overflow)
1295            {
1296            while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1297            *errorcodeptr = ERR34;
1298            }
1299          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1300            {
1301            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1302            }
1303          else *errorcodeptr = ERR80;
1304          }
1305        break;
1306    
1307        /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1308        numbers. Otherwise it is a lowercase x letter. */
1309    
1310      case CHAR_x:      case CHAR_x:
1311      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1312        {        {
       /* In JavaScript, \x must be followed by two hexadecimal numbers.  
       Otherwise it is a lowercase x letter. */  
1313        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1314          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1315          {          {
# Line 1187  else Line 1326  else
1326  #endif  #endif
1327            }            }
1328          }          }
1329        break;        }    /* End JavaScript handling */
       }  
1330    
1331      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1332        {      greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1333        const pcre_uchar *pt = ptr + 2;      digits. If not, { used to be treated as a data character. However, Perl
1334        seems to read hex digits up to the first non-such, and ignore the rest, so
1335        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1336        now gives an error. */
1337    
1338        c = 0;      else
1339        overflow = FALSE;        {
1340        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1341          {          {
1342          register pcre_uint32 cc = *pt++;          ptr += 2;
1343          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1344              {
1345              *errorcodeptr = ERR86;
1346              break;
1347              }
1348            c = 0;
1349            overflow = FALSE;
1350            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1351              {
1352              register pcre_uint32 cc = *ptr++;
1353              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1354    
1355  #ifdef COMPILE_PCRE32  #ifdef COMPILE_PCRE32
1356          if (c >= 0x10000000l) { overflow = TRUE; break; }            if (c >= 0x10000000l) { overflow = TRUE; break; }
1357  #endif  #endif
1358    
1359  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1360          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1361          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1362  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1363          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1364          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1365  #endif  #endif
1366    
1367  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
1368          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1369  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
1370          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1371  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
1372          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1373  #endif  #endif
1374          }            }
1375    
1376        if (overflow)          if (overflow)
1377          {            {
1378          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1379          *errorcodeptr = ERR34;            *errorcodeptr = ERR34;
1380          }            }
1381    
1382        if (*pt == CHAR_RIGHT_CURLY_BRACKET)          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1383          {            {
1384          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1385          ptr = pt;            }
         break;  
         }  
1386    
1387        /* If the sequence of hex digits does not end with '}', then we don't          /* If the sequence of hex digits does not end with '}', give an error.
1388        recognize this construct; fall through to the normal \x handling. */          We used just to recognize this construct and fall through to the normal
1389        }          \x handling, but nowadays Perl gives an error, which seems much more
1390            sensible, so we do too. */
1391    
1392      /* Read just a single-byte hex-defined char */          else *errorcodeptr = ERR79;
1393            }   /* End of \x{} processing */
1394    
1395      c = 0;        /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1396      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)  
1397        {        else
1398        pcre_uint32 cc;                          /* Some compilers don't like */          {
1399        cc = *(++ptr);                           /* ++ in initializers */          c = 0;
1400            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1401              {
1402              pcre_uint32 cc;                          /* Some compilers don't like */
1403              cc = *(++ptr);                           /* ++ in initializers */
1404  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1405        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1406        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1407  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1408        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */            if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1409        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1410  #endif  #endif
1411        }            }
1412            }     /* End of \xdd handling */
1413          }       /* End of Perl-style \x handling */
1414      break;      break;
1415    
1416      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
# Line 1278  else Line 1435  else
1435      c ^= 0x40;      c ^= 0x40;
1436  #else             /* EBCDIC coding */  #else             /* EBCDIC coding */
1437      if (c >= CHAR_a && c <= CHAR_z) c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1438      c ^= 0xC0;      if (c == CHAR_QUESTION_MARK)
1439          c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1440        else
1441          {
1442          for (i = 0; i < 32; i++)
1443            {
1444            if (c == ebcdic_escape_c[i]) break;
1445            }
1446          if (i < 32) c = i; else *errorcodeptr = ERR68;
1447          }
1448  #endif  #endif
1449      break;      break;
1450    
# Line 1443  read_repeat_counts(const pcre_uchar *p, Line 1609  read_repeat_counts(const pcre_uchar *p,
1609  int min = 0;  int min = 0;
1610  int max = -1;  int max = -1;
1611    
1612  /* Read the minimum value and do a paranoid check: a negative value indicates  while (IS_DIGIT(*p))
 an integer overflow. */  
   
 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);  
 if (min < 0 || min > 65535)  
1613    {    {
1614    *errorcodeptr = ERR5;    min = min * 10 + (int)(*p++ - CHAR_0);
1615    return p;    if (min > 65535)
1616        {
1617        *errorcodeptr = ERR5;
1618        return p;
1619        }
1620    }    }
1621    
 /* Read the maximum value if there is one, and again do a paranoid on its size.  
 Also, max must not be less than min. */  
   
1622  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1623    {    {
1624    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1625      {      {
1626      max = 0;      max = 0;
1627      while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);      while(IS_DIGIT(*p))
     if (max < 0 || max > 65535)  
1628        {        {
1629        *errorcodeptr = ERR5;        max = max * 10 + (int)(*p++ - CHAR_0);
1630        return p;        if (max > 65535)
1631            {
1632            *errorcodeptr = ERR5;
1633            return p;
1634            }
1635        }        }
1636      if (max < min)      if (max < min)
1637        {        {
# Line 1475  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1641  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1641      }      }
1642    }    }
1643    
 /* Fill in the required variables, and pass back the pointer to the terminating  
 '}'. */  
   
1644  *minp = min;  *minp = min;
1645  *maxp = max;  *maxp = max;
1646  return p;  return p;
# Line 1524  for (;;) Line 1687  for (;;)
1687    
1688      case OP_CALLOUT:      case OP_CALLOUT:
1689      case OP_CREF:      case OP_CREF:
1690      case OP_NCREF:      case OP_DNCREF:
1691      case OP_RREF:      case OP_RREF:
1692      case OP_NRREF:      case OP_DNRREF:
1693      case OP_DEF:      case OP_DEF:
1694      code += PRIV(OP_lengths)[*code];      code += PRIV(OP_lengths)[*code];
1695      break;      break;
# Line 1560  Arguments: Line 1723  Arguments:
1723    utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode    utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1724    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1725    cd       the "compile data" structure    cd       the "compile data" structure
1726      recurses    chain of recurse_check to catch mutual recursion
1727    
1728  Returns:   the fixed length,  Returns:   the fixed length,
1729               or -1 if there is no fixed length,               or -1 if there is no fixed length,
# Line 1569  Returns:   the fixed length, Line 1733  Returns:   the fixed length,
1733  */  */
1734    
1735  static int  static int
1736  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1737      recurse_check *recurses)
1738  {  {
1739  int length = -1;  int length = -1;
1740    recurse_check this_recurse;
1741  register int branchlength = 0;  register int branchlength = 0;
1742  register pcre_uchar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1743    
# Line 1597  for (;;) Line 1762  for (;;)
1762      case OP_ONCE:      case OP_ONCE:
1763      case OP_ONCE_NC:      case OP_ONCE_NC:
1764      case OP_COND:      case OP_COND:
1765      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1766          recurses);
1767      if (d < 0) return d;      if (d < 0) return d;
1768      branchlength += d;      branchlength += d;
1769      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1631  for (;;) Line 1797  for (;;)
1797      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1798      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1799      if (cc > cs && cc < ce) return -1;                    /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1800      d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);      else   /* Check for mutual recursion */
1801          {
1802          recurse_check *r = recurses;
1803          for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1804          if (r != NULL) return -1;   /* Mutual recursion */
1805          }
1806        this_recurse.prev = recurses;
1807        this_recurse.group = cs;
1808        d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1809      if (d < 0) return d;      if (d < 0) return d;
1810      branchlength += d;      branchlength += d;
1811      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1644  for (;;) Line 1818  for (;;)
1818      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1819      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1820      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1821      cc += PRIV(OP_lengths)[*cc];      cc += 1 + LINK_SIZE;
1822      break;      break;
1823    
1824      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
# Line 1663  for (;;) Line 1837  for (;;)
1837      case OP_COMMIT:      case OP_COMMIT:
1838      case OP_CREF:      case OP_CREF:
1839      case OP_DEF:      case OP_DEF:
1840        case OP_DNCREF:
1841        case OP_DNRREF:
1842      case OP_DOLL:      case OP_DOLL:
1843      case OP_DOLLM:      case OP_DOLLM:
1844      case OP_EOD:      case OP_EOD:
1845      case OP_EODN:      case OP_EODN:
1846      case OP_FAIL:      case OP_FAIL:
     case OP_NCREF:  
     case OP_NRREF:  
1847      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1848      case OP_PRUNE:      case OP_PRUNE:
1849      case OP_REVERSE:      case OP_REVERSE:
# Line 1764  for (;;) Line 1938  for (;;)
1938    
1939      switch (*cc)      switch (*cc)
1940        {        {
       case OP_CRPLUS:  
       case OP_CRMINPLUS:  
1941        case OP_CRSTAR:        case OP_CRSTAR:
1942        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1943          case OP_CRPLUS:
1944          case OP_CRMINPLUS:
1945        case OP_CRQUERY:        case OP_CRQUERY:
1946        case OP_CRMINQUERY:        case OP_CRMINQUERY:
1947          case OP_CRPOSSTAR:
1948          case OP_CRPOSPLUS:
1949          case OP_CRPOSQUERY:
1950        return -1;        return -1;
1951    
1952        case OP_CRRANGE:        case OP_CRRANGE:
1953        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1954          case OP_CRPOSRANGE:
1955        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1956        branchlength += (int)GET2(cc,1);        branchlength += (int)GET2(cc,1);
1957        cc += 1 + 2 * IMM2_SIZE;        cc += 1 + 2 * IMM2_SIZE;
# Line 1981  for (;;) Line 2159  for (;;)
2159        {        {
2160        case OP_CHAR:        case OP_CHAR:
2161        case OP_CHARI:        case OP_CHARI:
2162          case OP_NOT:
2163          case OP_NOTI:
2164        case OP_EXACT:        case OP_EXACT:
2165        case OP_EXACTI:        case OP_EXACTI:
2166          case OP_NOTEXACT:
2167          case OP_NOTEXACTI:
2168        case OP_UPTO:        case OP_UPTO:
2169        case OP_UPTOI:        case OP_UPTOI:
2170          case OP_NOTUPTO:
2171          case OP_NOTUPTOI:
2172        case OP_MINUPTO:        case OP_MINUPTO:
2173        case OP_MINUPTOI:        case OP_MINUPTOI:
2174          case OP_NOTMINUPTO:
2175          case OP_NOTMINUPTOI:
2176        case OP_POSUPTO:        case OP_POSUPTO:
2177        case OP_POSUPTOI:        case OP_POSUPTOI:
2178          case OP_NOTPOSUPTO:
2179          case OP_NOTPOSUPTOI:
2180        case OP_STAR:        case OP_STAR:
2181        case OP_STARI:        case OP_STARI:
2182          case OP_NOTSTAR:
2183          case OP_NOTSTARI:
2184        case OP_MINSTAR:        case OP_MINSTAR:
2185        case OP_MINSTARI:        case OP_MINSTARI:
2186          case OP_NOTMINSTAR:
2187          case OP_NOTMINSTARI:
2188        case OP_POSSTAR:        case OP_POSSTAR:
2189        case OP_POSSTARI:        case OP_POSSTARI:
2190          case OP_NOTPOSSTAR:
2191          case OP_NOTPOSSTARI:
2192        case OP_PLUS:        case OP_PLUS:
2193        case OP_PLUSI:        case OP_PLUSI:
2194          case OP_NOTPLUS:
2195          case OP_NOTPLUSI:
2196        case OP_MINPLUS:        case OP_MINPLUS:
2197        case OP_MINPLUSI:        case OP_MINPLUSI:
2198          case OP_NOTMINPLUS:
2199          case OP_NOTMINPLUSI:
2200        case OP_POSPLUS:        case OP_POSPLUS:
2201        case OP_POSPLUSI:        case OP_POSPLUSI:
2202          case OP_NOTPOSPLUS:
2203          case OP_NOTPOSPLUSI:
2204        case OP_QUERY:        case OP_QUERY:
2205        case OP_QUERYI:        case OP_QUERYI:
2206          case OP_NOTQUERY:
2207          case OP_NOTQUERYI:
2208        case OP_MINQUERY:        case OP_MINQUERY:
2209        case OP_MINQUERYI:        case OP_MINQUERYI:
2210          case OP_NOTMINQUERY:
2211          case OP_NOTMINQUERYI:
2212        case OP_POSQUERY:        case OP_POSQUERY:
2213        case OP_POSQUERYI:        case OP_POSQUERYI:
2214          case OP_NOTPOSQUERY:
2215          case OP_NOTPOSQUERYI:
2216        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2217        break;        break;
2218        }        }
# Line 2186  Arguments: Line 2392  Arguments:
2392  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2393  */  */
2394    
 typedef struct recurse_check {  
   struct recurse_check *prev;  
   const pcre_uchar *group;  
 } recurse_check;  
   
2395  static BOOL  static BOOL
2396  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2397    BOOL utf, compile_data *cd, recurse_check *recurses)    BOOL utf, compile_data *cd, recurse_check *recurses)
# Line 2226  for (code = first_significant_code(code Line 2427  for (code = first_significant_code(code
2427    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2428      {      {
2429      const pcre_uchar *scode = cd->start_code + GET(code, 1);      const pcre_uchar *scode = cd->start_code + GET(code, 1);
2430        const pcre_uchar *endgroup = scode;
2431      BOOL empty_branch;      BOOL empty_branch;
2432    
2433      /* Test for forward reference or uncompleted reference. This is disabled      /* Test for forward reference or uncompleted reference. This is disabled
# Line 2240  for (code = first_significant_code(code Line 2442  for (code = first_significant_code(code
2442        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2443        }        }
2444    
2445      /* If we are scanning a completed pattern, there are no forward references      /* If the reference is to a completed group, we need to detect whether this
2446      and all groups are complete. We need to detect whether this is a recursive      is a recursive call, as otherwise there will be an infinite loop. If it is
2447      call, as otherwise there will be an infinite loop. If it is a recursion,      a recursion, just skip over it. Simple recursions are easily detected. For
2448      just skip over it. Simple recursions are easily detected. For mutual      mutual recursions we keep a chain on the stack. */
     recursions we keep a chain on the stack. */  
2449    
2450        do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2451        if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2452      else      else
2453        {        {
2454        recurse_check *r = recurses;        recurse_check *r = recurses;
       const pcre_uchar *endgroup = scode;  
   
       do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);  
       if (code >= scode && code <= endgroup) continue;  /* Simple recursion */  
   
2455        for (r = recurses; r != NULL; r = r->prev)        for (r = recurses; r != NULL; r = r->prev)
2456          if (r->group == scode) break;          if (r->group == scode) break;
2457        if (r != NULL) continue;   /* Mutual recursion */        if (r != NULL) continue;   /* Mutual recursion */
# Line 2308  for (code = first_significant_code(code Line 2506  for (code = first_significant_code(code
2506    if (c == OP_BRA  || c == OP_BRAPOS ||    if (c == OP_BRA  || c == OP_BRAPOS ||
2507        c == OP_CBRA || c == OP_CBRAPOS ||        c == OP_CBRA || c == OP_CBRAPOS ||
2508        c == OP_ONCE || c == OP_ONCE_NC ||        c == OP_ONCE || c == OP_ONCE_NC ||
2509        c == OP_COND)        c == OP_COND || c == OP_SCOND)
2510      {      {
2511      BOOL empty_branch;      BOOL empty_branch;
2512      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 2324  for (code = first_significant_code(code Line 2522  for (code = first_significant_code(code
2522        empty_branch = FALSE;        empty_branch = FALSE;
2523        do        do
2524          {          {
2525          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2526            empty_branch = TRUE;            recurses)) empty_branch = TRUE;
2527          code += GET(code, 1);          code += GET(code, 1);
2528          }          }
2529        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 2366  for (code = first_significant_code(code Line 2564  for (code = first_significant_code(code
2564        case OP_CRMINSTAR:        case OP_CRMINSTAR:
2565        case OP_CRQUERY:        case OP_CRQUERY:
2566        case OP_CRMINQUERY:        case OP_CRMINQUERY:
2567          case OP_CRPOSSTAR:
2568          case OP_CRPOSQUERY:
2569        break;        break;
2570    
2571        default:                   /* Non-repeat => class must match */        default:                   /* Non-repeat => class must match */
2572        case OP_CRPLUS:            /* These repeats aren't empty */        case OP_CRPLUS:            /* These repeats aren't empty */
2573        case OP_CRMINPLUS:        case OP_CRMINPLUS:
2574          case OP_CRPOSPLUS:
2575        return FALSE;        return FALSE;
2576    
2577        case OP_CRRANGE:        case OP_CRRANGE:
2578        case OP_CRMINRANGE:        case OP_CRMINRANGE:
2579          case OP_CRPOSRANGE:
2580        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2581        break;        break;
2582        }        }
# Line 2653  switch(ptype) Line 2855  switch(ptype)
2855    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2856    means that Perl space and POSIX space are now identical. PCRE was changed    means that Perl space and POSIX space are now identical. PCRE was changed
2857    at release 8.34. */    at release 8.34. */
2858    
2859    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2860    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2861    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    switch(c)
2862            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||      {
2863            c == CHAR_FF || c == CHAR_CR)      HSPACE_CASES:
2864            == negated;      VSPACE_CASES:
2865        return negated;
2866    
2867        default:
2868        return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2869        }
2870      break;  /* Control never reaches here */
2871    
2872    case PT_WORD:    case PT_WORD:
2873    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
# Line 2708  get_chr_property_list(const pcre_uchar * Line 2916  get_chr_property_list(const pcre_uchar *
2916    const pcre_uint8 *fcc, pcre_uint32 *list)    const pcre_uint8 *fcc, pcre_uint32 *list)
2917  {  {
2918  pcre_uchar c = *code;  pcre_uchar c = *code;
2919    pcre_uchar base;
2920  const pcre_uchar *end;  const pcre_uchar *end;
 const pcre_uint32 *clist_src;  
 pcre_uint32 *clist_dest;  
2921  pcre_uint32 chr;  pcre_uint32 chr;
2922  pcre_uchar base;  
2923    #ifdef SUPPORT_UCP
2924    pcre_uint32 *clist_dest;
2925    const pcre_uint32 *clist_src;
2926    #else
2927    utf = utf;  /* Suppress "unused parameter" compiler warning */
2928    #endif
2929    
2930  list[0] = c;  list[0] = c;
2931  list[1] = FALSE;  list[1] = FALSE;
# Line 2818  switch(c) Line 3031  switch(c)
3031      return code + 2;      return code + 2;
3032      }      }
3033    
3034    /* Convert only if we have anough space. */    /* Convert only if we have enough space. */
3035    
3036    clist_src = PRIV(ucd_caseless_sets) + code[1];    clist_src = PRIV(ucd_caseless_sets) + code[1];
3037    clist_dest = list + 2;    clist_dest = list + 2;
3038    code += 2;    code += 2;
3039    
3040    do {    do {
      /* Early return if there is not enough space. */  
3041       if (clist_dest >= list + 8)       if (clist_dest >= list + 8)
3042         {         {
3043           /* Early return if there is not enough space. This should never
3044           happen, since all clists are shorter than 5 characters now. */
3045         list[2] = code[0];         list[2] = code[0];
3046         list[3] = code[1];         list[3] = code[1];
3047         return code;         return code;
3048         }         }
3049       *clist_dest++ = *clist_src;       *clist_dest++ = *clist_src;
3050       }       }
3051     while(*clist_src++ != NOTACHAR);    while(*clist_src++ != NOTACHAR);
3052    
3053    /* Enough space to store all characters. */    /* All characters are stored. The terminating NOTACHAR
3054      is copied from the clist itself. */
3055    
3056    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3057    return code;    return code;
# Line 2846  switch(c) Line 3061  switch(c)
3061    case OP_CLASS:    case OP_CLASS:
3062  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3063    case OP_XCLASS:    case OP_XCLASS:
   
3064    if (c == OP_XCLASS)    if (c == OP_XCLASS)
3065      end = code + GET(code, 0);      end = code + GET(code, 0) - 1;
3066    else    else
3067  #endif  #endif
3068      end = code + 32 / sizeof(pcre_uchar);      end = code + 32 / sizeof(pcre_uchar);
# Line 2859  switch(c) Line 3073  switch(c)
3073      case OP_CRMINSTAR:      case OP_CRMINSTAR:
3074      case OP_CRQUERY:      case OP_CRQUERY:
3075      case OP_CRMINQUERY:      case OP_CRMINQUERY:
3076        case OP_CRPOSSTAR:
3077        case OP_CRPOSQUERY:
3078      list[1] = TRUE;      list[1] = TRUE;
3079      end++;      end++;
3080      break;      break;
3081    
3082        case OP_CRPLUS:
3083        case OP_CRMINPLUS:
3084        case OP_CRPOSPLUS:
3085        end++;
3086        break;
3087    
3088      case OP_CRRANGE:      case OP_CRRANGE:
3089      case OP_CRMINRANGE:      case OP_CRMINRANGE:
3090        case OP_CRPOSRANGE:
3091      list[1] = (GET2(end, 1) == 0);      list[1] = (GET2(end, 1) == 0);
3092      end += 1 + 2 * IMM2_SIZE;      end += 1 + 2 * IMM2_SIZE;
3093      break;      break;
3094      }      }
3095    list[2] = end - code;    list[2] = (pcre_uint32)(end - code);
3096    return end;    return end;
3097    }    }
3098  return NULL;    /* Opcode not accepted */  return NULL;    /* Opcode not accepted */
# Line 2895  Returns:      TRUE if the auto-possessif Line 3118  Returns:      TRUE if the auto-possessif
3118    
3119  static BOOL  static BOOL
3120  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3121    const pcre_uint32* base_list)    const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3122  {  {
3123  pcre_uchar c;  pcre_uchar c;
3124  pcre_uint32 list[8];  pcre_uint32 list[8];
3125  const pcre_uint32* chr_ptr;  const pcre_uint32 *chr_ptr;
3126  const pcre_uint32* ochr_ptr;  const pcre_uint32 *ochr_ptr;
3127  const pcre_uint32* list_ptr;  const pcre_uint32 *list_ptr;
3128    const pcre_uchar *next_code;
3129    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3130    const pcre_uchar *xclass_flags;
3131    #endif
3132    const pcre_uint8 *class_bitset;
3133    const pcre_uint8 *set1, *set2, *set_end;
3134  pcre_uint32 chr;  pcre_uint32 chr;
3135    BOOL accepted, invert_bits;
3136    BOOL entered_a_group = FALSE;
3137    
3138    if (*rec_limit == 0) return FALSE;
3139    --(*rec_limit);
3140    
3141    /* Note: the base_list[1] contains whether the current opcode has greedy
3142    (represented by a non-zero value) quantifier. This is different from
3143    other character type lists, which store here whether the character iterator
3144    matches an empty string (also represented by a non-zero value). */
3145    
3146  for(;;)  for(;;)
3147    {    {
3148      /* All operations move the code pointer forward.
3149      Therefore infinite recursions are not possible. */
3150    
3151    c = *code;    c = *code;
3152    
3153    /* Skip over callouts */    /* Skip over callouts */
# Line 2925  for(;;) Line 3167  for(;;)
3167    switch(c)    switch(c)
3168      {      {
3169      case OP_END:      case OP_END:
3170      /* TRUE only in greedy case. The non-greedy case could be replaced by an      case OP_KETRPOS:
3171      OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses      /* TRUE only in greedy case. The non-greedy case could be replaced by
3172      more memory, which we cannot get at this stage.) */      an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3173        uses more memory, which we cannot get at this stage.) */
3174    
3175      return base_list[1] != 0;      return base_list[1] != 0;
3176    
3177      case OP_KET:      case OP_KET:
3178      /* If the bracket is capturing, and referenced by an OP_RECURSE, the      /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3179      non-greedy case cannot be converted to a possessive form. We do not test      it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3180      the bracket type at the moment, but we might do it in the future to improve      cannot be converted to a possessive form. */
     this condition. (But note that recursive calls are always atomic.) */  
3181    
3182      if (base_list[1] == 0) return FALSE;      if (base_list[1] == 0) return FALSE;
3183    
3184        switch(*(code - GET(code, 1)))
3185          {
3186          case OP_ASSERT:
3187          case OP_ASSERT_NOT:
3188          case OP_ASSERTBACK:
3189          case OP_ASSERTBACK_NOT:
3190          case OP_ONCE:
3191          case OP_ONCE_NC:
3192          /* Atomic sub-patterns and assertions can always auto-possessify their
3193          last iterator. However, if the group was entered as a result of checking
3194          a previous iterator, this is not possible. */
3195    
3196          return !entered_a_group;
3197          }
3198    
3199        code += PRIV(OP_lengths)[c];
3200        continue;
3201    
3202        case OP_ONCE:
3203        case OP_ONCE_NC:
3204        case OP_BRA:
3205        case OP_CBRA:
3206        next_code = code + GET(code, 1);
3207        code += PRIV(OP_lengths)[c];
3208    
3209        while (*next_code == OP_ALT)
3210          {
3211          if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3212            return FALSE;
3213          code = next_code + 1 + LINK_SIZE;
3214          next_code += GET(next_code, 1);
3215          }
3216    
3217        entered_a_group = TRUE;
3218        continue;
3219    
3220        case OP_BRAZERO:
3221        case OP_BRAMINZERO:
3222    
3223        next_code = code + 1;
3224        if (*next_code != OP_BRA && *next_code != OP_CBRA
3225            && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3226    
3227        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3228    
3229        /* The bracket content will be checked by the
3230        OP_BRA/OP_CBRA case above. */
3231        next_code += 1 + LINK_SIZE;
3232        if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3233          return FALSE;
3234    
3235      code += PRIV(OP_lengths)[c];      code += PRIV(OP_lengths)[c];
3236      continue;      continue;
3237    
3238        default:
3239        break;
3240      }      }
3241    
3242    /* Check for a supported opcode, and load its properties. */    /* Check for a supported opcode, and load its properties. */
# Line 2961  for(;;) Line 3258  for(;;)
3258      list_ptr = base_list;      list_ptr = base_list;
3259      }      }
3260    
3261      /* Character bitsets can also be compared to certain opcodes. */
3262    
3263      else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3264    #ifdef COMPILE_PCRE8
3265          /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3266          || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3267    #endif
3268          )
3269        {
3270    #ifdef COMPILE_PCRE8
3271        if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3272    #else
3273        if (base_list[0] == OP_CLASS)
3274    #endif
3275          {
3276          set1 = (pcre_uint8 *)(base_end - base_list[2]);
3277          list_ptr = list;
3278          }
3279        else
3280          {
3281          set1 = (pcre_uint8 *)(code - list[2]);
3282          list_ptr = base_list;
3283          }
3284    
3285        invert_bits = FALSE;
3286        switch(list_ptr[0])
3287          {
3288          case OP_CLASS:
3289          case OP_NCLASS:
3290          set2 = (pcre_uint8 *)
3291            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3292          break;
3293    
3294    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3295          case OP_XCLASS:
3296          xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3297          if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3298          if ((*xclass_flags & XCL_MAP) == 0)
3299            {
3300            /* No bits are set for characters < 256. */
3301            if (list[1] == 0) return TRUE;
3302            /* Might be an empty repeat. */
3303            continue;
3304            }
3305          set2 = (pcre_uint8 *)(xclass_flags + 1);
3306          break;
3307    #endif
3308    
3309          case OP_NOT_DIGIT:
3310          invert_bits = TRUE;
3311          /* Fall through */
3312          case OP_DIGIT:
3313          set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3314          break;
3315    
3316          case OP_NOT_WHITESPACE:
3317          invert_bits = TRUE;
3318          /* Fall through */
3319          case OP_WHITESPACE:
3320          set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3321          break;
3322    
3323          case OP_NOT_WORDCHAR:
3324          invert_bits = TRUE;
3325          /* Fall through */
3326          case OP_WORDCHAR:
3327          set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3328          break;
3329    
3330          default:
3331          return FALSE;
3332          }
3333    
3334        /* Because the sets are unaligned, we need
3335        to perform byte comparison here. */
3336        set_end = set1 + 32;
3337        if (invert_bits)
3338          {
3339          do
3340            {
3341            if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3342            }
3343          while (set1 < set_end);
3344          }
3345        else
3346          {
3347          do
3348            {
3349            if ((*set1++ & *set2++) != 0) return FALSE;
3350            }
3351          while (set1 < set_end);
3352          }
3353    
3354        if (list[1] == 0) return TRUE;
3355        /* Might be an empty repeat. */
3356        continue;
3357        }
3358    
3359    /* Some property combinations also acceptable. Unicode property opcodes are    /* Some property combinations also acceptable. Unicode property opcodes are
3360    processed specially; the rest can be handled with a lookup table. */    processed specially; the rest can be handled with a lookup table. */
3361    
# Line 2968  for(;;) Line 3363  for(;;)
3363      {      {
3364      pcre_uint32 leftop, rightop;      pcre_uint32 leftop, rightop;
3365    
     if (list[1] != 0) return FALSE;   /* Must match at least one character */  
3366      leftop = base_list[0];      leftop = base_list[0];
3367      rightop = list[0];      rightop = list[0];
3368    
3369  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3370        accepted = FALSE; /* Always set in non-unicode case. */
3371      if (leftop == OP_PROP || leftop == OP_NOTPROP)      if (leftop == OP_PROP || leftop == OP_NOTPROP)
3372        {        {
3373        if (rightop == OP_EOD) return TRUE;        if (rightop == OP_EOD)
3374        if (rightop == OP_PROP || rightop == OP_NOTPROP)          accepted = TRUE;
3375          else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3376          {          {
3377          int n;          int n;
3378          const pcre_uint8 *p;          const pcre_uint8 *p;
# Line 2997  for(;;) Line 3393  for(;;)
3393          n = propposstab[base_list[2]][list[2]];          n = propposstab[base_list[2]][list[2]];
3394          switch(n)          switch(n)
3395            {            {
3396            case 0: return FALSE;            case 0: break;
3397            case 1: return bothprop;            case 1: accepted = bothprop; break;
3398            case 2: return (base_list[3] == list[3]) != same;            case 2: accepted = (base_list[3] == list[3]) != same; break;
3399            case 3: return !same;            case 3: accepted = !same; break;
3400    
3401            case 4:  /* Left general category, right particular category */            case 4:  /* Left general category, right particular category */
3402            return risprop && catposstab[base_list[3]][list[3]] == same;            accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3403              break;
3404    
3405            case 5:  /* Right general category, left particular category */            case 5:  /* Right general category, left particular category */
3406            return lisprop && catposstab[list[3]][base_list[3]] == same;            accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3407              break;
3408    
3409            /* This code is logically tricky. Think hard before fiddling with it.            /* This code is logically tricky. Think hard before fiddling with it.
3410            The posspropstab table has four entries per row. Each row relates to            The posspropstab table has four entries per row. Each row relates to
3411            one of PCRE's special properties such as ALNUM or SPACE or WORD.            one of PCRE's special properties such as ALNUM or SPACE or WORD.
3412            Only WORD actually needs all four entries, but using repeats for the            Only WORD actually needs all four entries, but using repeats for the
3413            others means they can all use the same code below.            others means they can all use the same code below.
3414    
3415            The first two entries in each row are Unicode general categories, and            The first two entries in each row are Unicode general categories, and
3416            apply always, because all the characters they include are part of the            apply always, because all the characters they include are part of the
3417            PCRE character set. The third and fourth entries are a general and a            PCRE character set. The third and fourth entries are a general and a
# Line 3023  for(;;) Line 3421  for(;;)
3421            category contains more characters than the specials that are defined            category contains more characters than the specials that are defined
3422            for the property being tested against. Therefore, it cannot be used            for the property being tested against. Therefore, it cannot be used
3423            in a NOTPROP case.            in a NOTPROP case.
3424    
3425            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3426            Underscore is covered by ucp_P or ucp_Po. */            Underscore is covered by ucp_P or ucp_Po. */
3427    
# Line 3031  for(;;) Line 3429  for(;;)
3429            case 7:  /* Left space vs right general category */            case 7:  /* Left space vs right general category */
3430            case 8:  /* Left word vs right general category */            case 8:  /* Left word vs right general category */
3431            p = posspropstab[n-6];            p = posspropstab[n-6];
3432            return risprop && lisprop ==            accepted = risprop && lisprop ==
3433              (list[3] != p[0] &&              (list[3] != p[0] &&
3434               list[3] != p[1] &&               list[3] != p[1] &&
3435              (list[3] != p[2] || !lisprop));              (list[3] != p[2] || !lisprop));
3436              break;
3437    
3438            case 9:   /* Right alphanum vs left general category */            case 9:   /* Right alphanum vs left general category */
3439            case 10:  /* Right space vs left general category */            case 10:  /* Right space vs left general category */
3440            case 11:  /* Right word vs left general category */            case 11:  /* Right word vs left general category */
3441            p = posspropstab[n-9];            p = posspropstab[n-9];
3442            return lisprop && risprop ==            accepted = lisprop && risprop ==
3443              (base_list[3] != p[0] &&              (base_list[3] != p[0] &&
3444               base_list[3] != p[1] &&               base_list[3] != p[1] &&
3445              (base_list[3] != p[2] || !risprop));              (base_list[3] != p[2] || !risprop));
3446              break;
3447    
3448            case 12:  /* Left alphanum vs right particular category */            case 12:  /* Left alphanum vs right particular category */
3449            case 13:  /* Left space vs right particular category */            case 13:  /* Left space vs right particular category */
3450            case 14:  /* Left word vs right particular category */            case 14:  /* Left word vs right particular category */
3451            p = posspropstab[n-12];            p = posspropstab[n-12];
3452            return risprop && lisprop ==            accepted = risprop && lisprop ==
3453              (catposstab[p[0]][list[3]] &&              (catposstab[p[0]][list[3]] &&
3454               catposstab[p[1]][list[3]] &&               catposstab[p[1]][list[3]] &&
3455              (list[3] != p[3] || !lisprop));              (list[3] != p[3] || !lisprop));
3456              break;
3457    
3458            case 15:  /* Right alphanum vs left particular category */            case 15:  /* Right alphanum vs left particular category */
3459            case 16:  /* Right space vs left particular category */            case 16:  /* Right space vs left particular category */
3460            case 17:  /* Right word vs left particular category */            case 17:  /* Right word vs left particular category */
3461            p = posspropstab[n-15];            p = posspropstab[n-15];
3462            return lisprop && risprop ==            accepted = lisprop && risprop ==
3463              (catposstab[p[0]][base_list[3]] &&              (catposstab[p[0]][base_list[3]] &&
3464               catposstab[p[1]][base_list[3]] &&               catposstab[p[1]][base_list[3]] &&
3465              (base_list[3] != p[3] || !risprop));              (base_list[3] != p[3] || !risprop));
3466              break;
3467            }            }
3468          }          }
       return FALSE;  
3469        }        }
3470    
3471      else      else
3472  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
3473    
3474      return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&      accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3475             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3476             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3477    
3478        if (!accepted) return FALSE;
3479    
3480        if (list[1] == 0) return TRUE;
3481        /* Might be an empty repeat. */
3482        continue;
3483      }      }
3484    
3485    /* Control reaches here only if one of the items is a small character list.    /* Control reaches here only if one of the items is a small character list.
# Line 3186  for(;;) Line 3593  for(;;)
3593        case OP_EOD:    /* Can always possessify before \z */        case OP_EOD:    /* Can always possessify before \z */
3594        break;        break;
3595    
3596    #ifdef SUPPORT_UCP
3597        case OP_PROP:        case OP_PROP:
3598        case OP_NOTPROP:        case OP_NOTPROP:
3599        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3600              list_ptr[0] == OP_NOTPROP))              list_ptr[0] == OP_NOTPROP))
3601          return FALSE;          return FALSE;
3602        break;        break;
3603    #endif
       /* The class comparisons work only when the class is the second item  
       of the pair, because there are at present no possessive forms of the  
       class opcodes. Note also that the "code" variable that is used below  
       points after the second item, and that the pointer for the first item  
       is not available, so even if there were possessive forms of the class  
       opcodes, the correct comparison could not be done. */  
3604    
3605        case OP_NCLASS:        case OP_NCLASS:
3606        if (chr > 255) return FALSE;        if (chr > 255) return FALSE;
3607        /* Fall through */        /* Fall through */
3608    
3609        case OP_CLASS:        case OP_CLASS:
       if (list_ptr != list) return FALSE;   /* Class is first opcode */  
3610        if (chr > 255) break;        if (chr > 255) break;
3611        if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)        class_bitset = (pcre_uint8 *)
3612          return FALSE;          ((list_ptr == list ? code : base_end) - list_ptr[2]);
3613          if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3614        break;        break;
3615    
3616  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3617        case OP_XCLASS:        case OP_XCLASS:
3618        if (list_ptr != list) return FALSE;   /* Class is first opcode */        if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3619        if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))            list_ptr[2] + LINK_SIZE, utf)) return FALSE;
         return FALSE;  
3620        break;        break;
3621  #endif  #endif
3622    
# Line 3232  for(;;) Line 3633  for(;;)
3633    if (list[1] == 0) return TRUE;    if (list[1] == 0) return TRUE;
3634    }    }
3635    
3636  return FALSE;  /* Control never reaches here. There used to be a fail-safe return FALSE; here,
3637    but some compilers complain about an unreachable statement. */
3638    
3639  }  }
3640    
3641    
# Line 3257  auto_possessify(pcre_uchar *code, BOOL u Line 3660  auto_possessify(pcre_uchar *code, BOOL u
3660  {  {
3661  register pcre_uchar c;  register pcre_uchar c;
3662  const pcre_uchar *end;  const pcre_uchar *end;
3663    pcre_uchar *repeat_opcode;
3664  pcre_uint32 list[8];  pcre_uint32 list[8];
3665    int rec_limit;
3666    
3667  for (;;)  for (;;)
3668    {    {
3669    c = *code;    c = *code;
3670    
3671      /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3672      it may compile without complaining, but may get into a loop here if the code
3673      pointer points to a bad value. This is, of course, a documented possibility,
3674      when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3675      just give up on this optimization. */
3676    
3677      if (c >= OP_TABLE_LENGTH) return;
3678    
3679    if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)    if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3680      {      {
3681      c -= get_repeat_base(c) - OP_STAR;      c -= get_repeat_base(c) - OP_STAR;
# Line 3270  for (;;) Line 3683  for (;;)
3683        get_chr_property_list(code, utf, cd->fcc, list) : NULL;        get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3684      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3685    
3686      if (end != NULL && compare_opcodes(end, utf, cd, list))      rec_limit = 1000;
3687        if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3688        {        {
3689        switch(c)        switch(c)
3690          {          {
# Line 3303  for (;;) Line 3717  for (;;)
3717          break;          break;
3718    
3719          case OP_MINUPTO:          case OP_MINUPTO:
3720          *code += OP_MINUPTO - OP_UPTO;          *code += OP_POSUPTO - OP_MINUPTO;
3721          break;          break;
3722          }          }
3723        }        }
3724      c = *code;      c = *code;
3725      }      }
3726      else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3727        {
3728    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3729        if (c == OP_XCLASS)
3730          repeat_opcode = code + GET(code, 1);
3731        else
3732    #endif
3733          repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3734    
3735        c = *repeat_opcode;
3736        if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3737          {
3738          /* end must not be NULL. */
3739          end = get_chr_property_list(code, utf, cd->fcc, list);
3740    
3741          list[1] = (c & 1) == 0;
3742    
3743          rec_limit = 1000;
3744          if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3745            {
3746            switch (c)
3747              {
3748              case OP_CRSTAR:
3749              case OP_CRMINSTAR:
3750              *repeat_opcode = OP_CRPOSSTAR;
3751              break;
3752    
3753              case OP_CRPLUS:
3754              case OP_CRMINPLUS:
3755              *repeat_opcode = OP_CRPOSPLUS;
3756              break;
3757    
3758              case OP_CRQUERY:
3759              case OP_CRMINQUERY:
3760              *repeat_opcode = OP_CRPOSQUERY;
3761              break;
3762    
3763              case OP_CRRANGE:
3764              case OP_CRMINRANGE:
3765              *repeat_opcode = OP_CRPOSRANGE;
3766              break;
3767              }
3768            }
3769          }
3770        c = *code;
3771        }
3772    
3773    switch(c)    switch(c)
3774      {      {
# Line 3335  for (;;) Line 3795  for (;;)
3795        code += 2;        code += 2;
3796      break;      break;
3797    
3798    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3799      case OP_XCLASS:      case OP_XCLASS:
3800      code += GET(code, 1);      code += GET(code, 1);
3801      break;      break;
3802    #endif
3803    
3804      case OP_MARK:      case OP_MARK:
3805      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
# Line 3443  didn't consider this to be a POSIX class Line 3905  didn't consider this to be a POSIX class
3905  The problem in trying to be exactly like Perl is in the handling of escapes. We  The problem in trying to be exactly like Perl is in the handling of escapes. We
3906  have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX  have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3907  class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code  class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3908  below handles the special case of \], but does not try to do any other escape  below handles the special cases \\ and \], but does not try to do any other
3909  processing. This makes it different from Perl for cases such as [:l\ower:]  escape processing. This makes it different from Perl for cases such as
3910  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize  [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3911  "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,  not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3912  I think.  when Perl does, I think.
3913    
3914  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3915  It seems that the appearance of a nested POSIX class supersedes an apparent  It seems that the appearance of a nested POSIX class supersedes an apparent
# Line 3474  pcre_uchar terminator;          /* Don't Line 3936  pcre_uchar terminator;          /* Don't
3936  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3937  for (++ptr; *ptr != CHAR_NULL; ptr++)  for (++ptr; *ptr != CHAR_NULL; ptr++)
3938    {    {
3939    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)    if (*ptr == CHAR_BACKSLASH &&
3940          (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3941           ptr[1] == CHAR_BACKSLASH))
3942      ptr++;      ptr++;
3943    else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;    else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3944    else              *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3945      else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3946      {      {
3947      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      *endptr = ptr;
3948        {      return TRUE;
       *endptr = ptr;  
       return TRUE;  
       }  
     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&  
          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||  
           ptr[1] == CHAR_EQUALS_SIGN) &&  
         check_posix_syntax(ptr, endptr))  
       return FALSE;  
3949      }      }
3950    }    }
3951  return FALSE;  return FALSE;
# Line 3542  have their offsets adjusted. That one of Line 3999  have their offsets adjusted. That one of
3999  is called, the partially compiled regex must be temporarily terminated with  is called, the partially compiled regex must be temporarily terminated with
4000  OP_END.  OP_END.
4001    
4002  This function has been extended with the possibility of forward references for  This function has been extended to cope with forward references for recursions
4003  recursions and subroutine calls. It must also check the list of such references  and subroutine calls. It must check the list of such references for the
4004  for the group we are dealing with. If it finds that one of the recursions in  group we are dealing with. If it finds that one of the recursions in the
4005  the current group is on this list, it adjusts the offset in the list, not the  current group is on this list, it does not adjust the value in the reference
4006  value in the reference (which is a group number).  (which is a group number). After the group has been scanned, all the offsets in
4007    the forward reference list for the group are adjusted.
4008    
4009  Arguments:  Arguments:
4010    group      points to the start of the group    group      points to the start of the group
4011    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
4012    utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode    utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
4013    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
4014    save_hwm   the hwm forward reference pointer at the start of the group    save_hwm_offset   the hwm forward reference offset at the start of the group
4015    
4016  Returns:     nothing  Returns:     nothing
4017  */  */
4018    
4019  static void  static void
4020  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4021    pcre_uchar *save_hwm)    size_t save_hwm_offset)
4022  {  {
4023    int offset;
4024    pcre_uchar *hc;
4025  pcre_uchar *ptr = group;  pcre_uchar *ptr = group;
4026    
4027  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4028    {    {
4029    int offset;    for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4030    pcre_uchar *hc;         hc += LINK_SIZE)
   
   /* See if this recursion is on the forward reference list. If so, adjust the  
   reference. */  
   
   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)  
4031      {      {
4032      offset = (int)GET(hc, 0);      offset = (int)GET(hc, 0);
4033      if (cd->start_code + offset == ptr + 1)      if (cd->start_code + offset == ptr + 1) break;
       {  
       PUT(hc, 0, offset + adjust);  
       break;  
       }  
4034      }      }
4035    
4036    /* Otherwise, adjust the recursion offset if it's after the start of this    /* If we have not found this recursion on the forward reference list, adjust
4037    group. */    the recursion's offset if it's after the start of this group. */
4038    
4039    if (hc >= cd->hwm)    if (hc >= cd->hwm)
4040      {      {
# Line 3593  while ((ptr = (pcre_uchar *)find_recurse Line 4044  while ((ptr = (pcre_uchar *)find_recurse
4044    
4045    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
4046    }    }
4047    
4048    /* Now adjust all forward reference offsets for the group. */
4049    
4050    for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4051         hc += LINK_SIZE)
4052      {
4053      offset = (int)GET(hc, 0);
4054      PUT(hc, 0, offset + adjust);
4055      }
4056  }  }
4057    
4058    
# Line 3695  for (c = *cptr; c <= d; c++) Line 4155  for (c = *cptr; c <= d; c++)
4155    
4156  if (c > d) return -1;  /* Reached end of range */  if (c > d) return -1;  /* Reached end of range */
4157    
4158    /* Found a character that has a single other case. Search for the end of the
4159    range, which is either the end of the input range, or a character that has zero
4160    or more than one other cases. */
4161    
4162  *ocptr = othercase;  *ocptr = othercase;
4163  next = othercase + 1;  next = othercase + 1;
4164    
4165  for (++c; c <= d; c++)  for (++c; c <= d; c++)
4166    {    {
4167    if (UCD_OTHERCASE(c) != next) break;    if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4168    next++;    next++;
4169    }    }
4170    
# Line 3738  add_to_class(pcre_uint8 *classbits, pcre Line 4202  add_to_class(pcre_uint8 *classbits, pcre
4202    compile_data *cd, pcre_uint32 start, pcre_uint32 end)    compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4203  {  {
4204  pcre_uint32 c;  pcre_uint32 c;
4205    pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4206  int n8 = 0;  int n8 = 0;
4207    
4208  /* If caseless matching is required, scan the range and process alternate  /* If caseless matching is required, scan the range and process alternate
# Line 3772  if ((options & PCRE_CASELESS) != 0) Line 4237  if ((options & PCRE_CASELESS) != 0)
4237        range. Otherwise, use a recursive call to add the additional range. */        range. Otherwise, use a recursive call to add the additional range. */
4238    
4239        else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */        else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4240        else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */        else if (od > end && oc <= end + 1)
4241            {
4242            end = od;       /* Extend upwards */
4243            if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4244            }
4245        else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);        else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4246        }        }
4247      }      }
# Line 3781  if ((options & PCRE_CASELESS) != 0) Line 4250  if ((options & PCRE_CASELESS) != 0)
4250    
4251    /* Not UTF-mode, or no UCP */    /* Not UTF-mode, or no UCP */
4252    
4253    for (c = start; c <= end && c < 256; c++)    for (c = start; c <= classbits_end; c++)
4254      {      {
4255      SETBIT(classbits, cd->fcc[c]);      SETBIT(classbits, cd->fcc[c]);
4256      n8++;      n8++;
# Line 3806  in all cases. */ Line 4275  in all cases. */
4275    
4276  #endif /* COMPILE_PCRE[8|16] */  #endif /* COMPILE_PCRE[8|16] */
4277    
4278  /* If all characters are less than 256, use the bit map. Otherwise use extra  /* Use the bitmap for characters < 256. Otherwise use extra data.*/
 data. */  
4279    
4280  if (end < 0x100)  for (c = start; c <= classbits_end; c++)
4281    {    {
4282    for (c = start; c <= end; c++)    /* Regardless of start, c will always be <= 255. */
4283      {    SETBIT(classbits, c);
4284      n8++;    n8++;
     SETBIT(classbits, c);  
     }  
4285    }    }
4286    
4287  else  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4288    if (start <= 0xff) start = 0xff + 1;
4289    
4290    if (end >= start)
4291    {    {
4292    pcre_uchar *uchardata = *uchardptr;    pcre_uchar *uchardata = *uchardptr;
   
4293  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4294    if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */    if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
4295      {      {
# Line 3861  else Line 4329  else
4329    
4330    *uchardptr = uchardata;   /* Update extra data pointer */    *uchardptr = uchardata;   /* Update extra data pointer */
4331    }    }
4332    #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4333    
4334  return n8;    /* Number of 8-bit characters */  return n8;    /* Number of 8-bit characters */
4335  }  }
# Line 4012  const pcre_uchar *tempptr; Line 4481  const pcre_uchar *tempptr;
4481  const pcre_uchar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
4482  pcre_uchar *previous = NULL;  pcre_uchar *previous = NULL;
4483  pcre_uchar *previous_callout = NULL;  pcre_uchar *previous_callout = NULL;
4484  pcre_uchar *save_hwm = NULL;  size_t item_hwm_offset = 0;
4485  pcre_uint8 classbits[32];  pcre_uint8 classbits[32];
4486    
4487  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
# Line 4082  for (;; ptr++) Line 4551  for (;; ptr++)
4551    BOOL reset_bracount;    BOOL reset_bracount;
4552    int class_has_8bitchar;    int class_has_8bitchar;
4553    int class_one_char;    int class_one_char;
4554    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4555      BOOL xclass_has_prop;
4556    #endif
4557    int newoptions;    int newoptions;
4558    int recno;    int recno;
4559    int refsign;    int refsign;
# Line 4167  for (;; ptr++) Line 4639  for (;; ptr++)
4639    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
4640    reference list. */    reference list. */
4641    
4642    else if (cd->hwm > cd->start_workspace + cd->workspace_size -    else if (cd->hwm > cd->start_workspace + cd->workspace_size)
            WORK_SIZE_SAFETY_MARGIN)  
4643      {      {
4644      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
4645      goto FAILED;      goto FAILED;
# Line 4199  for (;; ptr++) Line 4670  for (;; ptr++)
4670          }          }
4671        goto NORMAL_CHAR;        goto NORMAL_CHAR;
4672        }        }
4673        /* Control does not reach here. */
4674      }      }
4675    
4676    /* Fill in length of a previous callout, except when the next thing is    /* In extended mode, skip white space and comments. We need a loop in order
4677    a quantifier. */    to check for more white space and more comments after a comment. */
   
   is_quantifier =  
     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||  
     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));  
   
   if (!is_quantifier && previous_callout != NULL &&  
        after_manual_callout-- <= 0)  
     {  
     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */  
       complete_callout(previous_callout, ptr, cd);  
     previous_callout = NULL;  
     }  
   
   /* In extended mode, skip white space and comments. */  
4678    
4679    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
4680      {      {
4681      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;      for (;;)
     if (c == CHAR_NUMBER_SIGN)  
4682        {        {
4683          while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4684          if (c != CHAR_NUMBER_SIGN) break;
4685        ptr++;        ptr++;
4686        while (*ptr != CHAR_NULL)        while (*ptr != CHAR_NULL)
4687          {          {
4688          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4689              {                          /* IS_NEWLINE sets cd->nllen. */
4690              ptr += cd->nllen;
4691              break;
4692              }
4693          ptr++;          ptr++;
4694  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4695          if (utf) FORWARDCHAR(ptr);          if (utf) FORWARDCHAR(ptr);
4696  #endif  #endif
4697          }          }
4698        if (*ptr != CHAR_NULL) continue;        c = *ptr;     /* Either NULL or the char after a newline */
4699          }
4700        }
4701    
4702        /* Else fall through to handle end of string */    /* Skip over (?# comments. We need to do this here because we want to know if
4703        c = 0;    the next thing is a quantifier, and these comments may come between an item
4704      and its quantifier. */
4705    
4706      if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
4707          ptr[2] == CHAR_NUMBER_SIGN)
4708        {
4709        ptr += 3;
4710        while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4711        if (*ptr == CHAR_NULL)
4712          {
4713          *errorcodeptr = ERR18;
4714          goto FAILED;
4715        }        }
4716        continue;
4717      }      }
4718    
4719    /* No auto callout for quantifiers. */    /* See if the next thing is a quantifier. */
4720    
4721    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)    is_quantifier =
4722        c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4723        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4724    
4725      /* Fill in length of a previous callout, except when the next thing is a
4726      quantifier or when processing a property substitution string in UCP mode. */
4727    
4728      if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4729           after_manual_callout-- <= 0)
4730        {
4731        if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4732          complete_callout(previous_callout, ptr, cd);
4733        previous_callout = NULL;
4734        }
4735    
4736      /* Create auto callout, except for quantifiers, or while processing property
4737      strings that are substituted for \w etc in UCP mode. */
4738    
4739      if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4740      {      {
4741      previous_callout = code;      previous_callout = code;
4742      code = auto_callout(code, ptr, cd);      code = auto_callout(code, ptr, cd);
4743      }      }
4744    
4745      /* Process the next pattern item. */
4746    
4747    switch(c)    switch(c)
4748      {      {
4749      /* ===================================================================*/      /* ===================================================================*/
4750      case 0:                        /* The branch terminates at string end */      case CHAR_NULL:                /* The branch terminates at string end */
4751      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
4752      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
4753      *firstcharptr = firstchar;      *firstcharptr = firstchar;
# Line 4280  for (;; ptr++) Line 4777  for (;; ptr++)
4777      previous = NULL;      previous = NULL;
4778      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
4779        {        {
4780        if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;        if (firstcharflags == REQ_UNSET)
4781            zerofirstcharflags = firstcharflags = REQ_NONE;
4782        *code++ = OP_CIRCM;        *code++ = OP_CIRCM;
4783        }        }
4784      else *code++ = OP_CIRC;      else *code++ = OP_CIRC;
# Line 4301  for (;; ptr++) Line 4799  for (;; ptr++)
4799      zeroreqchar = reqchar;      zeroreqchar = reqchar;
4800      zeroreqcharflags = reqcharflags;      zeroreqcharflags = reqcharflags;
4801      previous = code;      previous = code;
4802        item_hwm_offset = cd->hwm - cd->start_workspace;
4803      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4804      break;      break;
4805    
# Line 4328  for (;; ptr++) Line 4827  for (;; ptr++)
4827        }        }
4828      goto NORMAL_CHAR;      goto NORMAL_CHAR;
4829    
4830        /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4831        used for "start of word" and "end of word". As these are otherwise illegal
4832        sequences, we don't break anything by recognizing them. They are replaced
4833        by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4834        erroneous and are handled by the normal code below. */
4835    
4836      case CHAR_LEFT_SQUARE_BRACKET:      case CHAR_LEFT_SQUARE_BRACKET:
4837        if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4838          {
4839          nestptr = ptr + 7;
4840          ptr = sub_start_of_word - 1;
4841          continue;
4842          }
4843    
4844        if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4845          {
4846          nestptr = ptr + 7;
4847          ptr = sub_end_of_word - 1;
4848          continue;
4849          }
4850    
4851        /* Handle a real character class. */
4852    
4853      previous = code;      previous = code;
4854        item_hwm_offset = cd->hwm - cd->start_workspace;
4855    
4856      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4857      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
# Line 4385  for (;; ptr++) Line 4907  for (;; ptr++)
4907    
4908      should_flip_negation = FALSE;      should_flip_negation = FALSE;
4909    
4910        /* Extended class (xclass) will be used when characters > 255
4911        might match. */
4912    
4913    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4914        xclass = FALSE;
4915        class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4916        class_uchardata_base = class_uchardata;   /* Save the start */
4917    #endif
4918    
4919      /* For optimization purposes, we track some properties of the class:      /* For optimization purposes, we track some properties of the class:
4920      class_has_8bitchar will be non-zero if the class contains at least one <      class_has_8bitchar will be non-zero if the class contains at least one <
4921      256 character; class_one_char will be 1 if the class contains just one      256 character; class_one_char will be 1 if the class contains just one
4922      character. */      character; xclass_has_prop will be TRUE if Unicode property checks
4923        are present in the class. */
4924    
4925      class_has_8bitchar = 0;      class_has_8bitchar = 0;
4926      class_one_char = 0;      class_one_char = 0;
4927    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4928        xclass_has_prop = FALSE;
4929    #endif
4930    
4931      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
4932      temporary bit of memory, in case the class contains fewer than two      temporary bit of memory, in case the class contains fewer than two
# Line 4400  for (;; ptr++) Line 4935  for (;; ptr++)
4935    
4936      memset(classbits, 0, 32 * sizeof(pcre_uint8));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
4937    
 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  
     xclass = FALSE;  
     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */  
     class_uchardata_base = class_uchardata;   /* Save the start */  
 #endif  
   
4938      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
4939      means that an initial ] is taken as a data character. At the start of the      means that an initial ] is taken as a data character. At the start of the
4940      loop, c contains the first byte of the character. */      loop, c contains the first byte of the character. */
# Line 4428  for (;; ptr++) Line 4957  for (;; ptr++)
4957        (which is on the stack). We have to remember that there was XCLASS data,        (which is on the stack). We have to remember that there was XCLASS data,
4958        however. */        however. */
4959    
4960          if (class_uchardata > class_uchardata_base) xclass = TRUE;
4961    
4962        if (lengthptr != NULL && class_uchardata > class_uchardata_base)        if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4963          {          {
4964          xclass = TRUE;          *lengthptr += (int)(class_uchardata - class_uchardata_base);
         *lengthptr += class_uchardata - class_uchardata_base;  
4965          class_uchardata = class_uchardata_base;          class_uchardata = class_uchardata_base;
4966          }          }
4967  #endif  #endif
# Line 4493  for (;; ptr++) Line 5023  for (;; ptr++)
5023            posix_class = 0;            posix_class = 0;
5024    
5025          /* When PCRE_UCP is set, some of the POSIX classes are converted to          /* When PCRE_UCP is set, some of the POSIX classes are converted to
5026          different escape sequences that use Unicode properties. */          different escape sequences that use Unicode properties \p or \P. Others
5027            that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5028            directly. */
5029    
5030  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
5031          if ((options & PCRE_UCP) != 0)          if ((options & PCRE_UCP) != 0)
5032            {            {
5033              unsigned int ptype = 0;
5034            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5035    
5036              /* The posix_substitutes table specifies which POSIX classes can be
5037              converted to \p or \P items. */
5038    
5039            if (posix_substitutes[pc] != NULL)            if (posix_substitutes[pc] != NULL)
5040              {              {
5041              nestptr = tempptr + 1;              nestptr = tempptr + 1;
5042              ptr = posix_substitutes[pc] - 1;              ptr = posix_substitutes[pc] - 1;
5043              continue;              continue;
5044              }              }
5045    
5046              /* There are three other classes that generate special property calls
5047              that are recognized only in an XCLASS. */
5048    
5049              else switch(posix_class)
5050                {
5051                case PC_GRAPH:
5052                ptype = PT_PXGRAPH;
5053                /* Fall through */
5054                case PC_PRINT:
5055                if (ptype == 0) ptype = PT_PXPRINT;
5056                /* Fall through */
5057                case PC_PUNCT:
5058                if (ptype == 0) ptype = PT_PXPUNCT;
5059                *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5060                *class_uchardata++ = ptype;
5061                *class_uchardata++ = 0;
5062                xclass_has_prop = TRUE;
5063                ptr = tempptr + 1;
5064                continue;
5065    
5066                /* For the other POSIX classes (ascii, xdigit) we are going to fall
5067                through to the non-UCP case and build a bit map for characters with
5068                code points less than 256. If we are in a negated POSIX class
5069                within a non-negated overall class, characters with code points
5070                greater than 255 must all match. In the special case where we have
5071                not yet generated any xclass data, and this is the final item in
5072                the overall class, we need do nothing: later on, the opcode
5073                OP_NCLASS will be used to indicate that characters greater than 255
5074                are acceptable. If we have already seen an xclass item or one may
5075                follow (we have to assume that it might if this is not the end of
5076                the class), explicitly match all wide codepoints. */
5077    
5078                default:
5079                if (!negate_class && local_negate &&
5080                    (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5081                  {
5082                  *class_uchardata++ = XCL_RANGE;
5083                  class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5084                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5085                  }
5086                break;
5087                }
5088            }            }
5089  #endif  #endif
5090          /* In the non-UCP case, we build the bit map for the POSIX class in a          /* In the non-UCP case, or when UCP makes no difference, we build the
5091          chunk of local store because we may be adding and subtracting from it,          bit map for the POSIX class in a chunk of local store because we may be
5092          and we don't want to subtract bits that may be in the main map already.          adding and subtracting from it, and we don't want to subtract bits that
5093          At the end we or the result into the bit map that is being built. */          may be in the main map already. At the end we or the result into the
5094            bit map that is being built. */
5095    
5096          posix_class *= 3;          posix_class *= 3;
5097    
# Line 4631  for (;; ptr++) Line 5212  for (;; ptr++)
5212              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5213              previously set by something earlier in the character class.              previously set by something earlier in the character class.
5214              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5215              we could just adjust the appropriate bit. From PCRE 8.34 we no              we could just adjust the appropriate bit. From PCRE 8.34 we no
5216              longer treat \s and \S specially. */              longer treat \s and \S specially. */
5217    
5218              case ESC_s:              case ESC_s:
# Line 4665  for (;; ptr++) Line 5246  for (;; ptr++)
5246                cd, PRIV(vspace_list));                cd, PRIV(vspace_list));
5247              continue;              continue;
5248    
 #ifdef SUPPORT_UCP  
5249              case ESC_p:              case ESC_p:
5250              case ESC_P:              case ESC_P:
5251    #ifdef SUPPORT_UCP
5252                {                {
5253                BOOL negated;                BOOL negated;
5254                unsigned int ptype = 0, pdata = 0;                unsigned int ptype = 0, pdata = 0;
# Line 4677  for (;; ptr++) Line 5258  for (;; ptr++)
5258                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
5259                *class_uchardata++ = ptype;                *class_uchardata++ = ptype;
5260                *class_uchardata++ = pdata;                *class_uchardata++ = pdata;
5261                  xclass_has_prop = TRUE;
5262                class_has_8bitchar--;                /* Undo! */                class_has_8bitchar--;                /* Undo! */
5263                continue;                continue;
5264                }                }
5265    #else
5266                *errorcodeptr = ERR45;
5267                goto FAILED;
5268  #endif  #endif
5269              /* Unrecognized escapes are faulted if PCRE is running in its              /* Unrecognized escapes are faulted if PCRE is running in its
5270              strict mode. By default, for compatibility with Perl, they are              strict mode. By default, for compatibility with Perl, they are
# Line 4762  for (;; ptr++) Line 5347  for (;; ptr++)
5347  #endif  #endif
5348          d = *ptr;  /* Not UTF-8 mode */          d = *ptr;  /* Not UTF-8 mode */
5349    
5350          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape
5351          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          sequence, but not any of the other escapes. Perl treats a hyphen as a
5352          in such circumstances. */          literal in such circumstances. However, in Perl's warning mode, a
5353            warning is given, so PCRE now faults it as it is almost certainly a
5354            mistake on the user's part. */
5355    
5356          if (!inescq && d == CHAR_BACKSLASH)          if (!inescq)
5357            {            {
5358            int descape;            if (d == CHAR_BACKSLASH)
5359            descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);              {
5360            if (*errorcodeptr != 0) goto FAILED;              int descape;
5361                descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5362                if (*errorcodeptr != 0) goto FAILED;
5363    
5364            /* \b is backspace; any other special means the '-' was literal. */              /* 0 means a character was put into d; \b is backspace; any other
5365                special causes an error. */
5366    
5367            if (descape != 0)              if (descape != 0)
             {  
             if (descape == ESC_b) d = CHAR_BS; else  
5368                {                {
5369                ptr = oldptr;                if (descape == ESC_b) d = CHAR_BS; else
5370                goto CLASS_SINGLE_CHARACTER;  /* A few lines below */                  {
5371                    *errorcodeptr = ERR83;
5372                    goto FAILED;
5373                    }
5374                }                }
5375              }              }
5376    
5377              /* A hyphen followed by a POSIX class is treated in the same way. */
5378    
5379              else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5380                       (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5381                        ptr[1] == CHAR_EQUALS_SIGN) &&
5382                       check_posix_syntax(ptr, &tempptr))
5383                {
5384                *errorcodeptr = ERR83;
5385                goto FAILED;
5386                }
5387            }            }
5388    
5389          /* Check that the two values are in the correct order. Optimize          /* Check that the two values are in the correct order. Optimize
# Line 4819  for (;; ptr++) Line 5421  for (;; ptr++)
5421        CLASS_SINGLE_CHARACTER:        CLASS_SINGLE_CHARACTER:
5422        if (class_one_char < 2) class_one_char++;        if (class_one_char < 2) class_one_char++;
5423    
5424        /* If class_one_char is 1, we have the first single character in the        /* If xclass_has_prop is false and class_one_char is 1, we have the first
5425        class, and there have been no prior ranges, or XCLASS items generated by        single character in the class, and there have been no prior ranges, or
5426        escapes. If this is the final character in the class, we can optimize by        XCLASS items generated by escapes. If this is the final character in the
5427        turning the item into a 1-character OP_CHAR[I] if it's positive, or        class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5428        OP_NOT[I] if it's negative. In the positive case, it can cause firstchar        if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5429        to be set. Otherwise, there can be no first char if this item is first,        can cause firstchar to be set. Otherwise, there can be no first char if
5430        whatever repeat count may follow. In the case of reqchar, save the        this item is first, whatever repeat count may follow. In the case of
5431        previous value for reinstating. */        reqchar, save the previous value for reinstating. */
5432    
5433        if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)        if (!inescq &&
5434    #ifdef SUPPORT_UCP
5435              !xclass_has_prop &&
5436    #endif
5437              class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5438          {          {
5439          ptr++;          ptr++;
5440          zeroreqchar = reqchar;          zeroreqchar = reqchar;
# Line 4944  for (;; ptr++) Line 5550  for (;; ptr++)
5550      actual compiled code. */      actual compiled code. */
5551    
5552  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
5553      if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))      if (xclass && (xclass_has_prop || !should_flip_negation ||
5554            (options & PCRE_UCP) != 0))
5555  #elif !defined COMPILE_PCRE8  #elif !defined COMPILE_PCRE8
5556      if (xclass && !should_flip_negation)      if (xclass && (xclass_has_prop || !should_flip_negation))
5557  #endif  #endif
5558  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5559        {        {
# Line 4954  for (;; ptr++) Line 5561  for (;; ptr++)
5561        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
5562        code += LINK_SIZE;        code += LINK_SIZE;
5563        *code = negate_class? XCL_NOT:0;        *code = negate_class? XCL_NOT:0;
5564          if (xclass_has_prop) *code |= XCL_HASPROP;
5565    
5566        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
5567        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
# Line 4963  for (;; ptr++) Line 5571  for (;; ptr++)
5571          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
5572          memmove(code + (32 / sizeof(pcre_uchar)), code,          memmove(code + (32 / sizeof(pcre_uchar)), code,
5573            IN_UCHARS(class_uchardata - code));            IN_UCHARS(class_uchardata - code));
5574            if (negate_class && !xclass_has_prop)
5575              for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5576          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
5577          code = class_uchardata + (32 / sizeof(pcre_uchar));          code = class_uchardata + (32 / sizeof(pcre_uchar));
5578          }          }
# Line 4973  for (;; ptr++) Line 5583  for (;; ptr++)
5583        PUT(previous, 1, (int)(code - previous));        PUT(previous, 1, (int)(code - previous));
5584        break;   /* End of class handling */        break;   /* End of class handling */
5585        }        }
5586    
5587        /* Even though any XCLASS list is now discarded, we must allow for
5588        its memory. */
5589    
5590        if (lengthptr != NULL)
5591          *lengthptr += (int)(class_uchardata - class_uchardata_base);
5592  #endif  #endif
5593    
5594      /* If there are no characters > 255, or they are all to be included or      /* If there are no characters > 255, or they are all to be included or
# Line 5045  for (;; ptr++) Line 5661  for (;; ptr++)
5661    
5662      tempcode = previous;      tempcode = previous;
5663    
5664        /* Before checking for a possessive quantifier, we must skip over
5665        whitespace and comments in extended mode because Perl allows whitespace at
5666        this point. */
5667    
5668        if ((options & PCRE_EXTENDED) != 0)
5669          {
5670          const pcre_uchar *p = ptr + 1;
5671          for (;;)
5672            {
5673            while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5674            if (*p != CHAR_NUMBER_SIGN) break;
5675            p++;
5676            while (*p != CHAR_NULL)
5677              {
5678              if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5679                {                        /* IS_NEWLINE sets cd->nllen. */
5680                p += cd->nllen;
5681                break;
5682                }
5683              p++;
5684    #ifdef SUPPORT_UTF
5685              if (utf) FORWARDCHAR(p);
5686    #endif
5687              }           /* Loop for comment characters */
5688            }             /* Loop for multiple comments */
5689          ptr = p - 1;    /* Character before the next significant one. */
5690          }
5691    
5692      /* If the next character is '+', we have a possessive quantifier. This      /* If the next character is '+', we have a possessive quantifier. This
5693      implies greediness, whatever the setting of the PCRE_UNGREEDY option.      implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5694      If the next character is '?' this is a minimizing repeat, by default,      If the next character is '?' this is a minimizing repeat, by default,
# Line 5337  for (;; ptr++) Line 5981  for (;; ptr++)
5981      opcodes such as BRA and CBRA, as this is the place where they get converted      opcodes such as BRA and CBRA, as this is the place where they get converted
5982      into the more special varieties such as BRAPOS and SBRA. A test for >=      into the more special varieties such as BRAPOS and SBRA. A test for >=
5983      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5984      ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow      ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5985      repetition of assertions, but now it does, for Perl compatibility. */      Originally, PCRE did not allow repetition of assertions, but now it does,
5986        for Perl compatibility. */
5987    
5988      else if (*previous >= OP_ASSERT && *previous <= OP_COND)      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5989        {        {
5990        register int i;        register int i;
5991        int len = (int)(code - previous);        int len = (int)(code - previous);
5992          size_t base_hwm_offset = item_hwm_offset;
5993        pcre_uchar *bralink = NULL;        pcre_uchar *bralink = NULL;
5994        pcre_uchar *brazeroptr = NULL;        pcre_uchar *brazeroptr = NULL;
5995    
# Line 5356  for (;; ptr++) Line 6002  for (;; ptr++)
6002        /* There is no sense in actually repeating assertions. The only potential        /* There is no sense in actually repeating assertions. The only potential
6003        use of repetition is in cases when the assertion is optional. Therefore,        use of repetition is in cases when the assertion is optional. Therefore,
6004        if the minimum is greater than zero, just ignore the repeat. If the        if the minimum is greater than zero, just ignore the repeat. If the
6005        maximum is not not zero or one, set it to 1. */        maximum is not zero or one, set it to 1. */
6006    
6007        if (*previous < OP_ONCE)    /* Assertion */        if (*previous < OP_ONCE)    /* Assertion */
6008          {          {
# Line 5398  for (;; ptr++) Line 6044  for (;; ptr++)
6044          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
6045            {            {
6046            *code = OP_END;            *code = OP_END;
6047            adjust_recurse(previous, 1, utf, cd, save_hwm);            adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6048            memmove(previous + 1, previous, IN_UCHARS(len));            memmove(previous + 1, previous, IN_UCHARS(len));
6049            code++;            code++;
6050            if (repeat_max == 0)            if (repeat_max == 0)
# Line 5422  for (;; ptr++) Line 6068  for (;; ptr++)
6068            {            {
6069            int offset;            int offset;
6070            *code = OP_END;            *code = OP_END;
6071            adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);            adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6072            memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));            memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6073            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
6074            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 5485  for (;; ptr++) Line 6131  for (;; ptr++)
6131              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
6132                {                {
6133                pcre_uchar *hc;                pcre_uchar *hc;
6134                pcre_uchar *this_hwm = cd->hwm;                size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6135                memcpy(code, previous, IN_UCHARS(len));                memcpy(code, previous, IN_UCHARS(len));
6136    
6137                while (cd->hwm > cd->start_workspace + cd->workspace_size -                while (cd->hwm > cd->start_workspace + cd->workspace_size -
6138                       WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))                       WORK_SIZE_SAFETY_MARGIN -
6139                         (this_hwm_offset - base_hwm_offset))
6140                  {                  {
                 int save_offset = save_hwm - cd->start_workspace;  
                 int this_offset = this_hwm - cd->start_workspace;  
6141                  *errorcodeptr = expand_workspace(cd);                  *errorcodeptr = expand_workspace(cd);
6142                  if (*errorcodeptr != 0) goto FAILED;                  if (*errorcodeptr != 0) goto FAILED;
                 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;  
                 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;  
6143                  }                  }
6144    
6145                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6146                       hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6147                       hc += LINK_SIZE)
6148                  {                  {
6149                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
6150                  cd->hwm += LINK_SIZE;                  cd->hwm += LINK_SIZE;
6151                  }                  }
6152                save_hwm = this_hwm;                base_hwm_offset = this_hwm_offset;
6153                code += len;                code += len;
6154                }                }
6155              }              }
# Line 5549  for (;; ptr++) Line 6194  for (;; ptr++)
6194          else for (i = repeat_max - 1; i >= 0; i--)          else for (i = repeat_max - 1; i >= 0; i--)
6195            {            {
6196            pcre_uchar *hc;            pcre_uchar *hc;
6197            pcre_uchar *this_hwm = cd->hwm;            size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6198    
6199            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
6200    
# Line 5571  for (;; ptr++) Line 6216  for (;; ptr++)
6216            copying them. */            copying them. */
6217    
6218            while (cd->hwm > cd->start_workspace + cd->workspace_size -            while (cd->hwm > cd->start_workspace + cd->workspace_size -
6219                   WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))                   WORK_SIZE_SAFETY_MARGIN -
6220                     (this_hwm_offset - base_hwm_offset))
6221              {              {
             int save_offset = save_hwm - cd->start_workspace;  
             int this_offset = this_hwm - cd->start_workspace;  
6222              *errorcodeptr = expand_workspace(cd);              *errorcodeptr = expand_workspace(cd);
6223              if (*errorcodeptr != 0) goto FAILED;              if (*errorcodeptr != 0) goto FAILED;
             save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;  
             this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;  
6224              }              }
6225    
6226            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6227                   hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6228                   hc += LINK_SIZE)
6229              {              {
6230              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6231              cd->hwm += LINK_SIZE;              cd->hwm += LINK_SIZE;
6232              }              }
6233            save_hwm = this_hwm;            base_hwm_offset = this_hwm_offset;
6234            code += len;            code += len;
6235            }            }
6236    
# Line 5669  for (;; ptr++) Line 6313  for (;; ptr++)
6313              while (*scode == OP_ALT);              while (*scode == OP_ALT);
6314              }              }
6315    
6316              /* A conditional group with only one branch has an implicit empty
6317              alternative branch. */
6318    
6319              if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6320                *bracode = OP_SCOND;
6321    
6322            /* Handle possessive quantifiers. */            /* Handle possessive quantifiers. */
6323    
6324            if (possessive_quantifier)            if (possessive_quantifier)
# Line 5682  for (;; ptr++) Line 6332  for (;; ptr++)
6332                {                {
6333                int nlen = (int)(code - bracode);                int nlen = (int)(code - bracode);
6334                *code = OP_END;                *code = OP_END;
6335                adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);                adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6336                memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));                memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6337                code += 1 + LINK_SIZE;                code += 1 + LINK_SIZE;
6338                nlen += 1 + LINK_SIZE;                nlen += 1 + LINK_SIZE;
6339                *bracode = OP_BRAPOS;                *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6340                *code++ = OP_KETRPOS;                *code++ = OP_KETRPOS;
6341                PUTINC(code, 0, nlen);                PUTINC(code, 0, nlen);
6342                PUT(bracode, 1, nlen);                PUT(bracode, 1, nlen);
# Line 5729  for (;; ptr++) Line 6379  for (;; ptr++)
6379        goto FAILED;        goto FAILED;
6380        }        }
6381    
6382      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', possessive_quantifier is
6383      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,      TRUE. For some opcodes, there are special alternative opcodes for this
6384      there are special alternative opcodes for this case. For anything else, we      case. For anything else, we wrap the entire repeated item inside OP_ONCE
6385      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'      brackets. Logically, the '+' notation is just syntactic sugar, taken from
6386      notation is just syntactic sugar, taken from Sun's Java package, but the      Sun's Java package, but the special opcodes can optimize it.
     special opcodes can optimize it.  
6387    
6388      Some (but not all) possessively repeated subpatterns have already been      Some (but not all) possessively repeated subpatterns have already been
6389      completely handled in the code just above. For them, possessive_quantifier      completely handled in the code just above. For them, possessive_quantifier
6390      is always FALSE at this stage.      is always FALSE at this stage. Note that the repeated item starts at
6391        tempcode, not at previous, which might be the first part of a string whose
6392      Note that the repeated item starts at tempcode, not at previous, which      (former) last char we repeated. */
     might be the first part of a string whose (former) last char we repeated.  
   
     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But  
     an 'upto' may follow. We skip over an 'exact' item, and then test the  
     length of what remains before proceeding. */  
6393    
6394      if (possessive_quantifier)      if (possessive_quantifier)
6395        {        {
6396        int len;        int len;
6397    
6398        if (*tempcode == OP_TYPEEXACT)        /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6399          However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6400          {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6401          remains is greater than zero, there's a further opcode that can be
6402          handled. If not, do nothing, leaving the EXACT alone. */
6403    
6404          switch(*tempcode)
6405            {
6406            case OP_TYPEEXACT:
6407          tempcode += PRIV(OP_lengths)[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
6408            ((tempcode[1 + IMM2_SIZE] == OP_PROP            ((tempcode[1 + IMM2_SIZE] == OP_PROP
6409            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6410            break;
6411    
6412        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)          /* CHAR opcodes are used for exacts whose count is 1. */
6413          {  
6414            case OP_CHAR:
6415            case OP_CHARI:
6416            case OP_NOT:
6417            case OP_NOTI:
6418            case OP_EXACT:
6419            case OP_EXACTI:
6420            case OP_NOTEXACT:
6421            case OP_NOTEXACTI:
6422          tempcode += PRIV(OP_lengths)[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
6423  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6424          if (utf && HAS_EXTRALEN(tempcode[-1]))          if (utf && HAS_EXTRALEN(tempcode[-1]))
6425            tempcode += GET_EXTRALEN(tempcode[-1]);            tempcode += GET_EXTRALEN(tempcode[-1]);
6426  #endif  #endif
6427            break;
6428    
6429            /* For the class opcodes, the repeat operator appears at the end;
6430            adjust tempcode to point to it. */
6431    
6432            case OP_CLASS:
6433            case OP_NCLASS:
6434            tempcode += 1 + 32/sizeof(pcre_uchar);
6435            break;
6436    
6437    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6438            case OP_XCLASS:
6439            tempcode += GET(tempcode, 1);
6440            break;
6441    #endif
6442          }          }
6443    
6444          /* If tempcode is equal to code (which points to the end of the repeated
6445          item), it means we have skipped an EXACT item but there is no following
6446          QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6447          all other cases, tempcode will be pointing to the repeat opcode, and will
6448          be less than code, so the value of len will be greater than 0. */
6449    
6450        len = (int)(code - tempcode);        len = (int)(code - tempcode);
6451          if (len > 0)
6452            {
6453            unsigned int repcode = *tempcode;
6454    
6455            /* There is a table for possessifying opcodes, all of which are less
6456            than OP_CALLOUT. A zero entry means there is no possessified version.
6457            */
6458    
6459            if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6460              *tempcode = opcode_possessify[repcode];
6461    
6462            /* For an opcode without a special possessified version, wrap the item in
6463            ONCE brackets. Because we are moving code along, we must ensure that any
6464            pending recursive references are updated. */
6465    
6466            else
6467              {
6468              *code = OP_END;
6469              adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6470              memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6471              code += 1 + LINK_SIZE;
6472              len += 1 + LINK_SIZE;
6473              tempcode[0] = OP_ONCE;
6474              *code++ = OP_KET;
6475              PUTINC(code, 0, len);
6476              PUT(tempcode, 1, len);
6477              }
6478            }
6479    
6480    #ifdef NEVER
6481        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
6482          {          {
6483          case OP_STAR:  *tempcode = OP_POSSTAR; break;          case OP_STAR:  *tempcode = OP_POSSTAR; break;
# Line 5793  for (;; ptr++) Line 6505  for (;; ptr++)
6505          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6506          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6507    
6508            case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6509            case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6510            case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6511            case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6512    
6513          /* Because we are moving code along, we must ensure that any          /* Because we are moving code along, we must ensure that any
6514          pending recursive references are updated. */          pending recursive references are updated. */
6515    
6516          default:          default:
6517          *code = OP_END;          *code = OP_END;
6518          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6519          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6520          code += 1 + LINK_SIZE;          code += 1 + LINK_SIZE;
6521          len += 1 + LINK_SIZE;          len += 1 + LINK_SIZE;
# Line 5808  for (;; ptr++) Line 6525  for (;; ptr++)
6525          PUT(tempcode, 1, len);          PUT(tempcode, 1, len);
6526          break;          break;
6527          }          }
6528    #endif
6529        }        }
6530    
6531      /* In all case we no longer have a previous item. We also set the      /* In all case we no longer have a previous item. We also set the
# Line 5826  for (;; ptr++) Line 6544  for (;; ptr++)
6544      parenthesis forms.  */      parenthesis forms.  */
6545    
6546      case CHAR_LEFT_PARENTHESIS:      case CHAR_LEFT_PARENTHESIS:
6547      newoptions = options;      ptr++;
     skipbytes = 0;  
     bravalue = OP_CBRA;  
     save_hwm = cd->hwm;  
     reset_bracount = FALSE;  
6548    
6549      /* First deal with various "verbs" that can be introduced by '*'. */      /* Now deal with various "verbs" that can be introduced by '*'. */
6550    
     ptr++;  
6551      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6552           || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))           || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6553        {        {
# Line 5923  for (;; ptr++) Line 6636  for (;; ptr++)
6636                goto FAILED;                goto FAILED;
6637                }                }
6638              setverb = *code++ = verbs[i].op_arg;              setverb = *code++ = verbs[i].op_arg;
6639              *code++ = arglen;              if (lengthptr != NULL)    /* In pass 1 just add in the length */
6640              memcpy(code, arg, IN_UCHARS(arglen));                {                       /* to avoid potential workspace */
6641              code += arglen;                *lengthptr += arglen;   /* overflow. */
6642                  *code++ = 0;
6643                  }
6644                else
6645                  {
6646                  *code++ = arglen;
6647                  memcpy(code, arg, IN_UCHARS(arglen));
6648                  code += arglen;
6649                  }
6650              *code++ = 0;              *code++ = 0;
6651              }              }
6652    
# Line 5955  for (;; ptr++) Line 6676  for (;; ptr++)
6676        goto FAILED;        goto FAILED;
6677        }        }
6678    
6679        /* Initialize for "real" parentheses */
6680    
6681        newoptions = options;
6682        skipbytes = 0;
6683        bravalue = OP_CBRA;
6684        item_hwm_offset = cd->hwm - cd->start_workspace;
6685        reset_bracount = FALSE;
6686    
6687      /* Deal with the extended parentheses; all are introduced by '?', and the      /* Deal with the extended parentheses; all are introduced by '?', and the
6688      appearance of any of them means that this is not a capturing group. */      appearance of any of them means that this is not a capturing group. */
6689    
6690      else if (*ptr == CHAR_QUESTION_MARK)      if (*ptr == CHAR_QUESTION_MARK)
6691        {        {
6692        int i, set, unset, namelen;        int i, set, unset, namelen;
6693        int *optset;        int *optset;
# Line 5967  for (;; ptr++) Line 6696  for (;; ptr++)
6696    
6697        switch (*(++ptr))        switch (*(++ptr))
6698          {          {
         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */  
         ptr++;  
         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
         if (*ptr == CHAR_NULL)  
           {  
           *errorcodeptr = ERR18;  
           goto FAILED;  
           }  
         continue;  
   
   
6699          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
6700          case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */          case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6701          reset_bracount = TRUE;          reset_bracount = TRUE;
6702            cd->dupgroups = TRUE;     /* Record (?| encountered */
6703          /* Fall through */          /* Fall through */
6704    
6705          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
# Line 5996  for (;; ptr++) Line 6715  for (;; ptr++)
6715          tempptr = ptr;          tempptr = ptr;
6716    
6717          /* A condition can be an assertion, a number (referring to a numbered          /* A condition can be an assertion, a number (referring to a numbered
6718          group), a name (referring to a named group), or 'R', referring to          group's having been set), a name (referring to a named group), or 'R',
6719          recursion. R<digits> and R&name are also permitted for recursion tests.          referring to recursion. R<digits> and R&name are also permitted for
6720            recursion tests.
6721          There are several syntaxes for testing a named group: (?(name)) is used  
6722          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).          There are ways of testing a named group: (?(name)) is used by Python;
6723            Perl 5.10 onwards uses (?(<name>) or (?('name')).
6724          There are two unfortunate ambiguities, caused by history. (a) 'R' can  
6725          be the recursive thing or the name 'R' (and similarly for 'R' followed          There is one unfortunate ambiguity, caused by history. 'R' can be the
6726          by digits), and (b) a number could be a name that consists of digits.          recursive thing or the name 'R' (and similarly for 'R' followed by
6727          In both cases, we look for a name first; if not found, we try the other          digits). We look for a name first; if not found, we try the other case.
         cases.  
6728    
6729          For compatibility with auto-callouts, we allow a callout to be          For compatibility with auto-callouts, we allow a callout to be
6730          specified before a condition that is an assertion. First, check for the          specified before a condition that is an assertion. First, check for the
# Line 6027  for (;; ptr++) Line 6745  for (;; ptr++)
6745          if (tempptr[1] == CHAR_QUESTION_MARK &&          if (tempptr[1] == CHAR_QUESTION_MARK &&
6746                (tempptr[2] == CHAR_EQUALS_SIGN ||                (tempptr[2] == CHAR_EQUALS_SIGN ||
6747                 tempptr[2] == CHAR_EXCLAMATION_MARK ||                 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6748                 tempptr[2] == CHAR_LESS_THAN_SIGN))                   (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6749                       (tempptr[3] == CHAR_EQUALS_SIGN ||
6750                        tempptr[3] == CHAR_EXCLAMATION_MARK))))
6751              {
6752              cd->iscondassert = TRUE;
6753            break;            break;
6754              }
6755    
6756          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6757          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6758    
6759          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
6760          skipbytes = 1+IMM2_SIZE;          skipbytes = 1+IMM2_SIZE;
6761          refsign = -1;          refsign = -1;     /* => not a number */
6762            namelen = -1;     /* => not a name; must set to avoid warning */
6763            name = NULL;      /* Always set to avoid warning */
6764            recno = 0;        /* Always set to avoid warning */
6765    
6766          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
6767    
6768          if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)          ptr++;
6769            if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6770            {            {
6771            terminator = -1;            terminator = -1;
6772            ptr += 2;            ptr += 2;
# Line 6047  for (;; ptr++) Line 6774  for (;; ptr++)
6774            }            }
6775    
6776          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6777          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6778            syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6779    
6780          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (*ptr == CHAR_LESS_THAN_SIGN)
6781            {            {
6782            terminator = CHAR_GREATER_THAN_SIGN;            terminator = CHAR_GREATER_THAN_SIGN;
6783            ptr++;            ptr++;
6784            }            }
6785          else if (ptr[1] == CHAR_APOSTROPHE)          else if (*ptr == CHAR_APOSTROPHE)
6786            {            {
6787            terminator = CHAR_APOSTROPHE;            terminator = CHAR_APOSTROPHE;
6788            ptr++;            ptr++;
# Line 6062  for (;; ptr++) Line 6790  for (;; ptr++)
6790          else          else
6791            {            {
6792            terminator = CHAR_NULL;            terminator = CHAR_NULL;
6793            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6794                else if (IS_DIGIT(*ptr)) refsign = 0;
6795            }            }
6796    
6797          /* We now expect to read a name; any thing else is an error */          /* Handle a number */
6798    
6799          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (refsign >= 0)
6800            {            {
6801            ptr += 1;  /* To get the right offset */            while (IS_DIGIT(*ptr))
6802            *errorcodeptr = ERR28;              {
6803            goto FAILED;              if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6804                  {
6805                  while (IS_DIGIT(*ptr)) ptr++;
6806                  *errorcodeptr = ERR61;
6807                  goto FAILED;
6808                  }
6809                recno = recno * 10 + (int)(*ptr - CHAR_0);
6810                ptr++;
6811                }
6812            }            }
6813    
6814          /* Read the name, but also get it as a number if it's all digits */          /* Otherwise we expect to read a name; anything else is an error. When
6815            a name is one of a number of duplicates, a different opcode is used and
6816            it needs more memory. Unfortunately we cannot tell whether a name is a
6817            duplicate in the first pass, so we have to allow for more memory. */
6818    
6819          recno = 0;          else
         name = ++ptr;  
         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)  
6820            {            {
6821            if (recno >= 0)            if (IS_DIGIT(*ptr))
6822              recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;              {
6823            ptr++;              *errorcodeptr = ERR84;
6824                goto FAILED;
6825                }
6826              if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6827                {
6828                *errorcodeptr = ERR28;   /* Assertion expected */
6829                goto FAILED;
6830                }
6831              name = ptr++;
6832              while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6833                {
6834                ptr++;
6835                }
6836              namelen = (int)(ptr - name);
6837              if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6838            }            }
6839          namelen = (int)(ptr - name);  
6840            /* Check the terminator */
6841    
6842          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6843              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6844            {            {
6845            ptr--;      /* Error offset */            ptr--;                  /* Error offset */
6846            *errorcodeptr = ERR26;            *errorcodeptr = ERR26;  /* Malformed number or name */
6847            goto FAILED;            goto FAILED;
6848            }            }
6849    
# Line 6099  for (;; ptr++) Line 6852  for (;; ptr++)
6852          if (lengthptr != NULL) break;          if (lengthptr != NULL) break;
6853    
6854          /* In the real compile we do the work of looking for the actual          /* In the real compile we do the work of looking for the actual
6855          reference. If the string started with "+" or "-" we require the rest to          reference. If refsign is not negative, it means we have a number in
6856          be digits, in which case recno will be set. */          recno. */
6857    
6858          if (refsign > 0)          if (refsign >= 0)
6859            {            {
6860            if (recno <= 0)            if (recno <= 0)
6861              {              {
6862              *errorcodeptr = ERR58;              *errorcodeptr = ERR35;
6863              goto FAILED;              goto FAILED;
6864              }              }
6865            recno = (refsign == CHAR_MINUS)?            if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6866              cd->bracount - recno + 1 : recno +cd->bracount;              cd->bracount - recno + 1 : recno + cd->bracount;
6867            if (recno <= 0 || recno > cd->final_bracount)            if (recno <= 0 || recno > cd->final_bracount)
6868              {              {
6869              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
6870              goto FAILED;              goto FAILED;
6871              }              }
6872            PUT2(code, 2+LINK_SIZE, recno);            PUT2(code, 2+LINK_SIZE, recno);
6873              if (recno > cd->top_backref) cd->top_backref = recno;
6874            break;            break;
6875            }            }
6876    
6877          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise look for the name. */
         name. If we find a name, add one to the opcode to change OP_CREF or  
         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,  
         except they record that the reference was originally to a name. The  
         information is used to check duplicate names. */  
6878    
6879          slot = cd->name_table;          slot = cd->name_table;
6880          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 6133  for (;; ptr++) Line 6883  for (;; ptr++)
6883            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6884            }            }
6885    
6886          /* Found the named subpattern */          /* Found the named subpattern. If the name is duplicated, add one to
6887            the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6888            appropriate data values. Otherwise, just insert the unique subpattern
6889            number. */
6890    
6891          if (i < cd->names_found)          if (i < cd->names_found)
6892            {            {
6893            recno = GET2(slot, 0);            int offset = i++;
6894            PUT2(code, 2+LINK_SIZE, recno);            int count = 1;
6895            code[1+LINK_SIZE]++;            recno = GET2(slot, 0);   /* Number from first found */
6896              if (recno > cd->top_backref) cd->top_backref = recno;
6897              for (; i < cd->names_found; i++)
6898                {
6899                slot += cd->name_entry_size;
6900                if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6901                  (slot+IMM2_SIZE)[namelen] != 0) break;
6902                count++;
6903                }
6904    
6905              if (count > 1)
6906                {
6907                PUT2(code, 2+LINK_SIZE, offset);
6908                PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6909                skipbytes += IMM2_SIZE;
6910                code[1+LINK_SIZE]++;
6911                }
6912              else  /* Not a duplicated name */
6913                {
6914                PUT2(code, 2+LINK_SIZE, recno);
6915                }
6916            }            }
6917    
6918          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
6919          after the opening parenthesis [e.g. (?(abc)...] and in this case there          after the opening parenthesis [e.g. (?(abc)...] and in this case there
6920          are some further alternatives to try. For the cases where terminator !=          are some further alternatives to try. For the cases where terminator !=
6921          0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have          CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6922          now checked all the possibilities, so give an error. */          we have now checked all the possibilities, so give an error. */
6923    
6924          else if (terminator != CHAR_NULL)          else if (terminator != CHAR_NULL)
6925            {            {
# Line 6167  for (;; ptr++) Line 6940  for (;; ptr++)
6940                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
6941                goto FAILED;                goto FAILED;
6942                }                }
6943                if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
6944                  {
6945                  *errorcodeptr = ERR61;
6946                  goto FAILED;
6947                  }
6948              recno = recno * 10 + name[i] - CHAR_0;              recno = recno * 10 + name[i] - CHAR_0;
6949              }              }
6950            if (recno == 0) recno = RREF_ANY;            if (recno == 0) recno = RREF_ANY;
# Line 6183  for (;; ptr++) Line 6961  for (;; ptr++)
6961            skipbytes = 1;            skipbytes = 1;
6962            }            }
6963    
6964          /* Check for the "name" actually being a subpattern number. We are          /* Reference to an unidentified subpattern. */
         in the second pass here, so final_bracount is set. */  
   
         else if (recno > 0 && recno <= cd->final_bracount)  
           {  
           PUT2(code, 2+LINK_SIZE, recno);  
           }  
   
         /* Either an unidentified subpattern, or a reference to (?(0) */  
6965    
6966          else          else
6967            {            {
6968            *errorcodeptr = (recno == 0)? ERR35: ERR15;            *errorcodeptr = ERR15;
6969            goto FAILED;            goto FAILED;
6970            }            }
6971          break;          break;
# Line 6208  for (;; ptr++) Line 6978  for (;; ptr++)
6978          ptr++;          ptr++;
6979          break;          break;
6980    
6981            /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6982            thing to do, but Perl allows all assertions to be quantified, and when
6983            they contain capturing parentheses there may be a potential use for
6984            this feature. Not that that applies to a quantified (?!) but we allow
6985            it for uniformity. */
6986    
6987          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
6988          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6989          ptr++;          ptr++;
6990          if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */          if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6991                 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6992                (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6993            {            {
6994            *code++ = OP_FAIL;            *code++ = OP_FAIL;
6995            previous = NULL;            previous = NULL;
# Line 6308  for (;; ptr++) Line 7085  for (;; ptr++)
7085          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7086            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7087          name = ++ptr;          name = ++ptr;
7088            if (IS_DIGIT(*ptr))
7089              {
7090              *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7091              goto FAILED;
7092              }
7093          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7094          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
7095    
# Line 6422  for (;; ptr++) Line 7203  for (;; ptr++)
7203    
7204          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
7205          name = ++ptr;          name = ++ptr;
7206            if (IS_DIGIT(*ptr))
7207              {
7208              *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7209              goto FAILED;
7210              }
7211          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7212          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
7213    
# Line 6435  for (;; ptr++) Line 7221  for (;; ptr++)
7221          if (lengthptr != NULL)          if (lengthptr != NULL)
7222            {            {
7223            named_group *ng;            named_group *ng;
7224              recno = 0;
7225    
7226            if (namelen == 0)            if (namelen == 0)
7227              {              {
# Line 6452  for (;; ptr++) Line 7239  for (;; ptr++)
7239              goto FAILED;              goto FAILED;
7240              }              }
7241    
           /* The name table does not exist in the first pass; instead we must  
           scan the list of names encountered so far in order to get the  
           number. If the name is not found, set the value to 0 for a forward  
           reference. */  
   
           ng = cd->named_groups;  
           for (i = 0; i < cd->names_found; i++, ng++)  
             {  
             if (namelen == ng->length &&  
                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)  
               break;  
             }  
           recno = (i < cd->names_found)? ng->number : 0;  
   
7242            /* Count named back references. */            /* Count named back references. */
7243    
7244            if (!is_recurse) cd->namedrefcount++;            if (!is_recurse) cd->namedrefcount++;
7245    
7246              /* We have to allow for a named reference to a duplicated name (this
7247              cannot be determined until the second pass). This needs an extra
7248              16-bit data item. */
7249    
7250              *lengthptr += IMM2_SIZE;
7251    
7252              /* If this is a forward reference and we are within a (?|...) group,
7253              the reference may end up as the number of a group which we are
7254              currently inside, that is, it could be a recursive reference. In the
7255              real compile this will be picked up and the reference wrapped with
7256              OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7257    
7258              /* In fact, this can happen for a non-forward reference because
7259              another group with the same number might be created later. This
7260              issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7261              only mode, we finesse the bug by allowing more memory always. */
7262    
7263              *lengthptr += 2 + 2*LINK_SIZE;
7264    
7265              /* It is even worse than that. The current reference may be to an
7266              existing named group with a different number (so apparently not
7267              recursive) but which later on is also attached to a group with the
7268              current number. This can only happen if (?| has been previously
7269              encountered. In that case, we allow yet more memory, just in case.
7270              (Again, this is fixed "properly" in PCRE2.) */
7271    
7272              if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7273    
7274              /* Otherwise, check for recursion here. The name table does not exist
7275              in the first pass; instead we must scan the list of names encountered
7276              so far in order to get the number. If the name is not found, leave
7277              the value of recno as 0 for a forward reference. */
7278    
7279              else
7280                {
7281                ng = cd->named_groups;
7282                for (i = 0; i < cd->names_found; i++, ng++)
7283                  {
7284                  if (namelen == ng->length &&
7285                      STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7286                    {
7287                    open_capitem *oc;
7288                    recno = ng->number;
7289                    if (is_recurse) break;
7290                    for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7291                      {
7292                      if (oc->number == recno)
7293                        {
7294                        oc->flag = TRUE;
7295                        break;
7296                        }
7297                      }
7298                    }
7299                  }
7300                }
7301            }            }
7302    
7303          /* In the real compile, search the name table. We check the name          /* In the real compile, search the name table. We check the name
# Line 6523  for (;; ptr++) Line 7352  for (;; ptr++)
7352              {              {
7353              if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;              if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7354              previous = code;              previous = code;
7355                item_hwm_offset = cd->hwm - cd->start_workspace;
7356              *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;              *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7357              PUT2INC(code, 0, index);              PUT2INC(code, 0, index);
7358              PUT2INC(code, 0, count);              PUT2INC(code, 0, count);
# Line 6560  for (;; ptr++) Line 7390  for (;; ptr++)
7390    
7391    
7392          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
7393          case CHAR_R:              /* Recursion */          case CHAR_R:              /* Recursion, same as (?0) */
7394          ptr++;                    /* Same as (?0)      */          recno = 0;
7395          /* Fall through */          if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7396              {
7397              *errorcodeptr = ERR29;
7398              goto FAILED;
7399              }
7400            goto HANDLE_RECURSION;
7401    
7402    
7403          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
# Line 6599  for (;; ptr++) Line 7434  for (;; ptr++)
7434    
7435            recno = 0;            recno = 0;
7436            while(IS_DIGIT(*ptr))            while(IS_DIGIT(*ptr))
7437                {
7438                if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7439                  {
7440                  while (IS_DIGIT(*ptr)) ptr++;
7441                  *errorcodeptr = ERR61;
7442                  goto FAILED;
7443                  }
7444              recno = recno * 10 + *ptr++ - CHAR_0;              recno = recno * 10 + *ptr++ - CHAR_0;
7445                }
7446    
7447            if (*ptr != (pcre_uchar)terminator)            if (*ptr != (pcre_uchar)terminator)
7448              {              {
# Line 6636  for (;; ptr++) Line 7479  for (;; ptr++)
7479            HANDLE_RECURSION:            HANDLE_RECURSION:
7480    
7481            previous = code;            previous = code;
7482              item_hwm_offset = cd->hwm - cd->start_workspace;
7483            called = cd->start_code;            called = cd->start_code;
7484    
7485            /* When we are actually compiling, find the bracket that is being            /* When we are actually compiling, find the bracket that is being
# Line 6814  for (;; ptr++) Line 7658  for (;; ptr++)
7658        skipbytes = IMM2_SIZE;        skipbytes = IMM2_SIZE;
7659        }        }
7660    
7661      /* Process nested bracketed regex. Assertions used not to be repeatable,      /* Process nested bracketed regex. First check for parentheses nested too
7662      but this was changed for Perl compatibility, so all kinds can now be      deeply. */
7663      repeated. We copy code into a non-register variable (tempcode) in order to  
7664      be able to pass its address because some compilers complain otherwise. */      if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7665          {
7666          *errorcodeptr = ERR82;
7667          goto FAILED;
7668          }
7669    
7670        /* All assertions used not to be repeatable, but this was changed for Perl
7671        compatibility. All kinds can now be repeated except for assertions that are
7672        conditions (Perl also forbids these to be repeated). We copy code into a
7673        non-register variable (tempcode) in order to be able to pass its address
7674        because some compilers complain otherwise. At the start of a conditional
7675        group whose condition is an assertion, cd->iscondassert is set. We unset it
7676        here so as to allow assertions later in the group to be quantified. */
7677    
7678        if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7679            cd->iscondassert)
7680          {
7681          previous = NULL;
7682          cd->iscondassert = FALSE;
7683          }
7684        else
7685          {
7686          previous = code;
7687          item_hwm_offset = cd->hwm - cd->start_workspace;
7688          }
7689    
     previous = code;                      /* For handling repetition */  
7690      *code = bravalue;      *code = bravalue;
7691      tempcode = code;      tempcode = code;
7692      tempreqvary = cd->req_varyopt;        /* Save value before bracket */      tempreqvary = cd->req_varyopt;        /* Save value before bracket */
# Line 6848  for (;; ptr++) Line 7715  for (;; ptr++)
7715           ))           ))
7716        goto FAILED;        goto FAILED;
7717    
7718        cd->parens_depth -= 1;
7719    
7720      /* If this was an atomic group and there are no capturing groups within it,      /* If this was an atomic group and there are no capturing groups within it,
7721      generate OP_ONCE_NC instead of OP_ONCE. */      generate OP_ONCE_NC instead of OP_ONCE. */
7722    
# Line 7062  for (;; ptr++) Line 7931  for (;; ptr++)
7931        if (escape == ESC_g)        if (escape == ESC_g)
7932          {          {
7933          const pcre_uchar *p;          const pcre_uchar *p;
7934          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */          pcre_uint32 cf;
7935    
7936            item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
7937          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7938            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7939    
7940          /* These two statements stop the compiler from warning about possibly          /* These two statements stop the compiler from warning about possibly
7941          unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In          unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7942          fact, because we actually check for a number below, the paths that          fact, because we do the check for a number below, the paths that
7943          would actually be in error are never taken. */          would actually be in error are never taken. */
7944    
7945          skipbytes = 0;          skipbytes = 0;
7946          reset_bracount = FALSE;          reset_bracount = FALSE;
7947    
7948          /* Test for a name */          /* If it's not a signed or unsigned number, treat it as a name. */
7949    
7950          if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)          cf = ptr[1];
7951            if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7952            {            {
           BOOL is_a_number = TRUE;  
           for (p = ptr + 1; *p != CHAR_NULL && *p != (pcre_uchar)terminator; p++)  
             {  
             if (!MAX_255(*p)) { is_a_number = FALSE; break; }  
             if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;  
             if ((cd->ctypes[*p] & ctype_word) == 0) break;  
             }  
           if (*p != (pcre_uchar)terminator)  
             {  
             *errorcodeptr = ERR57;  
             break;  
             }  
           if (is_a_number)  
             {  
             ptr++;  
             goto HANDLE_NUMERICAL_RECURSION;  
             }  
7953            is_recurse = TRUE;            is_recurse = TRUE;
7954            goto NAMED_REF_OR_RECURSE;            goto NAMED_REF_OR_RECURSE;
7955            }            }
7956    
7957          /* Test a signed number in angle brackets or quotes. */          /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7958            or a digit. */
7959    
7960          p = ptr + 2;          p = ptr + 2;
7961          while (IS_DIGIT(*p)) p++;          while (IS_DIGIT(*p)) p++;
7962          if (*p != (pcre_uchar)terminator)          if (*p != (pcre_uchar)terminator)
7963            {            {
7964            *errorcodeptr = ERR57;            *errorcodeptr = ERR57;
7965            break;            goto FAILED;
7966            }            }
7967          ptr++;          ptr++;
7968          goto HANDLE_NUMERICAL_RECURSION;          goto HANDLE_NUMERICAL_RECURSION;
# Line 7121  for (;; ptr++) Line 7977  for (;; ptr++)
7977            ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))            ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7978            {            {
7979            *errorcodeptr = ERR69;            *errorcodeptr = ERR69;
7980            break;            goto FAILED;
7981            }            }
7982          is_recurse = FALSE;          is_recurse = FALSE;
7983          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
# Line 7145  for (;; ptr++) Line 8001  for (;; ptr++)
8001          HANDLE_REFERENCE:          HANDLE_REFERENCE:
8002          if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;          if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
8003          previous = code;          previous = code;
8004            item_hwm_offset = cd->hwm - cd->start_workspace;
8005          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8006          PUT2INC(code, 0, recno);          PUT2INC(code, 0, recno);
8007          cd->backref_map |= (recno < 32)? (1 << recno) : 1;          cd->backref_map |= (recno < 32)? (1 << recno) : 1;
# Line 7174  for (;; ptr++) Line 8031  for (;; ptr++)
8031          if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))          if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8032            goto FAILED;            goto FAILED;
8033          previous = code;          previous = code;
8034            item_hwm_offset = cd->hwm - cd->start_workspace;
8035          *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8036          *code++ = ptype;          *code++ = ptype;
8037          *code++ = pdata;          *code++ = pdata;
# Line 7214  for (;; ptr++) Line 8072  for (;; ptr++)
8072    
8073            {            {
8074            previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;            previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8075              item_hwm_offset = cd->hwm - cd->start_workspace;
8076            *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;            *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8077            }            }
8078          }          }
# Line 7239  for (;; ptr++) Line 8098  for (;; ptr++)
8098    
8099      /* ===================================================================*/      /* ===================================================================*/
8100      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
8101      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in a UTF mode, it may be a
8102      multi-byte literal character. */      multi-unit literal character. */
8103    
8104      default:      default:
8105      NORMAL_CHAR:      NORMAL_CHAR:
# Line 7257  for (;; ptr++) Line 8116  for (;; ptr++)
8116    
8117      ONE_CHAR:      ONE_CHAR:
8118      previous = code;      previous = code;
8119        item_hwm_offset = cd->hwm - cd->start_workspace;
8120    
8121      /* For caseless UTF-8 mode when UCP support is available, check whether      /* For caseless UTF-8 mode when UCP support is available, check whether
8122      this character has more than one other case. If so, generate a special      this character has more than one other case. If so, generate a special
# Line 7404  int length; Line 8264  int length;
8264  unsigned int orig_bracount;  unsigned int orig_bracount;
8265  unsigned int max_bracount;  unsigned int max_bracount;
8266  branch_chain bc;  branch_chain bc;
8267    size_t save_hwm_offset;
8268    
8269    /* If set, call the external function that checks for stack availability. */
8270    
8271    if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8272      {
8273      *errorcodeptr= ERR85;
8274      return FALSE;
8275      }
8276    
8277    /* Miscellaneous initialization */
8278    
8279  bc.outer = bcptr;  bc.outer = bcptr;
8280  bc.current_branch = code;  bc.current_branch = code;
# Line 7411  bc.current_branch = code; Line 8282  bc.current_branch = code;
8282  firstchar = reqchar = 0;  firstchar = reqchar = 0;
8283  firstcharflags = reqcharflags = REQ_UNSET;  firstcharflags = reqcharflags = REQ_UNSET;
8284    
8285    save_hwm_offset = cd->hwm - cd->start_workspace;
8286    
8287  /* Accumulate the length for use in the pre-compile phase. Start with the  /* Accumulate the length for use in the pre-compile phase. Start with the
8288  length of the BRA and KET and any extra bytes that are required at the  length of the BRA and KET and any extra bytes that are required at the
8289  beginning. We accumulate in a local variable to save frequent testing of  beginning. We accumulate in a local variable to save frequent testing of
# Line 7552  for (;;) Line 8425  for (;;)
8425        int fixed_length;        int fixed_length;
8426        *code = OP_END;        *code = OP_END;
8427        fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,        fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8428          FALSE, cd);          FALSE, cd, NULL);
8429        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
8430        if (fixed_length == -3)        if (fixed_length == -3)
8431          {          {
# Line 7604  for (;;) Line 8477  for (;;)
8477    
8478      /* If it was a capturing subpattern, check to see if it contained any      /* If it was a capturing subpattern, check to see if it contained any
8479      recursive back references. If so, we must wrap it in atomic brackets.      recursive back references. If so, we must wrap it in atomic brackets.
8480      In any event, remove the block from the chain. */      Because we are moving code along, we must ensure that any pending recursive
8481        references are updated. In any event, remove the block from the chain. */
8482    
8483      if (capnumber > 0)      if (capnumber > 0)
8484        {        {
8485        if (cd->open_caps->flag)        if (cd->open_caps->flag)
8486          {          {
8487            *code = OP_END;
8488            adjust_recurse(start_bracket, 1 + LINK_SIZE,
8489              (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8490          memmove(start_bracket + 1 + LINK_SIZE, start_bracket,          memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8491            IN_UCHARS(code - start_bracket));            IN_UCHARS(code - start_bracket));
8492          *start_bracket = OP_ONCE;          *start_bracket = OP_ONCE;
# Line 7829  do { Line 8706  do {
8706       switch (*scode)       switch (*scode)
8707         {         {
8708         case OP_CREF:         case OP_CREF:
8709         case OP_NCREF:         case OP_DNCREF:
8710         case OP_RREF:         case OP_RREF:
8711         case OP_NRREF:         case OP_DNRREF:
8712         case OP_DEF:         case OP_DEF:
8713           case OP_FAIL:
8714         return FALSE;         return FALSE;
8715    
8716         default:     /* Assertion */         default:     /* Assertion */
# Line 8228  PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. * Line 9106  PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. *
9106      { skipatstart += 6; options |= PCRE_UTF8; continue; }      { skipatstart += 6; options |= PCRE_UTF8; continue; }
9107    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9108      { skipatstart += 6; options |= PCRE_UCP; continue; }      { skipatstart += 6; options |= PCRE_UCP; continue; }
9109      else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9110        { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9111    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9112      { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }      { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9113    
# Line 8412  cd->names_found = 0; Line 9292  cd->names_found = 0;
9292  cd->name_entry_size = 0;  cd->name_entry_size = 0;
9293  cd->name_table = NULL;  cd->name_table = NULL;
9294  cd->dupnames = FALSE;  cd->dupnames = FALSE;
9295    cd->dupgroups = FALSE;
9296  cd->namedrefcount = 0;  cd->namedrefcount = 0;
9297  cd->start_code = cworkspace;  cd->start_code = cworkspace;
9298  cd->hwm = cworkspace;  cd->hwm = cworkspace;
9299    cd->iscondassert = FALSE;
9300  cd->start_workspace = cworkspace;  cd->start_workspace = cworkspace;
9301  cd->workspace_size = COMPILE_WORK_SIZE;  cd->workspace_size = COMPILE_WORK_SIZE;
9302  cd->named_groups = named_groups;  cd->named_groups = named_groups;
# Line 8422  cd->named_group_list_size = NAMED_GROUP_ Line 9304  cd->named_group_list_size = NAMED_GROUP_
9304  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
9305  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9306  cd->req_varyopt = 0;  cd->req_varyopt = 0;
9307    cd->parens_depth = 0;
9308  cd->assert_depth = 0;  cd->assert_depth = 0;
9309  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
9310  cd->external_options = options;  cd->external_options = options;
# Line 8436  outside can help speed up starting point Line 9319  outside can help speed up starting point
9319  ptr += skipatstart;  ptr += skipatstart;
9320  code = cworkspace;  code = cworkspace;
9321  *code = OP_BRA;  *code = OP_BRA;
9322    
9323  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9324    FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,    FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9325    cd, &length);    cd, &length);
# Line 8450  if (length > MAX_PATTERN_SIZE) Line 9334  if (length > MAX_PATTERN_SIZE)
9334    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN;
9335    }    }
9336    
 /* If there are groups with duplicate names and there are also references by  
 name, we must allow for the possibility of named references to duplicated  
 groups. These require an extra data item each. */  
   
 if (cd->dupnames && cd->namedrefcount > 0)  
   length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar);  
   
9337  /* Compute the size of the data block for storing the compiled pattern. Integer  /* Compute the size of the data block for storing the compiled pattern. Integer
9338  overflow should no longer be possible because nowadays we limit the maximum  overflow should no longer be possible because nowadays we limit the maximum
9339  value of cd->names_found and cd->name_entry_size. */  value of cd->names_found and cd->name_entry_size. */
# Line 8507  field; this time it's used for rememberi Line 9384  field; this time it's used for rememberi
9384  */  */
9385    
9386  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9387    cd->parens_depth = 0;
9388  cd->assert_depth = 0;  cd->assert_depth = 0;
9389  cd->bracount = 0;  cd->bracount = 0;
9390  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
# Line 8514  cd->name_table = (pcre_uchar *)re + re-> Line 9392  cd->name_table = (pcre_uchar *)re + re->
9392  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
9393  cd->start_code = codestart;  cd->start_code = codestart;
9394  cd->hwm = (pcre_uchar *)(cd->start_workspace);  cd->hwm = (pcre_uchar *)(cd->start_workspace);
9395    cd->iscondassert = FALSE;
9396  cd->req_varyopt = 0;  cd->req_varyopt = 0;
9397  cd->had_accept = FALSE;  cd->had_accept = FALSE;
9398  cd->had_pruneorskip = FALSE;  cd->had_pruneorskip = FALSE;
# Line 8586  if (cd->hwm > cd->start_workspace) Line 9465  if (cd->hwm > cd->start_workspace)
9465      int offset, recno;      int offset, recno;
9466      cd->hwm -= LINK_SIZE;      cd->hwm -= LINK_SIZE;
9467      offset = GET(cd->hwm, 0);      offset = GET(cd->hwm, 0);
9468    
9469        /* Check that the hwm handling hasn't gone wrong. This whole area is
9470        rewritten in PCRE2 because there are some obscure cases. */
9471    
9472        if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9473          {
9474          errorcode = ERR10;
9475          break;
9476          }
9477    
9478      recno = GET(codestart, offset);      recno = GET(codestart, offset);
9479      if (recno != prev_recno)      if (recno != prev_recno)
9480        {        {
# Line 8609  subpattern. */ Line 9498  subpattern. */
9498    
9499  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9500    
9501  /* Unless disabled, check whether single character iterators can be  /* Unless disabled, check whether any single character iterators can be
9502  auto-possessified. The function overwrites the appropriate opcode values. */  auto-possessified. The function overwrites the appropriate opcode values, so
9503    the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9504    used in this code because at least one compiler gives a warning about loss of
9505    "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9506    function call. */
9507    
9508  if ((options & PCRE_NO_AUTO_POSSESSIFY) == 0)  if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9509    auto_possessify((pcre_uchar *)codestart, utf, cd);    {
9510      pcre_uchar *temp = (pcre_uchar *)codestart;
9511      auto_possessify(temp, utf, cd);
9512      }
9513