/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1364 by ph10, Sat Oct 5 15:45:11 2013 UTC revision 1411 by ph10, Fri Dec 6 17:11:44 2013 UTC
# Line 260  static const verbitem verbs[] = { Line 260  static const verbitem verbs[] = {
260  static const int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
261    
262    
263    /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
264    another regex library. */
265    
266    static const pcre_uchar sub_start_of_word[] = {
267      CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
268      CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
269    
270    static const pcre_uchar sub_end_of_word[] = {
271      CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
272      CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
273      CHAR_RIGHT_PARENTHESIS, '\0' };
274    
275    
276  /* Tables of names of POSIX character classes and their lengths. The names are  /* Tables of names of POSIX character classes and their lengths. The names are
277  now all in a single string, to reduce the number of relocations when a shared  now all in a single string, to reduce the number of relocations when a shared
278  library is dynamically loaded. The list of lengths is terminated by a zero  library is dynamically loaded. The list of lengths is terminated by a zero
279  length entry. The first three must be alpha, lower, upper, as this is assumed  length entry. The first three must be alpha, lower, upper, as this is assumed
280  for handling case independence. */  for handling case independence. The indices for graph, print, and punct are
281    needed, so identify them. */
282    
283  static const char posix_names[] =  static const char posix_names[] =
284    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
# Line 275  static const char posix_names[] = Line 289  static const char posix_names[] =
289  static const pcre_uint8 posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
290    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
291    
292    #define PC_GRAPH  8
293    #define PC_PRINT  9
294    #define PC_PUNCT 10
295    
296    
297  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
298  base map, with an optional addition or removal of another map. Then, for some  base map, with an optional addition or removal of another map. Then, for some
299  classes, there is some additional tweaking: for [:blank:] the vertical space  classes, there is some additional tweaking: for [:blank:] the vertical space
# Line 302  static const int posix_class_maps[] = { Line 321  static const int posix_class_maps[] = {
321    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
322  };  };
323    
324  /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class  /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
325  substitutes must be in the order of the names, defined above, and there are  Unicode property escapes. */
 both positive and negative cases. NULL means no substitute. */  
326    
327  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
328  static const pcre_uchar string_PNd[]  = {  static const pcre_uchar string_PNd[]  = {
# Line 329  static const pcre_uchar string_pXwd[] = Line 347  static const pcre_uchar string_pXwd[] =
347  static const pcre_uchar *substitutes[] = {  static const pcre_uchar *substitutes[] = {
348    string_PNd,           /* \D */    string_PNd,           /* \D */
349    string_pNd,           /* \d */    string_pNd,           /* \d */
350    string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */    string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
351    string_pXsp,          /* \s */    string_pXsp,          /* \s */   /* space and POSIX space are the same. */
352    string_PXwd,          /* \W */    string_PXwd,          /* \W */
353    string_pXwd           /* \w */    string_pXwd           /* \w */
354  };  };
355    
356    /* The POSIX class substitutes must be in the order of the POSIX class names,
357    defined above, and there are both positive and negative cases. NULL means no
358    general substitute of a Unicode property escape (\p or \P). However, for some
359    POSIX classes (e.g. graph, print, punct) a special property code is compiled
360    directly. */
361    
362  static const pcre_uchar string_pL[] =   {  static const pcre_uchar string_pL[] =   {
363    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
364    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
# Line 382  static const pcre_uchar *posix_substitut Line 406  static const pcre_uchar *posix_substitut
406    NULL,                 /* graph */    NULL,                 /* graph */
407    NULL,                 /* print */    NULL,                 /* print */
408    NULL,                 /* punct */    NULL,                 /* punct */
409    string_pXps,          /* space */    /* NOTE: Xps is POSIX space */    string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
410    string_pXwd,          /* word */    string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
411    NULL,                 /* xdigit */    NULL,                 /* xdigit */
412    /* Negated cases */    /* Negated cases */
413    string_PL,            /* ^alpha */    string_PL,            /* ^alpha */
# Line 397  static const pcre_uchar *posix_substitut Line 421  static const pcre_uchar *posix_substitut
421    NULL,                 /* ^graph */    NULL,                 /* ^graph */
422    NULL,                 /* ^print */    NULL,                 /* ^print */
423    NULL,                 /* ^punct */    NULL,                 /* ^punct */
424    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
425    string_PXwd,          /* ^word */    string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
426    NULL                  /* ^xdigit */    NULL                  /* ^xdigit */
427  };  };
428  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
# Line 462  static const char error_texts[] = Line 486  static const char error_texts[] =
486    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
487    "this version of PCRE is compiled without UTF support\0"    "this version of PCRE is compiled without UTF support\0"
488    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
489    "character value in \\x{...} sequence is too large\0"    "character value in \\x{} or \\o{} is too large\0"
490    /* 35 */    /* 35 */
491    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
492    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
# Line 516  static const char error_texts[] = Line 540  static const char error_texts[] =
540    "character value in \\u.... sequence is too large\0"    "character value in \\u.... sequence is too large\0"
541    "invalid UTF-32 string\0"    "invalid UTF-32 string\0"
542    "setting UTF is disabled by the application\0"    "setting UTF is disabled by the application\0"
543      "non-hex character in \\x{} (closing brace missing?)\0"
544      /* 80 */
545      "non-octal character in \\o{} (closing brace missing?)\0"
546      "missing opening brace after \\o\0"
547      "parentheses are too deeply nested\0"
548      "invalid range in character class\0"
549      "group name must start with a non-digit\0"
550    ;    ;
551    
552  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 773  static const pcre_uint8 posspropstab[3][ Line 804  static const pcre_uint8 posspropstab[3][
804    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
805  };  };
806    
807    /* This table is used when converting repeating opcodes into possessified
808    versions as a result of an explicit possessive quantifier such as ++. A zero
809    value means there is no possessified version - in those cases the item in
810    question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
811    because all relevant opcodes are less than that. */
812    
813    static const pcre_uint8 opcode_possessify[] = {
814      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
815      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
816    
817      0,                       /* NOTI */
818      OP_POSSTAR, 0,           /* STAR, MINSTAR */
819      OP_POSPLUS, 0,           /* PLUS, MINPLUS */
820      OP_POSQUERY, 0,          /* QUERY, MINQUERY */
821      OP_POSUPTO, 0,           /* UPTO, MINUPTO */
822      0,                       /* EXACT */
823      0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
824    
825      OP_POSSTARI, 0,          /* STARI, MINSTARI */
826      OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
827      OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
828      OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
829      0,                       /* EXACTI */
830      0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
831    
832      OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
833      OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
834      OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
835      OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
836      0,                       /* NOTEXACT */
837      0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
838    
839      OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
840      OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
841      OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
842      OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
843      0,                       /* NOTEXACTI */
844      0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
845    
846      OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
847      OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
848      OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
849      OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
850      0,                       /* TYPEEXACT */
851      0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
852    
853      OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
854      OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
855      OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
856      OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
857      0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
858    
859      0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
860      0, 0,                    /* REF, REFI */
861      0, 0,                    /* DNREF, DNREFI */
862      0, 0                     /* RECURSE, CALLOUT */
863    };
864    
865    
866    
867  /*************************************************  /*************************************************
# Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 968  return (*p == CHAR_RIGHT_CURLY_BRACKET);
968  *************************************************/  *************************************************/
969    
970  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
971  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
972  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
973  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
974  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
975  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
976    
977  Arguments:  Arguments:
978    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
979    chptr          points to the data character    chptr          points to a returned data character
980    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
981    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
982    options        the options bits    options        the options bits
# Line 1092  else Line 1180  else
1180      break;      break;
1181    
1182      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1183      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1184      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1185        recommended to avoid the ambiguities in the old syntax.
1186    
1187      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1188      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1189      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1190      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1191      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1192      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1193      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1194    
1195        Inside a character class, \ followed by a digit is always either a literal
1196        8 or 9 or an octal number. */
1197    
1198      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1199      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1128  else Line 1220  else
1220          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1221          break;          break;
1222          }          }
1223        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1224          {          {
1225          escape = -s;          escape = -s;
1226          break;          break;
# Line 1136  else Line 1228  else
1228        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1229        }        }
1230    
1231      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1232      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1233      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1234        changed so as not to insert the binary zero. */
1235    
1236      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1237        {  
1238        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1239    
1240      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1241      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1162  else Line 1252  else
1252  #endif  #endif
1253      break;      break;
1254    
1255      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1256      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.      specifying character codes in octal. The only supported form is \o{ddd}. */
1257      If not, { is treated as a data character. */  
1258        case CHAR_o:
1259        if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1260          {
1261          ptr += 2;
1262          c = 0;
1263          overflow = FALSE;
1264          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1265            {
1266            register pcre_uint32 cc = *ptr++;
1267            if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1268    #ifdef COMPILE_PCRE32
1269            if (c >= 0x20000000l) { overflow = TRUE; break; }
1270    #endif
1271            c = (c << 3) + cc - CHAR_0 ;
1272    #if defined COMPILE_PCRE8
1273            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1274    #elif defined COMPILE_PCRE16
1275            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1276    #elif defined COMPILE_PCRE32
1277            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1278    #endif
1279            }
1280          if (overflow)
1281            {
1282            while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1283            *errorcodeptr = ERR34;
1284            }
1285          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1286            {
1287            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1288            }
1289          else *errorcodeptr = ERR80;
1290          }
1291        break;
1292    
1293        /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1294        digits. Otherwise it is a lowercase x letter. */
1295    
1296      case CHAR_x:      case CHAR_x:
1297      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1298        {        {
       /* In JavaScript, \x must be followed by two hexadecimal numbers.  
       Otherwise it is a lowercase x letter. */  
1299        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1300          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1301          {          {
# Line 1187  else Line 1312  else
1312  #endif  #endif
1313            }            }
1314          }          }
1315        break;        }    /* End JavaScript handling */
       }  
1316    
1317      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1318        {      greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1319        const pcre_uchar *pt = ptr + 2;      digits. If not, { used to be treated as a data character. However, Perl
1320        seems to read hex digits up to the first non-such, and ignore the rest, so
1321        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1322        now gives an error. */
1323    
1324        c = 0;      else
1325        overflow = FALSE;        {
1326        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1327          {          {
1328          register pcre_uint32 cc = *pt++;          ptr += 2;
1329          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          c = 0;
1330            overflow = FALSE;
1331            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1332              {
1333              register pcre_uint32 cc = *ptr++;
1334              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1335    
1336  #ifdef COMPILE_PCRE32  #ifdef COMPILE_PCRE32
1337          if (c >= 0x10000000l) { overflow = TRUE; break; }            if (c >= 0x10000000l) { overflow = TRUE; break; }
1338  #endif  #endif
1339    
1340  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1341          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1342          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1343  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1344          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1345          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1346  #endif  #endif
1347    
1348  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
1349          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1350  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
1351          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1352  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
1353          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1354  #endif  #endif
1355          }            }
1356    
1357        if (overflow)          if (overflow)
1358          {            {
1359          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1360          *errorcodeptr = ERR34;            *errorcodeptr = ERR34;
1361          }            }
1362    
1363        if (*pt == CHAR_RIGHT_CURLY_BRACKET)          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1364          {            {
1365          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1366          ptr = pt;            }
         break;  
         }  
1367    
1368        /* If the sequence of hex digits does not end with '}', then we don't          /* If the sequence of hex digits does not end with '}', give an error.
1369        recognize this construct; fall through to the normal \x handling. */          We used just to recognize this construct and fall through to the normal
1370        }          \x handling, but nowadays Perl gives an error, which seems much more
1371            sensible, so we do too. */
1372    
1373      /* Read just a single-byte hex-defined char */          else *errorcodeptr = ERR79;
1374            }   /* End of \x{} processing */
1375    
1376      c = 0;        /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1377      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)  
1378        {        else
1379        pcre_uint32 cc;                          /* Some compilers don't like */          {
1380        cc = *(++ptr);                           /* ++ in initializers */          c = 0;
1381            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1382              {
1383              pcre_uint32 cc;                          /* Some compilers don't like */
1384              cc = *(++ptr);                           /* ++ in initializers */
1385  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1386        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1387        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1388  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1389        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */            if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1390        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1391  #endif  #endif
1392        }            }
1393            }     /* End of \xdd handling */
1394          }       /* End of Perl-style \x handling */
1395      break;      break;
1396    
1397      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
# Line 1524  for (;;) Line 1662  for (;;)
1662    
1663      case OP_CALLOUT:      case OP_CALLOUT:
1664      case OP_CREF:      case OP_CREF:
1665      case OP_NCREF:      case OP_DNCREF:
1666      case OP_RREF:      case OP_RREF:
1667      case OP_NRREF:      case OP_DNRREF:
1668      case OP_DEF:      case OP_DEF:
1669      code += PRIV(OP_lengths)[*code];      code += PRIV(OP_lengths)[*code];
1670      break;      break;
# Line 1663  for (;;) Line 1801  for (;;)
1801      case OP_COMMIT:      case OP_COMMIT:
1802      case OP_CREF:      case OP_CREF:
1803      case OP_DEF:      case OP_DEF:
1804        case OP_DNCREF:
1805        case OP_DNRREF:
1806      case OP_DOLL:      case OP_DOLL:
1807      case OP_DOLLM:      case OP_DOLLM:
1808      case OP_EOD:      case OP_EOD:
1809      case OP_EODN:      case OP_EODN:
1810      case OP_FAIL:      case OP_FAIL:
     case OP_NCREF:  
     case OP_NRREF:  
1811      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1812      case OP_PRUNE:      case OP_PRUNE:
1813      case OP_REVERSE:      case OP_REVERSE:
# Line 1764  for (;;) Line 1902  for (;;)
1902    
1903      switch (*cc)      switch (*cc)
1904        {        {
       case OP_CRPLUS:  
       case OP_CRMINPLUS:  
1905        case OP_CRSTAR:        case OP_CRSTAR:
1906        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1907          case OP_CRPLUS:
1908          case OP_CRMINPLUS:
1909        case OP_CRQUERY:        case OP_CRQUERY:
1910        case OP_CRMINQUERY:        case OP_CRMINQUERY:
1911          case OP_CRPOSSTAR:
1912          case OP_CRPOSPLUS:
1913          case OP_CRPOSQUERY:
1914        return -1;        return -1;
1915    
1916        case OP_CRRANGE:        case OP_CRRANGE:
1917        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1918          case OP_CRPOSRANGE:
1919        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1920        branchlength += (int)GET2(cc,1);        branchlength += (int)GET2(cc,1);
1921        cc += 1 + 2 * IMM2_SIZE;        cc += 1 + 2 * IMM2_SIZE;
# Line 2366  for (code = first_significant_code(code Line 2508  for (code = first_significant_code(code
2508        case OP_CRMINSTAR:        case OP_CRMINSTAR:
2509        case OP_CRQUERY:        case OP_CRQUERY:
2510        case OP_CRMINQUERY:        case OP_CRMINQUERY:
2511          case OP_CRPOSSTAR:
2512          case OP_CRPOSQUERY:
2513        break;        break;
2514    
2515        default:                   /* Non-repeat => class must match */        default:                   /* Non-repeat => class must match */
2516        case OP_CRPLUS:            /* These repeats aren't empty */        case OP_CRPLUS:            /* These repeats aren't empty */
2517        case OP_CRMINPLUS:        case OP_CRMINPLUS:
2518          case OP_CRPOSPLUS:
2519        return FALSE;        return FALSE;
2520    
2521        case OP_CRRANGE:        case OP_CRRANGE:
2522        case OP_CRMINRANGE:        case OP_CRMINRANGE:
2523          case OP_CRPOSRANGE:
2524        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2525        break;        break;
2526        }        }
# Line 2653  switch(ptype) Line 2799  switch(ptype)
2799    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2800    means that Perl space and POSIX space are now identical. PCRE was changed    means that Perl space and POSIX space are now identical. PCRE was changed
2801    at release 8.34. */    at release 8.34. */
2802    
2803    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2804    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2805    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    switch(c)
2806            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||      {
2807            c == CHAR_FF || c == CHAR_CR)      HSPACE_CASES:
2808            == negated;      VSPACE_CASES:
2809        return negated;
2810    
2811        default:
2812        return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2813        }
2814      break;  /* Control never reaches here */
2815    
2816    case PT_WORD:    case PT_WORD:
2817    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
# Line 2708  get_chr_property_list(const pcre_uchar * Line 2860  get_chr_property_list(const pcre_uchar *
2860    const pcre_uint8 *fcc, pcre_uint32 *list)    const pcre_uint8 *fcc, pcre_uint32 *list)
2861  {  {
2862  pcre_uchar c = *code;  pcre_uchar c = *code;
2863    pcre_uchar base;
2864  const pcre_uchar *end;  const pcre_uchar *end;
 const pcre_uint32 *clist_src;  
 pcre_uint32 *clist_dest;  
2865  pcre_uint32 chr;  pcre_uint32 chr;
2866  pcre_uchar base;  
2867    #ifdef SUPPORT_UCP
2868    pcre_uint32 *clist_dest;
2869    const pcre_uint32 *clist_src;
2870    #else
2871    utf = utf;  /* Suppress "unused parameter" compiler warning */
2872    #endif
2873    
2874  list[0] = c;  list[0] = c;
2875  list[1] = FALSE;  list[1] = FALSE;
# Line 2818  switch(c) Line 2975  switch(c)
2975      return code + 2;      return code + 2;
2976      }      }
2977    
2978    /* Convert only if we have anough space. */    /* Convert only if we have enough space. */
2979    
2980    clist_src = PRIV(ucd_caseless_sets) + code[1];    clist_src = PRIV(ucd_caseless_sets) + code[1];
2981    clist_dest = list + 2;    clist_dest = list + 2;
2982    code += 2;    code += 2;
2983    
2984    do {    do {
      /* Early return if there is not enough space. */  
2985       if (clist_dest >= list + 8)       if (clist_dest >= list + 8)
2986         {         {
2987           /* Early return if there is not enough space. This should never
2988           happen, since all clists are shorter than 5 characters now. */
2989         list[2] = code[0];         list[2] = code[0];
2990         list[3] = code[1];         list[3] = code[1];
2991         return code;         return code;
2992         }         }
2993       *clist_dest++ = *clist_src;       *clist_dest++ = *clist_src;
2994       }       }
2995     while(*clist_src++ != NOTACHAR);    while(*clist_src++ != NOTACHAR);
2996    
2997    /* Enough space to store all characters. */    /* All characters are stored. The terminating NOTACHAR
2998      is copied from the clist itself. */
2999    
3000    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3001    return code;    return code;
# Line 2846  switch(c) Line 3005  switch(c)
3005    case OP_CLASS:    case OP_CLASS:
3006  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3007    case OP_XCLASS:    case OP_XCLASS:
   
3008    if (c == OP_XCLASS)    if (c == OP_XCLASS)
3009      end = code + GET(code, 0);      end = code + GET(code, 0) - 1;
3010    else    else
3011  #endif  #endif
3012      end = code + 32 / sizeof(pcre_uchar);      end = code + 32 / sizeof(pcre_uchar);
# Line 2859  switch(c) Line 3017  switch(c)
3017      case OP_CRMINSTAR:      case OP_CRMINSTAR:
3018      case OP_CRQUERY:      case OP_CRQUERY:
3019      case OP_CRMINQUERY:      case OP_CRMINQUERY:
3020        case OP_CRPOSSTAR:
3021        case OP_CRPOSQUERY:
3022      list[1] = TRUE;      list[1] = TRUE;
3023      end++;      end++;
3024      break;      break;
3025    
3026        case OP_CRPLUS:
3027        case OP_CRMINPLUS:
3028        case OP_CRPOSPLUS:
3029        end++;
3030        break;
3031    
3032      case OP_CRRANGE:      case OP_CRRANGE:
3033      case OP_CRMINRANGE:      case OP_CRMINRANGE:
3034        case OP_CRPOSRANGE:
3035      list[1] = (GET2(end, 1) == 0);      list[1] = (GET2(end, 1) == 0);
3036      end += 1 + 2 * IMM2_SIZE;      end += 1 + 2 * IMM2_SIZE;
3037      break;      break;
# Line 2895  Returns:      TRUE if the auto-possessif Line 3062  Returns:      TRUE if the auto-possessif
3062    
3063  static BOOL  static BOOL
3064  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3065    const pcre_uint32* base_list)    const pcre_uint32 *base_list, const pcre_uchar *base_end)
3066  {  {
3067  pcre_uchar c;  pcre_uchar c;
3068  pcre_uint32 list[8];  pcre_uint32 list[8];
3069  const pcre_uint32* chr_ptr;  const pcre_uint32 *chr_ptr;
3070  const pcre_uint32* ochr_ptr;  const pcre_uint32 *ochr_ptr;
3071  const pcre_uint32* list_ptr;  const pcre_uint32 *list_ptr;
3072    const pcre_uchar *next_code;
3073    const pcre_uint8 *class_bitset;
3074    const pcre_uint32 *set1, *set2, *set_end;
3075  pcre_uint32 chr;  pcre_uint32 chr;
3076    BOOL accepted, invert_bits;
3077    
3078    /* Note: base_list[1] records whether the current opcode has a greedy
3079    quantifier (represented by a non-zero value). This is different from
3080    other character type lists, which store here whether the character iterator
3081    can match an empty string (also represented by a non-zero value). */
3082    
3083  for(;;)  for(;;)
3084    {    {
3085      /* All operations move the code pointer forward.
3086      Therefore infinite recursions are not possible. */
3087    
3088    c = *code;    c = *code;
3089    
3090    /* Skip over callouts */    /* Skip over callouts */
# Line 2925  for(;;) Line 3104  for(;;)
3104    switch(c)    switch(c)
3105      {      {
3106      case OP_END:      case OP_END:
3107      /* TRUE only in greedy case. The non-greedy case could be replaced by an      case OP_KETRPOS:
3108      OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses      /* TRUE only in greedy case. The non-greedy case could be replaced by
3109      more memory, which we cannot get at this stage.) */      an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3110        uses more memory, which we cannot get at this stage.) */
3111    
3112      return base_list[1] != 0;      return base_list[1] != 0;
3113    
3114      case OP_KET:      case OP_KET:
3115      /* If the bracket is capturing, and referenced by an OP_RECURSE, the      /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3116      non-greedy case cannot be converted to a possessive form. We do not test      it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3117      the bracket type at the moment, but we might do it in the future to improve      cannot be converted to a possessive form. */
     this condition. (But note that recursive calls are always atomic.) */  
3118    
3119      if (base_list[1] == 0) return FALSE;      if (base_list[1] == 0) return FALSE;
3120    
3121        switch(*(code - GET(code, 1)))
3122          {
3123          case OP_ASSERT:
3124          case OP_ASSERT_NOT:
3125          case OP_ASSERTBACK:
3126          case OP_ASSERTBACK_NOT:
3127          case OP_ONCE:
3128          case OP_ONCE_NC:
3129          /* Atomic sub-patterns and assertions can always auto-possessify their
3130          last iterator. */
3131          return TRUE;
3132          }
3133    
3134        code += PRIV(OP_lengths)[c];
3135        continue;
3136    
3137        case OP_ONCE:
3138        case OP_ONCE_NC:
3139        case OP_BRA:
3140        case OP_CBRA:
3141        next_code = code + GET(code, 1);
3142        code += PRIV(OP_lengths)[c];
3143    
3144        while (*next_code == OP_ALT)
3145          {
3146          if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3147          code = next_code + 1 + LINK_SIZE;
3148          next_code += GET(next_code, 1);
3149          }
3150        continue;
3151    
3152        case OP_BRAZERO:
3153        case OP_BRAMINZERO:
3154    
3155        next_code = code + 1;
3156        if (*next_code != OP_BRA && *next_code != OP_CBRA
3157            && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3158    
3159        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3160    
3161        /* The bracket content will be checked by the
3162        OP_BRA/OP_CBRA case above. */
3163        next_code += 1 + LINK_SIZE;
3164        if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3165          return FALSE;
3166    
3167      code += PRIV(OP_lengths)[c];      code += PRIV(OP_lengths)[c];
3168      continue;      continue;
3169      }      }
# Line 2961  for(;;) Line 3187  for(;;)
3187      list_ptr = base_list;      list_ptr = base_list;
3188      }      }
3189    
3190      /* Character bitsets can also be compared to certain opcodes. */
3191    
3192      else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3193    #ifdef COMPILE_PCRE8
3194          /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3195          || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3196    #endif
3197          )
3198        {
3199    #ifdef COMPILE_PCRE8
3200        if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3201    #else
3202        if (base_list[0] == OP_CLASS)
3203    #endif
3204          {
3205          set1 = (pcre_uint32 *)(base_end - base_list[2]);
3206          list_ptr = list;
3207          }
3208        else
3209          {
3210          set1 = (pcre_uint32 *)(code - list[2]);
3211          list_ptr = base_list;
3212          }
3213    
3214        invert_bits = FALSE;
3215        switch(list_ptr[0])
3216          {
3217          case OP_CLASS:
3218          case OP_NCLASS:
3219          set2 = (pcre_uint32 *)
3220            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3221          break;
3222    
3223          /* OP_XCLASS cannot be supported here, because its bitset
3224          is not necessarily complete. E.g: [a-\0x{200}] is stored
3225          as a character range, and the appropriate bits are not set. */
3226    
3227          case OP_NOT_DIGIT:
3228            invert_bits = TRUE;
3229            /* Fall through */
3230          case OP_DIGIT:
3231            set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
3232            break;
3233    
3234          case OP_NOT_WHITESPACE:
3235            invert_bits = TRUE;
3236            /* Fall through */
3237          case OP_WHITESPACE:
3238            set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
3239            break;
3240    
3241          case OP_NOT_WORDCHAR:
3242            invert_bits = TRUE;
3243            /* Fall through */
3244          case OP_WORDCHAR:
3245            set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
3246            break;
3247    
3248          default:
3249          return FALSE;
3250          }
3251    
3252        /* Compare 4 bytes to improve speed. */
3253        set_end = set1 + (32 / 4);
3254        if (invert_bits)
3255          {
3256          do
3257            {
3258            if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3259            }
3260          while (set1 < set_end);
3261          }
3262        else
3263          {
3264          do
3265            {
3266            if ((*set1++ & *set2++) != 0) return FALSE;
3267            }
3268          while (set1 < set_end);
3269          }
3270    
3271        if (list[1] == 0) return TRUE;
3272        /* Might be an empty repeat. */
3273        continue;
3274        }
3275    
3276    /* Some property combinations also acceptable. Unicode property opcodes are    /* Some property combinations also acceptable. Unicode property opcodes are
3277    processed specially; the rest can be handled with a lookup table. */    processed specially; the rest can be handled with a lookup table. */
3278    
# Line 2968  for(;;) Line 3280  for(;;)
3280      {      {
3281      pcre_uint32 leftop, rightop;      pcre_uint32 leftop, rightop;
3282    
     if (list[1] != 0) return FALSE;   /* Must match at least one character */  
3283      leftop = base_list[0];      leftop = base_list[0];
3284      rightop = list[0];      rightop = list[0];
3285    
3286  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3287        accepted = FALSE; /* Always set in non-unicode case. */
3288      if (leftop == OP_PROP || leftop == OP_NOTPROP)      if (leftop == OP_PROP || leftop == OP_NOTPROP)
3289        {        {
3290        if (rightop == OP_EOD) return TRUE;        if (rightop == OP_EOD)
3291        if (rightop == OP_PROP || rightop == OP_NOTPROP)          accepted = TRUE;
3292          else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3293          {          {
3294          int n;          int n;
3295          const pcre_uint8 *p;          const pcre_uint8 *p;
# Line 2997  for(;;) Line 3310  for(;;)
3310          n = propposstab[base_list[2]][list[2]];          n = propposstab[base_list[2]][list[2]];
3311          switch(n)          switch(n)
3312            {            {
3313            case 0: return FALSE;            case 0: break;
3314            case 1: return bothprop;            case 1: accepted = bothprop; break;
3315            case 2: return (base_list[3] == list[3]) != same;            case 2: accepted = (base_list[3] == list[3]) != same; break;
3316            case 3: return !same;            case 3: accepted = !same; break;
3317    
3318            case 4:  /* Left general category, right particular category */            case 4:  /* Left general category, right particular category */
3319            return risprop && catposstab[base_list[3]][list[3]] == same;            accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3320              break;
3321    
3322            case 5:  /* Right general category, left particular category */            case 5:  /* Right general category, left particular category */
3323            return lisprop && catposstab[list[3]][base_list[3]] == same;            accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3324              break;
3325    
3326            /* This code is logically tricky. Think hard before fiddling with it.            /* This code is logically tricky. Think hard before fiddling with it.
3327            The posspropstab table has four entries per row. Each row relates to            The posspropstab table has four entries per row. Each row relates to
3328            one of PCRE's special properties such as ALNUM or SPACE or WORD.            one of PCRE's special properties such as ALNUM or SPACE or WORD.
3329            Only WORD actually needs all four entries, but using repeats for the            Only WORD actually needs all four entries, but using repeats for the
3330            others means they can all use the same code below.            others means they can all use the same code below.
3331    
3332            The first two entries in each row are Unicode general categories, and            The first two entries in each row are Unicode general categories, and
3333            apply always, because all the characters they include are part of the            apply always, because all the characters they include are part of the
3334            PCRE character set. The third and fourth entries are a general and a            PCRE character set. The third and fourth entries are a general and a
# Line 3023  for(;;) Line 3338  for(;;)
3338            category contains more characters than the specials that are defined            category contains more characters than the specials that are defined
3339            for the property being tested against. Therefore, it cannot be used            for the property being tested against. Therefore, it cannot be used
3340            in a NOTPROP case.            in a NOTPROP case.
3341    
3342            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3343            Underscore is covered by ucp_P or ucp_Po. */            Underscore is covered by ucp_P or ucp_Po. */
3344    
# Line 3031  for(;;) Line 3346  for(;;)
3346            case 7:  /* Left space vs right general category */            case 7:  /* Left space vs right general category */
3347            case 8:  /* Left word vs right general category */            case 8:  /* Left word vs right general category */
3348            p = posspropstab[n-6];            p = posspropstab[n-6];
3349            return risprop && lisprop ==            accepted = risprop && lisprop ==
3350              (list[3] != p[0] &&              (list[3] != p[0] &&
3351               list[3] != p[1] &&               list[3] != p[1] &&
3352              (list[3] != p[2] || !lisprop));              (list[3] != p[2] || !lisprop));
3353              break;
3354    
3355            case 9:   /* Right alphanum vs left general category */            case 9:   /* Right alphanum vs left general category */
3356            case 10:  /* Right space vs left general category */            case 10:  /* Right space vs left general category */
3357            case 11:  /* Right word vs left general category */            case 11:  /* Right word vs left general category */
3358            p = posspropstab[n-9];            p = posspropstab[n-9];
3359            return lisprop && risprop ==            accepted = lisprop && risprop ==
3360              (base_list[3] != p[0] &&              (base_list[3] != p[0] &&
3361               base_list[3] != p[1] &&               base_list[3] != p[1] &&
3362              (base_list[3] != p[2] || !risprop));              (base_list[3] != p[2] || !risprop));
3363              break;
3364    
3365            case 12:  /* Left alphanum vs right particular category */            case 12:  /* Left alphanum vs right particular category */
3366            case 13:  /* Left space vs right particular category */            case 13:  /* Left space vs right particular category */
3367            case 14:  /* Left word vs right particular category */            case 14:  /* Left word vs right particular category */
3368            p = posspropstab[n-12];            p = posspropstab[n-12];
3369            return risprop && lisprop ==            accepted = risprop && lisprop ==
3370              (catposstab[p[0]][list[3]] &&              (catposstab[p[0]][list[3]] &&
3371               catposstab[p[1]][list[3]] &&               catposstab[p[1]][list[3]] &&
3372              (list[3] != p[3] || !lisprop));              (list[3] != p[3] || !lisprop));
3373              break;
3374    
3375            case 15:  /* Right alphanum vs left particular category */            case 15:  /* Right alphanum vs left particular category */
3376            case 16:  /* Right space vs left particular category */            case 16:  /* Right space vs left particular category */
3377            case 17:  /* Right word vs left particular category */            case 17:  /* Right word vs left particular category */
3378            p = posspropstab[n-15];            p = posspropstab[n-15];
3379            return lisprop && risprop ==            accepted = lisprop && risprop ==
3380              (catposstab[p[0]][base_list[3]] &&              (catposstab[p[0]][base_list[3]] &&
3381               catposstab[p[1]][base_list[3]] &&               catposstab[p[1]][base_list[3]] &&
3382              (base_list[3] != p[3] || !risprop));              (base_list[3] != p[3] || !risprop));
3383              break;
3384            }            }
3385          }          }
       return FALSE;  
3386        }        }
3387    
3388      else      else
3389  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
3390    
3391      return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&      accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3392             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3393             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3394    
3395        if (!accepted)
3396          return FALSE;
3397    
3398        if (list[1] == 0) return TRUE;
3399        /* Might be an empty repeat. */
3400        continue;
3401      }      }
3402    
3403    /* Control reaches here only if one of the items is a small character list.    /* Control reaches here only if one of the items is a small character list.
# Line 3186  for(;;) Line 3511  for(;;)
3511        case OP_EOD:    /* Can always possessify before \z */        case OP_EOD:    /* Can always possessify before \z */
3512        break;        break;
3513    
3514    #ifdef SUPPORT_UCP
3515        case OP_PROP:        case OP_PROP:
3516        case OP_NOTPROP:        case OP_NOTPROP:
3517        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3518              list_ptr[0] == OP_NOTPROP))              list_ptr[0] == OP_NOTPROP))
3519          return FALSE;          return FALSE;
3520        break;        break;
3521    #endif
       /* The class comparisons work only when the class is the second item  
       of the pair, because there are at present no possessive forms of the  
       class opcodes. Note also that the "code" variable that is used below  
       points after the second item, and that the pointer for the first item  
       is not available, so even if there were possessive forms of the class  
       opcodes, the correct comparison could not be done. */  
3522    
3523        case OP_NCLASS:        case OP_NCLASS:
3524        if (chr > 255) return FALSE;        if (chr > 255) return FALSE;
3525        /* Fall through */        /* Fall through */
3526    
3527        case OP_CLASS:        case OP_CLASS:
       if (list_ptr != list) return FALSE;   /* Class is first opcode */  
3528        if (chr > 255) break;        if (chr > 255) break;
3529        if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)        class_bitset = (pcre_uint8 *)
3530          return FALSE;          ((list_ptr == list ? code : base_end) - list_ptr[2]);
3531          if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3532        break;        break;
3533    
3534  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3535        case OP_XCLASS:        case OP_XCLASS:
3536        if (list_ptr != list) return FALSE;   /* Class is first opcode */        if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3537        if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))            list_ptr[2] + LINK_SIZE, utf)) return FALSE;
         return FALSE;  
3538        break;        break;
3539  #endif  #endif
3540    
# Line 3257  auto_possessify(pcre_uchar *code, BOOL u Line 3576  auto_possessify(pcre_uchar *code, BOOL u
3576  {  {
3577  register pcre_uchar c;  register pcre_uchar c;
3578  const pcre_uchar *end;  const pcre_uchar *end;
3579    pcre_uchar *repeat_opcode;
3580  pcre_uint32 list[8];  pcre_uint32 list[8];
3581    
3582  for (;;)  for (;;)
# Line 3270  for (;;) Line 3590  for (;;)
3590        get_chr_property_list(code, utf, cd->fcc, list) : NULL;        get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3591      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3592    
3593      if (end != NULL && compare_opcodes(end, utf, cd, list))      if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3594        {        {
3595        switch(c)        switch(c)
3596          {          {
# Line 3309  for (;;) Line 3629  for (;;)
3629        }        }
3630      c = *code;      c = *code;
3631      }      }
3632      else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3633        {
3634    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3635        if (c == OP_XCLASS)
3636          repeat_opcode = code + GET(code, 1);
3637        else
3638    #endif
3639          repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3640    
3641        c = *repeat_opcode;
3642        if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3643          {
3644          /* end must not be NULL. */
3645          end = get_chr_property_list(code, utf, cd->fcc, list);
3646    
3647          list[1] = (c & 1) == 0;
3648    
3649          if (compare_opcodes(end, utf, cd, list, end))
3650            {
3651            switch (c)
3652              {
3653              case OP_CRSTAR:
3654              case OP_CRMINSTAR:
3655              *repeat_opcode = OP_CRPOSSTAR;
3656              break;
3657    
3658              case OP_CRPLUS:
3659              case OP_CRMINPLUS:
3660              *repeat_opcode = OP_CRPOSPLUS;
3661              break;
3662    
3663              case OP_CRQUERY:
3664              case OP_CRMINQUERY:
3665              *repeat_opcode = OP_CRPOSQUERY;
3666              break;
3667    
3668              case OP_CRRANGE:
3669              case OP_CRMINRANGE:
3670              *repeat_opcode = OP_CRPOSRANGE;
3671              break;
3672              }
3673            }
3674          }
3675        c = *code;
3676        }
3677    
3678    switch(c)    switch(c)
3679      {      {
# Line 3335  for (;;) Line 3700  for (;;)
3700        code += 2;        code += 2;
3701      break;      break;
3702    
3703    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3704      case OP_XCLASS:      case OP_XCLASS:
3705      code += GET(code, 1);      code += GET(code, 1);
3706      break;      break;
3707    #endif
3708    
3709      case OP_MARK:      case OP_MARK:
3710      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
# Line 3446  class, but [abc[:x\]pqr:]] is (so that a Line 3813  class, but [abc[:x\]pqr:]] is (so that a
3813  below handles the special case of \], but does not try to do any other escape  below handles the special case of \], but does not try to do any other escape
3814  processing. This makes it different from Perl for cases such as [:l\ower:]  processing. This makes it different from Perl for cases such as [:l\ower:]
3815  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3816  "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3817  I think.  I think.
3818    
3819  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
# Line 4199  for (;; ptr++) Line 4566  for (;; ptr++)
4566          }          }
4567        goto NORMAL_CHAR;        goto NORMAL_CHAR;
4568        }        }
4569        /* Control does not reach here. */
4570      }      }
4571    
4572    /* Fill in length of a previous callout, except when the next thing is    /* In extended mode, skip white space and comments. We need a loop in order
4573    a quantifier. */    to check for more white space and more comments after a comment. */
   
   is_quantifier =  
     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||  
     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));  
   
   if (!is_quantifier && previous_callout != NULL &&  
        after_manual_callout-- <= 0)  
     {  
     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */  
       complete_callout(previous_callout, ptr, cd);  
     previous_callout = NULL;  
     }  
   
   /* In extended mode, skip white space and comments. */  
4574    
4575    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
4576      {      {
4577      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;      for (;;)
     if (c == CHAR_NUMBER_SIGN)  
4578        {        {
4579          while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4580          if (c != CHAR_NUMBER_SIGN) break;
4581        ptr++;        ptr++;
4582        while (*ptr != CHAR_NULL)        while (*ptr != CHAR_NULL)
4583          {          {
4584          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4585              {                          /* IS_NEWLINE sets cd->nllen. */
4586              ptr += cd->nllen;
4587              break;
4588              }
4589          ptr++;          ptr++;
4590  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4591          if (utf) FORWARDCHAR(ptr);          if (utf) FORWARDCHAR(ptr);
4592  #endif  #endif
4593          }          }
4594        if (*ptr != CHAR_NULL) continue;        c = *ptr;     /* Either NULL or the char after a newline */
   
       /* Else fall through to handle end of string */  
       c = 0;  
4595        }        }
4596      }      }
4597    
4598    /* No auto callout for quantifiers. */    /* See if the next thing is a quantifier. */
4599    
4600      is_quantifier =
4601        c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4602        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4603    
4604      /* Fill in length of a previous callout, except when the next thing is a
4605      quantifier or when processing a property substitution string in UCP mode. */
4606    
4607      if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4608           after_manual_callout-- <= 0)
4609        {
4610        if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4611          complete_callout(previous_callout, ptr, cd);
4612        previous_callout = NULL;
4613        }
4614    
4615      /* Create auto callout, except for quantifiers, or while processing property
4616      strings that are substituted for \w etc in UCP mode. */
4617    
4618    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4619      {      {
4620      previous_callout = code;      previous_callout = code;
4621      code = auto_callout(code, ptr, cd);      code = auto_callout(code, ptr, cd);
4622      }      }
4623    
4624      /* Process the next pattern item. */
4625    
4626    switch(c)    switch(c)
4627      {      {
4628      /* ===================================================================*/      /* ===================================================================*/
4629      case 0:                        /* The branch terminates at string end */      case CHAR_NULL:                /* The branch terminates at string end */
4630      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
4631      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
4632      *firstcharptr = firstchar;      *firstcharptr = firstchar;
# Line 4327  for (;; ptr++) Line 4703  for (;; ptr++)
4703        goto FAILED;        goto FAILED;
4704        }        }
4705      goto NORMAL_CHAR;      goto NORMAL_CHAR;
4706    
4707        /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4708        used for "start of word" and "end of word". As these are otherwise illegal
4709        sequences, we don't break anything by recognizing them. They are replaced
4710        by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4711        erroneous and are handled by the normal code below. */
4712    
4713      case CHAR_LEFT_SQUARE_BRACKET:      case CHAR_LEFT_SQUARE_BRACKET:
4714        if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4715          {
4716          nestptr = ptr + 7;
4717          ptr = sub_start_of_word - 1;
4718          continue;
4719          }
4720    
4721        if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4722          {
4723          nestptr = ptr + 7;
4724          ptr = sub_end_of_word - 1;
4725          continue;
4726          }
4727    
4728        /* Handle a real character class. */
4729    
4730      previous = code;      previous = code;
4731    
4732      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
# Line 4493  for (;; ptr++) Line 4891  for (;; ptr++)
4891            posix_class = 0;            posix_class = 0;
4892    
4893          /* When PCRE_UCP is set, some of the POSIX classes are converted to          /* When PCRE_UCP is set, some of the POSIX classes are converted to
4894          different escape sequences that use Unicode properties. */          different escape sequences that use Unicode properties \p or \P. Others
4895            that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4896            directly. */
4897    
4898  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4899          if ((options & PCRE_UCP) != 0)          if ((options & PCRE_UCP) != 0)
4900            {            {
4901              unsigned int ptype = 0;
4902            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4903    
4904              /* The posix_substitutes table specifies which POSIX classes can be
4905              converted to \p or \P items. */
4906    
4907            if (posix_substitutes[pc] != NULL)            if (posix_substitutes[pc] != NULL)
4908              {              {
4909              nestptr = tempptr + 1;              nestptr = tempptr + 1;
4910              ptr = posix_substitutes[pc] - 1;              ptr = posix_substitutes[pc] - 1;
4911              continue;              continue;
4912              }              }
4913    
4914              /* There are three other classes that generate special property calls
4915              that are recognized only in an XCLASS. */
4916    
4917              else switch(posix_class)
4918                {
4919                case PC_GRAPH:
4920                ptype = PT_PXGRAPH;
4921                /* Fall through */
4922                case PC_PRINT:
4923                if (ptype == 0) ptype = PT_PXPRINT;
4924                /* Fall through */
4925                case PC_PUNCT:
4926                if (ptype == 0) ptype = PT_PXPUNCT;
4927                *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4928                *class_uchardata++ = ptype;
4929                *class_uchardata++ = 0;
4930                ptr = tempptr + 1;
4931                continue;
4932    
4933                /* For all other POSIX classes, no special action is taken in UCP
4934                mode. Fall through to the non-UCP case. */
4935    
4936                default:
4937                break;
4938                }
4939            }            }
4940  #endif  #endif
4941          /* In the non-UCP case, we build the bit map for the POSIX class in a          /* In the non-UCP case, or when UCP makes no difference, we build the
4942          chunk of local store because we may be adding and subtracting from it,          bit map for the POSIX class in a chunk of local store because we may be
4943          and we don't want to subtract bits that may be in the main map already.          adding and subtracting from it, and we don't want to subtract bits that
4944          At the end we or the result into the bit map that is being built. */          may be in the main map already. At the end we or the result into the
4945            bit map that is being built. */
4946    
4947          posix_class *= 3;          posix_class *= 3;
4948    
# Line 4631  for (;; ptr++) Line 5063  for (;; ptr++)
5063              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5064              previously set by something earlier in the character class.              previously set by something earlier in the character class.
5065              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5066              we could just adjust the appropriate bit. From PCRE 8.34 we no              we could just adjust the appropriate bit. From PCRE 8.34 we no
5067              longer treat \s and \S specially. */              longer treat \s and \S specially. */
5068    
5069              case ESC_s:              case ESC_s:
# Line 4762  for (;; ptr++) Line 5194  for (;; ptr++)
5194  #endif  #endif
5195          d = *ptr;  /* Not UTF-8 mode */          d = *ptr;  /* Not UTF-8 mode */
5196    
5197          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape
5198          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          sequence, but not any of the other escapes. Perl treats a hyphen as a
5199          in such circumstances. */          literal in such circumstances. However, in Perl's warning mode, a
5200            warning is given, so PCRE now faults it as it is almost certainly a
5201            mistake on the user's part. */
5202    
5203          if (!inescq && d == CHAR_BACKSLASH)          if (!inescq)
5204            {            {
5205            int descape;            if (d == CHAR_BACKSLASH)
5206            descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);              {
5207            if (*errorcodeptr != 0) goto FAILED;              int descape;
5208                descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5209                if (*errorcodeptr != 0) goto FAILED;
5210    
5211            /* \b is backspace; any other special means the '-' was literal. */              /* 0 means a character was put into d; \b is backspace; any other
5212                special causes an error. */
5213    
5214            if (descape != 0)              if (descape != 0)
             {  
             if (descape == ESC_b) d = CHAR_BS; else  
5215                {                {
5216                ptr = oldptr;                if (descape == ESC_b) d = CHAR_BS; else
5217                goto CLASS_SINGLE_CHARACTER;  /* A few lines below */                  {
5218                    *errorcodeptr = ERR83;
5219                    goto FAILED;
5220                    }
5221                }                }
5222              }              }
5223    
5224              /* A hyphen followed by a POSIX class is treated in the same way. */
5225    
5226              else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5227                       (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5228                        ptr[1] == CHAR_EQUALS_SIGN) &&
5229                       check_posix_syntax(ptr, &tempptr))
5230                {
5231                *errorcodeptr = ERR83;
5232                goto FAILED;
5233                }
5234            }            }
5235    
5236          /* Check that the two values are in the correct order. Optimize          /* Check that the two values are in the correct order. Optimize
# Line 5045  for (;; ptr++) Line 5494  for (;; ptr++)
5494    
5495      tempcode = previous;      tempcode = previous;
5496    
5497        /* Before checking for a possessive quantifier, we must skip over
5498        whitespace and comments in extended mode because Perl allows white space at
5499        this point. */
5500    
5501        if ((options & PCRE_EXTENDED) != 0)
5502          {
5503          const pcre_uchar *p = ptr + 1;
5504          for (;;)
5505            {
5506            while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5507            if (*p != CHAR_NUMBER_SIGN) break;
5508            p++;
5509            while (*p != CHAR_NULL)
5510              {
5511              if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5512                {                        /* IS_NEWLINE sets cd->nllen. */
5513                p += cd->nllen;
5514                break;
5515                }
5516              p++;
5517    #ifdef SUPPORT_UTF
5518              if (utf) FORWARDCHAR(p);
5519    #endif
5520              }           /* Loop for comment characters */
5521            }             /* Loop for multiple comments */
5522          ptr = p - 1;    /* Character before the next significant one. */
5523          }
5524    
5525      /* If the next character is '+', we have a possessive quantifier. This      /* If the next character is '+', we have a possessive quantifier. This
5526      implies greediness, whatever the setting of the PCRE_UNGREEDY option.      implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5527      If the next character is '?' this is a minimizing repeat, by default,      If the next character is '?' this is a minimizing repeat, by default,
# Line 5337  for (;; ptr++) Line 5814  for (;; ptr++)
5814      opcodes such as BRA and CBRA, as this is the place where they get converted      opcodes such as BRA and CBRA, as this is the place where they get converted
5815      into the more special varieties such as BRAPOS and SBRA. A test for >=      into the more special varieties such as BRAPOS and SBRA. A test for >=
5816      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5817      ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow      ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5818      repetition of assertions, but now it does, for Perl compatibility. */      Originally, PCRE did not allow repetition of assertions, but now it does,
5819        for Perl compatibility. */
5820    
5821      else if (*previous >= OP_ASSERT && *previous <= OP_COND)      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5822        {        {
# Line 5356  for (;; ptr++) Line 5834  for (;; ptr++)
5834        /* There is no sense in actually repeating assertions. The only potential        /* There is no sense in actually repeating assertions. The only potential
5835        use of repetition is in cases when the assertion is optional. Therefore,        use of repetition is in cases when the assertion is optional. Therefore,
5836        if the minimum is greater than zero, just ignore the repeat. If the        if the minimum is greater than zero, just ignore the repeat. If the
5837        maximum is not not zero or one, set it to 1. */        maximum is not zero or one, set it to 1. */
5838    
5839        if (*previous < OP_ONCE)    /* Assertion */        if (*previous < OP_ONCE)    /* Assertion */
5840          {          {
# Line 5729  for (;; ptr++) Line 6207  for (;; ptr++)
6207        goto FAILED;        goto FAILED;
6208        }        }
6209    
6210      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', possessive_quantifier is
6211      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,      TRUE. For some opcodes, there are special alternative opcodes for this
6212      there are special alternative opcodes for this case. For anything else, we      case. For anything else, we wrap the entire repeated item inside OP_ONCE
6213      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'      brackets. Logically, the '+' notation is just syntactic sugar, taken from
6214      notation is just syntactic sugar, taken from Sun's Java package, but the      Sun's Java package, but the special opcodes can optimize it.
     special opcodes can optimize it.  
6215    
6216      Some (but not all) possessively repeated subpatterns have already been      Some (but not all) possessively repeated subpatterns have already been
6217      completely handled in the code just above. For them, possessive_quantifier      completely handled in the code just above. For them, possessive_quantifier
6218      is always FALSE at this stage.      is always FALSE at this stage. Note that the repeated item starts at
6219        tempcode, not at previous, which might be the first part of a string whose
6220      Note that the repeated item starts at tempcode, not at previous, which      (former) last char we repeated. */
     might be the first part of a string whose (former) last char we repeated.  
   
     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But  
     an 'upto' may follow. We skip over an 'exact' item, and then test the  
     length of what remains before proceeding. */  
6221    
6222      if (possessive_quantifier)      if (possessive_quantifier)
6223        {        {
6224        int len;        int len;
6225    
6226        if (*tempcode == OP_TYPEEXACT)        /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6227          However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6228          {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6229          remains is greater than zero, there's a further opcode that can be
6230          handled. If not, do nothing, leaving the EXACT alone. */
6231    
6232          switch(*tempcode)
6233            {
6234            case OP_TYPEEXACT:
6235          tempcode += PRIV(OP_lengths)[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
6236            ((tempcode[1 + IMM2_SIZE] == OP_PROP            ((tempcode[1 + IMM2_SIZE] == OP_PROP
6237            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6238            break;
6239    
6240        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)          /* CHAR opcodes are used for exacts whose count is 1. */
6241          {  
6242            case OP_CHAR:
6243            case OP_CHARI:
6244            case OP_NOT:
6245            case OP_NOTI:
6246            case OP_EXACT:
6247            case OP_EXACTI:
6248            case OP_NOTEXACT:
6249            case OP_NOTEXACTI:
6250          tempcode += PRIV(OP_lengths)[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
6251  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6252          if (utf && HAS_EXTRALEN(tempcode[-1]))          if (utf && HAS_EXTRALEN(tempcode[-1]))
6253            tempcode += GET_EXTRALEN(tempcode[-1]);            tempcode += GET_EXTRALEN(tempcode[-1]);
6254  #endif  #endif
6255            break;
6256    
6257            /* For the class opcodes, the repeat operator appears at the end;
6258            adjust tempcode to point to it. */
6259    
6260            case OP_CLASS:
6261            case OP_NCLASS:
6262            tempcode += 1 + 32/sizeof(pcre_uchar);
6263            break;
6264    
6265    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6266            case OP_XCLASS:
6267            tempcode += GET(tempcode, 1);
6268            break;
6269    #endif
6270          }          }
6271    
6272          /* If tempcode is equal to code (which points to the end of the repeated
6273          item), it means we have skipped an EXACT item but there is no following
6274          QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6275          all other cases, tempcode will be pointing to the repeat opcode, and will
6276          be less than code, so the value of len will be greater than 0. */
6277    
6278        len = (int)(code - tempcode);        len = (int)(code - tempcode);
6279          if (len > 0)
6280            {
6281            unsigned int repcode = *tempcode;
6282    
6283            /* There is a table for possessifying opcodes, all of which are less
6284            than OP_CALLOUT. A zero entry means there is no possessified version.
6285            */
6286    
6287            if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6288              *tempcode = opcode_possessify[repcode];
6289    
6290            /* For opcodes without a special possessified version, wrap the item in
6291            ONCE brackets. Because we are moving code along, we must ensure that any
6292            pending recursive references are updated. */
6293    
6294            else
6295              {
6296              *code = OP_END;
6297              adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6298              memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6299              code += 1 + LINK_SIZE;
6300              len += 1 + LINK_SIZE;
6301              tempcode[0] = OP_ONCE;
6302              *code++ = OP_KET;
6303              PUTINC(code, 0, len);
6304              PUT(tempcode, 1, len);
6305              }
6306            }
6307    
6308    #ifdef NEVER
6309        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
6310          {          {
6311          case OP_STAR:  *tempcode = OP_POSSTAR; break;          case OP_STAR:  *tempcode = OP_POSSTAR; break;
# Line 5793  for (;; ptr++) Line 6333  for (;; ptr++)
6333          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6334          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6335    
6336            case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6337            case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6338            case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6339            case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6340    
6341          /* Because we are moving code along, we must ensure that any          /* Because we are moving code along, we must ensure that any
6342          pending recursive references are updated. */          pending recursive references are updated. */
6343    
# Line 5808  for (;; ptr++) Line 6353  for (;; ptr++)
6353          PUT(tempcode, 1, len);          PUT(tempcode, 1, len);
6354          break;          break;
6355          }          }
6356    #endif
6357        }        }
6358    
6359      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 5996  for (;; ptr++) Line 6542  for (;; ptr++)
6542          tempptr = ptr;          tempptr = ptr;
6543    
6544          /* A condition can be an assertion, a number (referring to a numbered          /* A condition can be an assertion, a number (referring to a numbered
6545          group), a name (referring to a named group), or 'R', referring to          group's having been set), a name (referring to a named group), or 'R',
6546          recursion. R<digits> and R&name are also permitted for recursion tests.          referring to recursion. R<digits> and R&name are also permitted for
6547            recursion tests.
6548          There are several syntaxes for testing a named group: (?(name)) is used  
6549          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).          There are ways of testing a named group: (?(name)) is used by Python;
6550            Perl 5.10 onwards uses (?(<name>) or (?('name')).
6551          There are two unfortunate ambiguities, caused by history. (a) 'R' can  
6552          be the recursive thing or the name 'R' (and similarly for 'R' followed          There is one unfortunate ambiguity, caused by history. 'R' can be the
6553          by digits), and (b) a number could be a name that consists of digits.          recursive thing or the name 'R' (and similarly for 'R' followed by
6554          In both cases, we look for a name first; if not found, we try the other          digits). We look for a name first; if not found, we try the other case.
         cases.  
6555    
6556          For compatibility with auto-callouts, we allow a callout to be          For compatibility with auto-callouts, we allow a callout to be
6557          specified before a condition that is an assertion. First, check for the          specified before a condition that is an assertion. First, check for the
# Line 6030  for (;; ptr++) Line 6575  for (;; ptr++)
6575                 tempptr[2] == CHAR_LESS_THAN_SIGN))                 tempptr[2] == CHAR_LESS_THAN_SIGN))
6576            break;            break;
6577    
6578          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6579          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6580    
6581          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
6582          skipbytes = 1+IMM2_SIZE;          skipbytes = 1+IMM2_SIZE;
# Line 6039  for (;; ptr++) Line 6584  for (;; ptr++)
6584    
6585          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
6586    
6587          if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)          ptr++;
6588            if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6589            {            {
6590            terminator = -1;            terminator = -1;
6591            ptr += 2;            ptr += 2;
# Line 6047  for (;; ptr++) Line 6593  for (;; ptr++)
6593            }            }
6594    
6595          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6596          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6597            syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6598    
6599          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (*ptr == CHAR_LESS_THAN_SIGN)
6600            {            {
6601            terminator = CHAR_GREATER_THAN_SIGN;            terminator = CHAR_GREATER_THAN_SIGN;
6602            ptr++;            ptr++;
6603            }            }
6604          else if (ptr[1] == CHAR_APOSTROPHE)          else if (*ptr == CHAR_APOSTROPHE)
6605            {            {
6606            terminator = CHAR_APOSTROPHE;            terminator = CHAR_APOSTROPHE;
6607            ptr++;            ptr++;
# Line 6062  for (;; ptr++) Line 6609  for (;; ptr++)
6609          else          else
6610            {            {
6611            terminator = CHAR_NULL;            terminator = CHAR_NULL;
6612            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6613                else if (IS_DIGIT(*ptr)) refsign = 0;
6614            }            }
6615    
6616          /* We now expect to read a name; any thing else is an error */          /* Handle a number */
6617    
6618          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (refsign >= 0)
6619            {            {
6620            ptr += 1;  /* To get the right offset */            recno = 0;
6621            *errorcodeptr = ERR28;            while (IS_DIGIT(*ptr))
6622            goto FAILED;              {
6623                recno = recno * 10 + (int)(*ptr - CHAR_0);
6624                ptr++;
6625                }
6626            }            }
6627    
6628          /* Read the name, but also get it as a number if it's all digits */          /* Otherwise we expect to read a name; anything else is an error. When
6629            a name is one of a number of duplicates, a different opcode is used and
6630            it needs more memory. Unfortunately we cannot tell whether a name is a
6631            duplicate in the first pass, so we have to allow for more memory. */
6632    
6633          recno = 0;          else
         name = ++ptr;  
         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)  
6634            {            {
6635            if (recno >= 0)            if (IS_DIGIT(*ptr))
6636              recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;              {
6637            ptr++;              *errorcodeptr = ERR84;
6638                goto FAILED;
6639                }
6640              if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6641                {
6642                *errorcodeptr = ERR28;   /* Assertion expected */
6643                goto FAILED;
6644                }
6645              name = ptr++;
6646              while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6647                {
6648                ptr++;
6649                }
6650              namelen = (int)(ptr - name);
6651              if (lengthptr != NULL) *lengthptr += IMM2_SIZE;
6652            }            }
6653          namelen = (int)(ptr - name);  
6654            /* Check the terminator */
6655    
6656          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6657              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6658            {            {
6659            ptr--;      /* Error offset */            ptr--;                  /* Error offset */
6660            *errorcodeptr = ERR26;            *errorcodeptr = ERR26;  /* Malformed number or name */
6661            goto FAILED;            goto FAILED;
6662            }            }
6663    
# Line 6099  for (;; ptr++) Line 6666  for (;; ptr++)
6666          if (lengthptr != NULL) break;          if (lengthptr != NULL) break;
6667    
6668          /* In the real compile we do the work of looking for the actual          /* In the real compile we do the work of looking for the actual
6669          reference. If the string started with "+" or "-" we require the rest to          reference. If refsign is not negative, it means we have a number in
6670          be digits, in which case recno will be set. */          recno. */
6671    
6672          if (refsign > 0)          if (refsign >= 0)
6673            {            {
6674            if (recno <= 0)            if (recno <= 0)
6675              {              {
6676              *errorcodeptr = ERR58;              *errorcodeptr = ERR35;
6677              goto FAILED;              goto FAILED;
6678              }              }
6679            recno = (refsign == CHAR_MINUS)?            if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6680              cd->bracount - recno + 1 : recno +cd->bracount;              cd->bracount - recno + 1 : recno + cd->bracount;
6681            if (recno <= 0 || recno > cd->final_bracount)            if (recno <= 0 || recno > cd->final_bracount)
6682              {              {
6683              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 6120  for (;; ptr++) Line 6687  for (;; ptr++)
6687            break;            break;
6688            }            }
6689    
6690          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise look for the name. */
         name. If we find a name, add one to the opcode to change OP_CREF or  
         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,  
         except they record that the reference was originally to a name. The  
         information is used to check duplicate names. */  
6691    
6692          slot = cd->name_table;          slot = cd->name_table;
6693          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 6133  for (;; ptr++) Line 6696  for (;; ptr++)
6696            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6697            }            }
6698    
6699          /* Found the named subpattern */          /* Found the named subpattern. If the name is duplicated, add one to
6700            the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6701            appropriate data values. Otherwise, just insert the unique subpattern
6702            number. */
6703    
6704          if (i < cd->names_found)          if (i < cd->names_found)
6705            {            {
6706            recno = GET2(slot, 0);            int offset = i++;
6707            PUT2(code, 2+LINK_SIZE, recno);            int count = 1;
6708            code[1+LINK_SIZE]++;            recno = GET2(slot, 0);   /* Number from first found */
6709              for (; i < cd->names_found; i++)
6710                {
6711                slot += cd->name_entry_size;
6712                if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6713                count++;
6714                }
6715              if (count > 1)
6716                {
6717                PUT2(code, 2+LINK_SIZE, offset);
6718                PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6719                skipbytes += IMM2_SIZE;
6720                code[1+LINK_SIZE]++;
6721                }
6722              else  /* Not a duplicated name */
6723                {
6724                PUT2(code, 2+LINK_SIZE, recno);
6725                }
6726            }            }
6727    
6728          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
6729          after the opening parenthesis [e.g. (?(abc)...] and in this case there          after the opening parenthesis [e.g. (?(abc)...] and in this case there
6730          are some further alternatives to try. For the cases where terminator !=          are some further alternatives to try. For the cases where terminator !=
6731          0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have          CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6732          now checked all the possibilities, so give an error. */          we have now checked all the possibilities, so give an error. */
6733    
6734          else if (terminator != CHAR_NULL)          else if (terminator != CHAR_NULL)
6735            {            {
# Line 6183  for (;; ptr++) Line 6766  for (;; ptr++)
6766            skipbytes = 1;            skipbytes = 1;
6767            }            }
6768    
6769          /* Check for the "name" actually being a subpattern number. We are          /* Reference to an unidentified subpattern. */
         in the second pass here, so final_bracount is set. */  
   
         else if (recno > 0 && recno <= cd->final_bracount)  
           {  
           PUT2(code, 2+LINK_SIZE, recno);  
           }  
   
         /* Either an unidentified subpattern, or a reference to (?(0) */  
6770    
6771          else          else
6772            {            {
6773            *errorcodeptr = (recno == 0)? ERR35: ERR15;            *errorcodeptr = ERR15;
6774            goto FAILED;            goto FAILED;
6775            }            }
6776          break;          break;
# Line 6208  for (;; ptr++) Line 6783  for (;; ptr++)
6783          ptr++;          ptr++;
6784          break;          break;
6785    
6786            /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6787            thing to do, but Perl allows all assertions to be quantified, and when
6788            they contain capturing parentheses there may be a potential use for
6789            this feature. Not that that applies to a quantified (?!) but we allow
6790            it for uniformity. */
6791    
6792          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
6793          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6794          ptr++;          ptr++;
6795          if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */          if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6796                 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6797                (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6798            {            {
6799            *code++ = OP_FAIL;            *code++ = OP_FAIL;
6800            previous = NULL;            previous = NULL;
# Line 6308  for (;; ptr++) Line 6890  for (;; ptr++)
6890          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6891            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6892          name = ++ptr;          name = ++ptr;
6893            if (IS_DIGIT(*ptr))
6894              {
6895              *errorcodeptr = ERR84;   /* Group name must start with non-digit */
6896              goto FAILED;
6897              }
6898          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6899          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6900    
# Line 6422  for (;; ptr++) Line 7008  for (;; ptr++)
7008    
7009          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
7010          name = ++ptr;          name = ++ptr;
7011            if (IS_DIGIT(*ptr))
7012              {
7013              *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7014              goto FAILED;
7015              }
7016          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7017          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
7018    
# Line 6814  for (;; ptr++) Line 7405  for (;; ptr++)
7405        skipbytes = IMM2_SIZE;        skipbytes = IMM2_SIZE;
7406        }        }
7407    
7408      /* Process nested bracketed regex. Assertions used not to be repeatable,      /* Process nested bracketed regex. First check for parentheses nested too
7409      but this was changed for Perl compatibility, so all kinds can now be      deeply. */
7410      repeated. We copy code into a non-register variable (tempcode) in order to  
7411      be able to pass its address because some compilers complain otherwise. */      if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7412          {
7413          *errorcodeptr = ERR82;
7414          goto FAILED;
7415          }
7416    
7417        /* Assertions used not to be repeatable, but this was changed for Perl
7418        compatibility, so all kinds can now be repeated. We copy code into a
7419        non-register variable (tempcode) in order to be able to pass its address
7420        because some compilers complain otherwise. */
7421    
7422      previous = code;                      /* For handling repetition */      previous = code;                      /* For handling repetition */
7423      *code = bravalue;      *code = bravalue;
# Line 6848  for (;; ptr++) Line 7448  for (;; ptr++)
7448           ))           ))
7449        goto FAILED;        goto FAILED;
7450    
7451        cd->parens_depth -= 1;
7452    
7453      /* If this was an atomic group and there are no capturing groups within it,      /* If this was an atomic group and there are no capturing groups within it,
7454      generate OP_ONCE_NC instead of OP_ONCE. */      generate OP_ONCE_NC instead of OP_ONCE. */
7455    
# Line 7062  for (;; ptr++) Line 7664  for (;; ptr++)
7664        if (escape == ESC_g)        if (escape == ESC_g)
7665          {          {
7666          const pcre_uchar *p;          const pcre_uchar *p;
7667            pcre_uint32 cf;
7668    
7669          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
7670          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7671            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7672    
7673          /* These two statements stop the compiler from warning about possibly          /* These two statements stop the compiler from warning about possibly
7674          unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In          unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7675          fact, because we actually check for a number below, the paths that          fact, because we do the check for a number below, the paths that
7676          would actually be in error are never taken. */          would actually be in error are never taken. */
7677    
7678          skipbytes = 0;          skipbytes = 0;
7679          reset_bracount = FALSE;          reset_bracount = FALSE;
7680    
7681          /* Test for a name */          /* If it's not a signed or unsigned number, treat it as a name. */
7682    
7683          if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)          cf = ptr[1];
7684            if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7685            {            {
           BOOL is_a_number = TRUE;  
           for (p = ptr + 1; *p != CHAR_NULL && *p != (pcre_uchar)terminator; p++)  
             {  
             if (!MAX_255(*p)) { is_a_number = FALSE; break; }  
             if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;  
             if ((cd->ctypes[*p] & ctype_word) == 0) break;  
             }  
           if (*p != (pcre_uchar)terminator)  
             {  
             *errorcodeptr = ERR57;  
             break;  
             }  
           if (is_a_number)  
             {  
             ptr++;  
             goto HANDLE_NUMERICAL_RECURSION;  
             }  
7686            is_recurse = TRUE;            is_recurse = TRUE;
7687            goto NAMED_REF_OR_RECURSE;            goto NAMED_REF_OR_RECURSE;
7688            }            }
7689    
7690          /* Test a signed number in angle brackets or quotes. */          /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7691            or a digit. */
7692    
7693          p = ptr + 2;          p = ptr + 2;
7694          while (IS_DIGIT(*p)) p++;          while (IS_DIGIT(*p)) p++;
# Line 7239  for (;; ptr++) Line 7828  for (;; ptr++)
7828    
7829      /* ===================================================================*/      /* ===================================================================*/
7830      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
7831      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in a UTF mode, it may be a
7832      multi-byte literal character. */      multi-unit literal character. */
7833    
7834      default:      default:
7835      NORMAL_CHAR:      NORMAL_CHAR:
# Line 7829  do { Line 8418  do {
8418       switch (*scode)       switch (*scode)
8419         {         {
8420         case OP_CREF:         case OP_CREF:
8421         case OP_NCREF:         case OP_DNCREF:
8422         case OP_RREF:         case OP_RREF:
8423         case OP_NRREF:         case OP_DNRREF:
8424         case OP_DEF:         case OP_DEF:
8425         return FALSE;         return FALSE;
8426    
# Line 8228  PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. * Line 8817  PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. *
8817      { skipatstart += 6; options |= PCRE_UTF8; continue; }      { skipatstart += 6; options |= PCRE_UTF8; continue; }
8818    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
8819      { skipatstart += 6; options |= PCRE_UCP; continue; }      { skipatstart += 6; options |= PCRE_UCP; continue; }
8820      else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
8821        { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
8822    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
8823      { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }      { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
8824    
# Line 8422  cd->named_group_list_size = NAMED_GROUP_ Line 9013  cd->named_group_list_size = NAMED_GROUP_
9013  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
9014  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9015  cd->req_varyopt = 0;  cd->req_varyopt = 0;
9016    cd->parens_depth = 0;
9017  cd->assert_depth = 0;  cd->assert_depth = 0;
9018  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
9019  cd->external_options = options;  cd->external_options = options;
# Line 8436  outside can help speed up starting point Line 9028  outside can help speed up starting point
9028  ptr += skipatstart;  ptr += skipatstart;
9029  code = cworkspace;  code = cworkspace;
9030  *code = OP_BRA;  *code = OP_BRA;
9031    
9032  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9033    FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,    FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9034    cd, &length);    cd, &length);
# Line 8507  field; this time it's used for rememberi Line 9100  field; this time it's used for rememberi
9100  */  */
9101    
9102  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9103    cd->parens_depth = 0;
9104  cd->assert_depth = 0;  cd->assert_depth = 0;
9105  cd->bracount = 0;  cd->bracount = 0;
9106  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
# Line 8612  if (errorcode == 0 && re->top_backref > Line 9206  if (errorcode == 0 && re->top_backref >
9206  /* Unless disabled, check whether single character iterators can be  /* Unless disabled, check whether single character iterators can be
9207  auto-possessified. The function overwrites the appropriate opcode values. */  auto-possessified. The function overwrites the appropriate opcode values. */
9208    
9209  if ((options & PCRE_NO_AUTO_POSSESSIFY) == 0)  if ((options & PCRE_NO_AUTO_POSSESS) == 0)
9210    auto_possessify((pcre_uchar *)codestart, utf, cd);    auto_possessify((pcre_uchar *)codestart, utf, cd);
9211    
9212  /* If there were any lookbehind assertions that contained OP_RECURSE  /* If there were any lookbehind assertions that contained OP_RECURSE
# Line 8836  return (pcre32 *)re; Line 9430  return (pcre32 *)re;
9430  }  }
9431    
9432  /* End of pcre_compile.c */  /* End of pcre_compile.c */
9433    

Legend:
Removed from v.1364  
changed lines
  Added in v.1411

  ViewVC Help
Powered by ViewVC 1.1.5