/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1365 by ph10, Sun Oct 6 18:33:56 2013 UTC revision 1414 by zherczeg, Sun Dec 22 16:27:35 2013 UTC
# Line 260  static const verbitem verbs[] = { Line 260  static const verbitem verbs[] = {
260  static const int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
261    
262    
263    /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
264    another regex library. */
265    
266    static const pcre_uchar sub_start_of_word[] = {
267      CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
268      CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
269    
270    static const pcre_uchar sub_end_of_word[] = {
271      CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
272      CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
273      CHAR_RIGHT_PARENTHESIS, '\0' };
274    
275    
276  /* Tables of names of POSIX character classes and their lengths. The names are  /* Tables of names of POSIX character classes and their lengths. The names are
277  now all in a single string, to reduce the number of relocations when a shared  now all in a single string, to reduce the number of relocations when a shared
278  library is dynamically loaded. The list of lengths is terminated by a zero  library is dynamically loaded. The list of lengths is terminated by a zero
279  length entry. The first three must be alpha, lower, upper, as this is assumed  length entry. The first three must be alpha, lower, upper, as this is assumed
280  for handling case independence. */  for handling case independence. The indices for graph, print, and punct are
281    needed, so identify them. */
282    
283  static const char posix_names[] =  static const char posix_names[] =
284    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
# Line 275  static const char posix_names[] = Line 289  static const char posix_names[] =
289  static const pcre_uint8 posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
290    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
291    
292    #define PC_GRAPH  8
293    #define PC_PRINT  9
294    #define PC_PUNCT 10
295    
296    
297  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
298  base map, with an optional addition or removal of another map. Then, for some  base map, with an optional addition or removal of another map. Then, for some
299  classes, there is some additional tweaking: for [:blank:] the vertical space  classes, there is some additional tweaking: for [:blank:] the vertical space
# Line 302  static const int posix_class_maps[] = { Line 321  static const int posix_class_maps[] = {
321    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
322  };  };
323    
324  /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class  /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
325  substitutes must be in the order of the names, defined above, and there are  Unicode property escapes. */
 both positive and negative cases. NULL means no substitute. */  
326    
327  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
328  static const pcre_uchar string_PNd[]  = {  static const pcre_uchar string_PNd[]  = {
# Line 329  static const pcre_uchar string_pXwd[] = Line 347  static const pcre_uchar string_pXwd[] =
347  static const pcre_uchar *substitutes[] = {  static const pcre_uchar *substitutes[] = {
348    string_PNd,           /* \D */    string_PNd,           /* \D */
349    string_pNd,           /* \d */    string_pNd,           /* \d */
350    string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */    string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
351    string_pXsp,          /* \s */    string_pXsp,          /* \s */   /* space and POSIX space are the same. */
352    string_PXwd,          /* \W */    string_PXwd,          /* \W */
353    string_pXwd           /* \w */    string_pXwd           /* \w */
354  };  };
355    
356    /* The POSIX class substitutes must be in the order of the POSIX class names,
357    defined above, and there are both positive and negative cases. NULL means no
358    general substitute of a Unicode property escape (\p or \P). However, for some
359    POSIX classes (e.g. graph, print, punct) a special property code is compiled
360    directly. */
361    
362  static const pcre_uchar string_pL[] =   {  static const pcre_uchar string_pL[] =   {
363    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
364    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
# Line 382  static const pcre_uchar *posix_substitut Line 406  static const pcre_uchar *posix_substitut
406    NULL,                 /* graph */    NULL,                 /* graph */
407    NULL,                 /* print */    NULL,                 /* print */
408    NULL,                 /* punct */    NULL,                 /* punct */
409    string_pXps,          /* space */    /* NOTE: Xps is POSIX space */    string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
410    string_pXwd,          /* word */    string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
411    NULL,                 /* xdigit */    NULL,                 /* xdigit */
412    /* Negated cases */    /* Negated cases */
413    string_PL,            /* ^alpha */    string_PL,            /* ^alpha */
# Line 397  static const pcre_uchar *posix_substitut Line 421  static const pcre_uchar *posix_substitut
421    NULL,                 /* ^graph */    NULL,                 /* ^graph */
422    NULL,                 /* ^print */    NULL,                 /* ^print */
423    NULL,                 /* ^punct */    NULL,                 /* ^punct */
424    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
425    string_PXwd,          /* ^word */    string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
426    NULL                  /* ^xdigit */    NULL                  /* ^xdigit */
427  };  };
428  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
# Line 462  static const char error_texts[] = Line 486  static const char error_texts[] =
486    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
487    "this version of PCRE is compiled without UTF support\0"    "this version of PCRE is compiled without UTF support\0"
488    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
489    "character value in \\x{...} sequence is too large\0"    "character value in \\x{} or \\o{} is too large\0"
490    /* 35 */    /* 35 */
491    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
492    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
# Line 516  static const char error_texts[] = Line 540  static const char error_texts[] =
540    "character value in \\u.... sequence is too large\0"    "character value in \\u.... sequence is too large\0"
541    "invalid UTF-32 string\0"    "invalid UTF-32 string\0"
542    "setting UTF is disabled by the application\0"    "setting UTF is disabled by the application\0"
543      "non-hex character in \\x{} (closing brace missing?)\0"
544      /* 80 */
545      "non-octal character in \\o{} (closing brace missing?)\0"
546      "missing opening brace after \\o\0"
547      "parentheses are too deeply nested\0"
548      "invalid range in character class\0"
549      "group name must start with a non-digit\0"
550    ;    ;
551    
552  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 773  static const pcre_uint8 posspropstab[3][ Line 804  static const pcre_uint8 posspropstab[3][
804    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
805  };  };
806    
807    /* This table is used when converting repeating opcodes into possessified
808    versions as a result of an explicit possessive quantifier such as ++. A zero
809    value means there is no possessified version - in those cases the item in
810    question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
811    because all relevant opcodes are less than that. */
812    
813    static const pcre_uint8 opcode_possessify[] = {
814      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
815      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
816    
817      0,                       /* NOTI */
818      OP_POSSTAR, 0,           /* STAR, MINSTAR */
819      OP_POSPLUS, 0,           /* PLUS, MINPLUS */
820      OP_POSQUERY, 0,          /* QUERY, MINQUERY */
821      OP_POSUPTO, 0,           /* UPTO, MINUPTO */
822      0,                       /* EXACT */
823      0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
824    
825      OP_POSSTARI, 0,          /* STARI, MINSTARI */
826      OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
827      OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
828      OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
829      0,                       /* EXACTI */
830      0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
831    
832      OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
833      OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
834      OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
835      OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
836      0,                       /* NOTEXACT */
837      0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
838    
839      OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
840      OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
841      OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
842      OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
843      0,                       /* NOTEXACTI */
844      0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
845    
846      OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
847      OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
848      OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
849      OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
850      0,                       /* TYPEEXACT */
851      0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
852    
853      OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
854      OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
855      OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
856      OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
857      0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
858    
859      0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
860      0, 0,                    /* REF, REFI */
861      0, 0,                    /* DNREF, DNREFI */
862      0, 0                     /* RECURSE, CALLOUT */
863    };
864    
865    
866    
867  /*************************************************  /*************************************************
# Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 968  return (*p == CHAR_RIGHT_CURLY_BRACKET);
968  *************************************************/  *************************************************/
969    
970  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
971  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
972  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
973  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
974  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
975  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
976    
977  Arguments:  Arguments:
978    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
979    chptr          points to the data character    chptr          points to a returned data character
980    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
981    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
982    options        the options bits    options        the options bits
# Line 1092  else Line 1180  else
1180      break;      break;
1181    
1182      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1183      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1184      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1185        recommended to avoid the ambiguities in the old syntax.
1186    
1187      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1188      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1189      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1190      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1191      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1192      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1193      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1194    
1195        Inside a character class, \ followed by a digit is always either a literal
1196        8 or 9 or an octal number. */
1197    
1198      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1199      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1128  else Line 1220  else
1220          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1221          break;          break;
1222          }          }
1223        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1224          {          {
1225          escape = -s;          escape = -s;
1226          break;          break;
# Line 1136  else Line 1228  else
1228        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1229        }        }
1230    
1231      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1232      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1233      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1234        changed so as not to insert the binary zero. */
1235    
1236      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1237        {  
1238        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1239    
1240      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1241      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1162  else Line 1252  else
1252  #endif  #endif
1253      break;      break;
1254    
1255      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1256      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.      specifying character codes in octal. The only supported form is \o{ddd}. */
1257      If not, { is treated as a data character. */  
1258        case CHAR_o:
1259        if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1260          {
1261          ptr += 2;
1262          c = 0;
1263          overflow = FALSE;
1264          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1265            {
1266            register pcre_uint32 cc = *ptr++;
1267            if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1268    #ifdef COMPILE_PCRE32
1269            if (c >= 0x20000000l) { overflow = TRUE; break; }
1270    #endif
1271            c = (c << 3) + cc - CHAR_0 ;
1272    #if defined COMPILE_PCRE8
1273            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1274    #elif defined COMPILE_PCRE16
1275            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1276    #elif defined COMPILE_PCRE32
1277            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1278    #endif
1279            }
1280          if (overflow)
1281            {
1282            while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1283            *errorcodeptr = ERR34;
1284            }
1285          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1286            {
1287            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1288            }
1289          else *errorcodeptr = ERR80;
1290          }
1291        break;
1292    
1293        /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1294        numbers. Otherwise it is a lowercase x letter. */
1295    
1296      case CHAR_x:      case CHAR_x:
1297      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1298        {        {
       /* In JavaScript, \x must be followed by two hexadecimal numbers.  
       Otherwise it is a lowercase x letter. */  
1299        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1300          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1301          {          {
# Line 1187  else Line 1312  else
1312  #endif  #endif
1313            }            }
1314          }          }
1315        break;        }    /* End JavaScript handling */
       }  
1316    
1317      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1318        {      greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1319        const pcre_uchar *pt = ptr + 2;      digits. If not, { used to be treated as a data character. However, Perl
1320        seems to read hex digits up to the first non-such, and ignore the rest, so
1321        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1322        now gives an error. */
1323    
1324        c = 0;      else
1325        overflow = FALSE;        {
1326        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1327          {          {
1328          register pcre_uint32 cc = *pt++;          ptr += 2;
1329          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          c = 0;
1330            overflow = FALSE;
1331            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1332              {
1333              register pcre_uint32 cc = *ptr++;
1334              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1335    
1336  #ifdef COMPILE_PCRE32  #ifdef COMPILE_PCRE32
1337          if (c >= 0x10000000l) { overflow = TRUE; break; }            if (c >= 0x10000000l) { overflow = TRUE; break; }
1338  #endif  #endif
1339    
1340  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1341          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1342          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1343  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1344          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1345          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1346  #endif  #endif
1347    
1348  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
1349          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1350  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
1351          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1352  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
1353          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1354  #endif  #endif
1355          }            }
1356    
1357        if (overflow)          if (overflow)
1358          {            {
1359          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1360          *errorcodeptr = ERR34;            *errorcodeptr = ERR34;
1361          }            }
1362    
1363        if (*pt == CHAR_RIGHT_CURLY_BRACKET)          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1364          {            {
1365          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1366          ptr = pt;            }
         break;  
         }  
1367    
1368        /* If the sequence of hex digits does not end with '}', then we don't          /* If the sequence of hex digits does not end with '}', give an error.
1369        recognize this construct; fall through to the normal \x handling. */          We used just to recognize this construct and fall through to the normal
1370        }          \x handling, but nowadays Perl gives an error, which seems much more
1371            sensible, so we do too. */
1372    
1373      /* Read just a single-byte hex-defined char */          else *errorcodeptr = ERR79;
1374            }   /* End of \x{} processing */
1375    
1376      c = 0;        /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1377      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)  
1378        {        else
1379        pcre_uint32 cc;                          /* Some compilers don't like */          {
1380        cc = *(++ptr);                           /* ++ in initializers */          c = 0;
1381            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1382              {
1383              pcre_uint32 cc;                          /* Some compilers don't like */
1384              cc = *(++ptr);                           /* ++ in initializers */
1385  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1386        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1387        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1388  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1389        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */            if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1390        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1391  #endif  #endif
1392        }            }
1393            }     /* End of \xdd handling */
1394          }       /* End of Perl-style \x handling */
1395      break;      break;
1396    
1397      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
# Line 1764  for (;;) Line 1902  for (;;)
1902    
1903      switch (*cc)      switch (*cc)
1904        {        {
       case OP_CRPLUS:  
       case OP_CRMINPLUS:  
1905        case OP_CRSTAR:        case OP_CRSTAR:
1906        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1907          case OP_CRPLUS:
1908          case OP_CRMINPLUS:
1909        case OP_CRQUERY:        case OP_CRQUERY:
1910        case OP_CRMINQUERY:        case OP_CRMINQUERY:
1911          case OP_CRPOSSTAR:
1912          case OP_CRPOSPLUS:
1913          case OP_CRPOSQUERY:
1914        return -1;        return -1;
1915    
1916        case OP_CRRANGE:        case OP_CRRANGE:
1917        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1918          case OP_CRPOSRANGE:
1919        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1920        branchlength += (int)GET2(cc,1);        branchlength += (int)GET2(cc,1);
1921        cc += 1 + 2 * IMM2_SIZE;        cc += 1 + 2 * IMM2_SIZE;
# Line 2366  for (code = first_significant_code(code Line 2508  for (code = first_significant_code(code
2508        case OP_CRMINSTAR:        case OP_CRMINSTAR:
2509        case OP_CRQUERY:        case OP_CRQUERY:
2510        case OP_CRMINQUERY:        case OP_CRMINQUERY:
2511          case OP_CRPOSSTAR:
2512          case OP_CRPOSQUERY:
2513        break;        break;
2514    
2515        default:                   /* Non-repeat => class must match */        default:                   /* Non-repeat => class must match */
2516        case OP_CRPLUS:            /* These repeats aren't empty */        case OP_CRPLUS:            /* These repeats aren't empty */
2517        case OP_CRMINPLUS:        case OP_CRMINPLUS:
2518          case OP_CRPOSPLUS:
2519        return FALSE;        return FALSE;
2520    
2521        case OP_CRRANGE:        case OP_CRRANGE:
2522        case OP_CRMINRANGE:        case OP_CRMINRANGE:
2523          case OP_CRPOSRANGE:
2524        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2525        break;        break;
2526        }        }
# Line 2653  switch(ptype) Line 2799  switch(ptype)
2799    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2800    means that Perl space and POSIX space are now identical. PCRE was changed    means that Perl space and POSIX space are now identical. PCRE was changed
2801    at release 8.34. */    at release 8.34. */
2802    
2803    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2804    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2805    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    switch(c)
2806            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||      {
2807            c == CHAR_FF || c == CHAR_CR)      HSPACE_CASES:
2808            == negated;      VSPACE_CASES:
2809        return negated;
2810    
2811        default:
2812        return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2813        }
2814      break;  /* Control never reaches here */
2815    
2816    case PT_WORD:    case PT_WORD:
2817    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
# Line 2708  get_chr_property_list(const pcre_uchar * Line 2860  get_chr_property_list(const pcre_uchar *
2860    const pcre_uint8 *fcc, pcre_uint32 *list)    const pcre_uint8 *fcc, pcre_uint32 *list)
2861  {  {
2862  pcre_uchar c = *code;  pcre_uchar c = *code;
2863    pcre_uchar base;
2864  const pcre_uchar *end;  const pcre_uchar *end;
 const pcre_uint32 *clist_src;  
 pcre_uint32 *clist_dest;  
2865  pcre_uint32 chr;  pcre_uint32 chr;
2866  pcre_uchar base;  
2867    #ifdef SUPPORT_UCP
2868    pcre_uint32 *clist_dest;
2869    const pcre_uint32 *clist_src;
2870    #else
2871    utf = utf;  /* Suppress "unused parameter" compiler warning */
2872    #endif
2873    
2874  list[0] = c;  list[0] = c;
2875  list[1] = FALSE;  list[1] = FALSE;
# Line 2818  switch(c) Line 2975  switch(c)
2975      return code + 2;      return code + 2;
2976      }      }
2977    
2978    /* Convert only if we have anough space. */    /* Convert only if we have enough space. */
2979    
2980    clist_src = PRIV(ucd_caseless_sets) + code[1];    clist_src = PRIV(ucd_caseless_sets) + code[1];
2981    clist_dest = list + 2;    clist_dest = list + 2;
2982    code += 2;    code += 2;
2983    
2984    do {    do {
      /* Early return if there is not enough space. */  
2985       if (clist_dest >= list + 8)       if (clist_dest >= list + 8)
2986         {         {
2987           /* Early return if there is not enough space. This should never
2988           happen, since all clists are shorter than 5 characters now. */
2989         list[2] = code[0];         list[2] = code[0];
2990         list[3] = code[1];         list[3] = code[1];
2991         return code;         return code;
2992         }         }
2993       *clist_dest++ = *clist_src;       *clist_dest++ = *clist_src;
2994       }       }
2995     while(*clist_src++ != NOTACHAR);    while(*clist_src++ != NOTACHAR);
2996    
2997    /* Enough space to store all characters. */    /* All characters are stored. The terminating NOTACHAR
2998      is copied from the clist itself. */
2999    
3000    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3001    return code;    return code;
# Line 2846  switch(c) Line 3005  switch(c)
3005    case OP_CLASS:    case OP_CLASS:
3006  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3007    case OP_XCLASS:    case OP_XCLASS:
   
3008    if (c == OP_XCLASS)    if (c == OP_XCLASS)
3009      end = code + GET(code, 0);      end = code + GET(code, 0) - 1;
3010    else    else
3011  #endif  #endif
3012      end = code + 32 / sizeof(pcre_uchar);      end = code + 32 / sizeof(pcre_uchar);
# Line 2859  switch(c) Line 3017  switch(c)
3017      case OP_CRMINSTAR:      case OP_CRMINSTAR:
3018      case OP_CRQUERY:      case OP_CRQUERY:
3019      case OP_CRMINQUERY:      case OP_CRMINQUERY:
3020        case OP_CRPOSSTAR:
3021        case OP_CRPOSQUERY:
3022      list[1] = TRUE;      list[1] = TRUE;
3023      end++;      end++;
3024      break;      break;
3025    
3026        case OP_CRPLUS:
3027        case OP_CRMINPLUS:
3028        case OP_CRPOSPLUS:
3029        end++;
3030        break;
3031    
3032      case OP_CRRANGE:      case OP_CRRANGE:
3033      case OP_CRMINRANGE:      case OP_CRMINRANGE:
3034        case OP_CRPOSRANGE:
3035      list[1] = (GET2(end, 1) == 0);      list[1] = (GET2(end, 1) == 0);
3036      end += 1 + 2 * IMM2_SIZE;      end += 1 + 2 * IMM2_SIZE;
3037      break;      break;
# Line 2895  Returns:      TRUE if the auto-possessif Line 3062  Returns:      TRUE if the auto-possessif
3062    
3063  static BOOL  static BOOL
3064  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3065    const pcre_uint32* base_list)    const pcre_uint32 *base_list, const pcre_uchar *base_end)
3066  {  {
3067  pcre_uchar c;  pcre_uchar c;
3068  pcre_uint32 list[8];  pcre_uint32 list[8];
3069  const pcre_uint32* chr_ptr;  const pcre_uint32 *chr_ptr;
3070  const pcre_uint32* ochr_ptr;  const pcre_uint32 *ochr_ptr;
3071  const pcre_uint32* list_ptr;  const pcre_uint32 *list_ptr;
3072    const pcre_uchar *next_code;
3073    const pcre_uint8 *class_bitset;
3074    const pcre_uint32 *set1, *set2, *set_end;
3075  pcre_uint32 chr;  pcre_uint32 chr;
3076    BOOL accepted, invert_bits;
3077    
3078    /* Note: base_list[1] records whether the current opcode has a greedy
3079    quantifier (represented by a non-zero value). This is different from
3080    other character type lists, which store here whether the character iterator
3081    can match an empty string (also represented by a non-zero value). */
3082    
3083  for(;;)  for(;;)
3084    {    {
3085      /* All operations move the code pointer forward.
3086      Therefore infinite recursions are not possible. */
3087    
3088    c = *code;    c = *code;
3089    
3090    /* Skip over callouts */    /* Skip over callouts */
# Line 2925  for(;;) Line 3104  for(;;)
3104    switch(c)    switch(c)
3105      {      {
3106      case OP_END:      case OP_END:
3107      /* TRUE only in greedy case. The non-greedy case could be replaced by an      case OP_KETRPOS:
3108      OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses      /* TRUE only in greedy case. The non-greedy case could be replaced by
3109      more memory, which we cannot get at this stage.) */      an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3110        uses more memory, which we cannot get at this stage.) */
3111    
3112      return base_list[1] != 0;      return base_list[1] != 0;
3113    
3114      case OP_KET:      case OP_KET:
3115      /* If the bracket is capturing, and referenced by an OP_RECURSE, the      /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3116      non-greedy case cannot be converted to a possessive form. We do not test      it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3117      the bracket type at the moment, but we might do it in the future to improve      cannot be converted to a possessive form. */
     this condition. (But note that recursive calls are always atomic.) */  
3118    
3119      if (base_list[1] == 0) return FALSE;      if (base_list[1] == 0) return FALSE;
3120    
3121        switch(*(code - GET(code, 1)))
3122          {
3123          case OP_ASSERT:
3124          case OP_ASSERT_NOT:
3125          case OP_ASSERTBACK:
3126          case OP_ASSERTBACK_NOT:
3127          case OP_ONCE:
3128          case OP_ONCE_NC:
3129          /* Atomic sub-patterns and assertions can always auto-possessify their
3130          last iterator. */
3131          return TRUE;
3132          }
3133    
3134        code += PRIV(OP_lengths)[c];
3135        continue;
3136    
3137        case OP_ONCE:
3138        case OP_ONCE_NC:
3139        case OP_BRA:
3140        case OP_CBRA:
3141        next_code = code + GET(code, 1);
3142        code += PRIV(OP_lengths)[c];
3143    
3144        while (*next_code == OP_ALT)
3145          {
3146          if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3147          code = next_code + 1 + LINK_SIZE;
3148          next_code += GET(next_code, 1);
3149          }
3150        continue;
3151    
3152        case OP_BRAZERO:
3153        case OP_BRAMINZERO:
3154    
3155        next_code = code + 1;
3156        if (*next_code != OP_BRA && *next_code != OP_CBRA
3157            && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3158    
3159        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3160    
3161        /* The bracket content will be checked by the
3162        OP_BRA/OP_CBRA case above. */
3163        next_code += 1 + LINK_SIZE;
3164        if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3165          return FALSE;
3166    
3167      code += PRIV(OP_lengths)[c];      code += PRIV(OP_lengths)[c];
3168      continue;      continue;
3169      }      }
# Line 2961  for(;;) Line 3187  for(;;)
3187      list_ptr = base_list;      list_ptr = base_list;
3188      }      }
3189    
3190      /* Character bitsets can also be compared to certain opcodes. */
3191    
3192      else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3193    #ifdef COMPILE_PCRE8
3194          /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3195          || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3196    #endif
3197          )
3198        {
3199    #ifdef COMPILE_PCRE8
3200        if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3201    #else
3202        if (base_list[0] == OP_CLASS)
3203    #endif
3204          {
3205          set1 = (pcre_uint32 *)(base_end - base_list[2]);
3206          list_ptr = list;
3207          }
3208        else
3209          {
3210          set1 = (pcre_uint32 *)(code - list[2]);
3211          list_ptr = base_list;
3212          }
3213    
3214        invert_bits = FALSE;
3215        switch(list_ptr[0])
3216          {
3217          case OP_CLASS:
3218          case OP_NCLASS:
3219          set2 = (pcre_uint32 *)
3220            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3221          break;
3222    
3223          /* OP_XCLASS cannot be supported here, because its bitset
3224          is not necessarily complete. E.g: [a-\0x{200}] is stored
3225          as a character range, and the appropriate bits are not set. */
3226    
3227          case OP_NOT_DIGIT:
3228            invert_bits = TRUE;
3229            /* Fall through */
3230          case OP_DIGIT:
3231            set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
3232            break;
3233    
3234          case OP_NOT_WHITESPACE:
3235            invert_bits = TRUE;
3236            /* Fall through */
3237          case OP_WHITESPACE:
3238            set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
3239            break;
3240    
3241          case OP_NOT_WORDCHAR:
3242            invert_bits = TRUE;
3243            /* Fall through */
3244          case OP_WORDCHAR:
3245            set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
3246            break;
3247    
3248          default:
3249          return FALSE;
3250          }
3251    
3252        /* Compare 4 bytes to improve speed. */
3253        set_end = set1 + (32 / 4);
3254        if (invert_bits)
3255          {
3256          do
3257            {
3258            if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3259            }
3260          while (set1 < set_end);
3261          }
3262        else
3263          {
3264          do
3265            {
3266            if ((*set1++ & *set2++) != 0) return FALSE;
3267            }
3268          while (set1 < set_end);
3269          }
3270    
3271        if (list[1] == 0) return TRUE;
3272        /* Might be an empty repeat. */
3273        continue;
3274        }
3275    
3276    /* Some property combinations are also acceptable. Unicode property opcodes are    /* Some property combinations are also acceptable. Unicode property opcodes are
3277    processed specially; the rest can be handled with a lookup table. */    processed specially; the rest can be handled with a lookup table. */
3278    
# Line 2968  for(;;) Line 3280  for(;;)
3280      {      {
3281      pcre_uint32 leftop, rightop;      pcre_uint32 leftop, rightop;
3282    
     if (list[1] != 0) return FALSE;   /* Must match at least one character */  
3283      leftop = base_list[0];      leftop = base_list[0];
3284      rightop = list[0];      rightop = list[0];
3285    
3286  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3287        accepted = FALSE; /* Always set in non-unicode case. */
3288      if (leftop == OP_PROP || leftop == OP_NOTPROP)      if (leftop == OP_PROP || leftop == OP_NOTPROP)
3289        {        {
3290        if (rightop == OP_EOD) return TRUE;        if (rightop == OP_EOD)
3291        if (rightop == OP_PROP || rightop == OP_NOTPROP)          accepted = TRUE;
3292          else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3293          {          {
3294          int n;          int n;
3295          const pcre_uint8 *p;          const pcre_uint8 *p;
# Line 2997  for(;;) Line 3310  for(;;)
3310          n = propposstab[base_list[2]][list[2]];          n = propposstab[base_list[2]][list[2]];
3311          switch(n)          switch(n)
3312            {            {
3313            case 0: return FALSE;            case 0: break;
3314            case 1: return bothprop;            case 1: accepted = bothprop; break;
3315            case 2: return (base_list[3] == list[3]) != same;            case 2: accepted = (base_list[3] == list[3]) != same; break;
3316            case 3: return !same;            case 3: accepted = !same; break;
3317    
3318            case 4:  /* Left general category, right particular category */            case 4:  /* Left general category, right particular category */
3319            return risprop && catposstab[base_list[3]][list[3]] == same;            accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3320              break;
3321    
3322            case 5:  /* Right general category, left particular category */            case 5:  /* Right general category, left particular category */
3323            return lisprop && catposstab[list[3]][base_list[3]] == same;            accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3324              break;
3325    
3326            /* This code is logically tricky. Think hard before fiddling with it.            /* This code is logically tricky. Think hard before fiddling with it.
3327            The posspropstab table has four entries per row. Each row relates to            The posspropstab table has four entries per row. Each row relates to
3328            one of PCRE's special properties such as ALNUM or SPACE or WORD.            one of PCRE's special properties such as ALNUM or SPACE or WORD.
3329            Only WORD actually needs all four entries, but using repeats for the            Only WORD actually needs all four entries, but using repeats for the
3330            others means they can all use the same code below.            others means they can all use the same code below.
3331    
3332            The first two entries in each row are Unicode general categories, and            The first two entries in each row are Unicode general categories, and
3333            apply always, because all the characters they include are part of the            apply always, because all the characters they include are part of the
3334            PCRE character set. The third and fourth entries are a general and a            PCRE character set. The third and fourth entries are a general and a
# Line 3023  for(;;) Line 3338  for(;;)
3338            category contains more characters than the specials that are defined            category contains more characters than the specials that are defined
3339            for the property being tested against. Therefore, it cannot be used            for the property being tested against. Therefore, it cannot be used
3340            in a NOTPROP case.            in a NOTPROP case.
3341    
3342            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3343            Underscore is covered by ucp_P or ucp_Po. */            Underscore is covered by ucp_P or ucp_Po. */
3344    
# Line 3031  for(;;) Line 3346  for(;;)
3346            case 7:  /* Left space vs right general category */            case 7:  /* Left space vs right general category */
3347            case 8:  /* Left word vs right general category */            case 8:  /* Left word vs right general category */
3348            p = posspropstab[n-6];            p = posspropstab[n-6];
3349            return risprop && lisprop ==            accepted = risprop && lisprop ==
3350              (list[3] != p[0] &&              (list[3] != p[0] &&
3351               list[3] != p[1] &&               list[3] != p[1] &&
3352              (list[3] != p[2] || !lisprop));              (list[3] != p[2] || !lisprop));
3353              break;
3354    
3355            case 9:   /* Right alphanum vs left general category */            case 9:   /* Right alphanum vs left general category */
3356            case 10:  /* Right space vs left general category */            case 10:  /* Right space vs left general category */
3357            case 11:  /* Right word vs left general category */            case 11:  /* Right word vs left general category */
3358            p = posspropstab[n-9];            p = posspropstab[n-9];
3359            return lisprop && risprop ==            accepted = lisprop && risprop ==
3360              (base_list[3] != p[0] &&              (base_list[3] != p[0] &&
3361               base_list[3] != p[1] &&               base_list[3] != p[1] &&
3362              (base_list[3] != p[2] || !risprop));              (base_list[3] != p[2] || !risprop));
3363              break;
3364    
3365            case 12:  /* Left alphanum vs right particular category */            case 12:  /* Left alphanum vs right particular category */
3366            case 13:  /* Left space vs right particular category */            case 13:  /* Left space vs right particular category */
3367            case 14:  /* Left word vs right particular category */            case 14:  /* Left word vs right particular category */
3368            p = posspropstab[n-12];            p = posspropstab[n-12];
3369            return risprop && lisprop ==            accepted = risprop && lisprop ==
3370              (catposstab[p[0]][list[3]] &&              (catposstab[p[0]][list[3]] &&
3371               catposstab[p[1]][list[3]] &&               catposstab[p[1]][list[3]] &&
3372              (list[3] != p[3] || !lisprop));              (list[3] != p[3] || !lisprop));
3373              break;
3374    
3375            case 15:  /* Right alphanum vs left particular category */            case 15:  /* Right alphanum vs left particular category */
3376            case 16:  /* Right space vs left particular category */            case 16:  /* Right space vs left particular category */
3377            case 17:  /* Right word vs left particular category */            case 17:  /* Right word vs left particular category */
3378            p = posspropstab[n-15];            p = posspropstab[n-15];
3379            return lisprop && risprop ==            accepted = lisprop && risprop ==
3380              (catposstab[p[0]][base_list[3]] &&              (catposstab[p[0]][base_list[3]] &&
3381               catposstab[p[1]][base_list[3]] &&               catposstab[p[1]][base_list[3]] &&
3382              (base_list[3] != p[3] || !risprop));              (base_list[3] != p[3] || !risprop));
3383              break;
3384            }            }
3385          }          }
       return FALSE;  
3386        }        }
3387    
3388      else      else
3389  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
3390    
3391      return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&      accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3392             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3393             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3394    
3395        if (!accepted)
3396          return FALSE;
3397    
3398        if (list[1] == 0) return TRUE;
3399        /* Might be an empty repeat. */
3400        continue;
3401      }      }
3402    
3403    /* Control reaches here only if one of the items is a small character list.    /* Control reaches here only if one of the items is a small character list.
# Line 3186  for(;;) Line 3511  for(;;)
3511        case OP_EOD:    /* Can always possessify before \z */        case OP_EOD:    /* Can always possessify before \z */
3512        break;        break;
3513    
3514    #ifdef SUPPORT_UCP
3515        case OP_PROP:        case OP_PROP:
3516        case OP_NOTPROP:        case OP_NOTPROP:
3517        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3518              list_ptr[0] == OP_NOTPROP))              list_ptr[0] == OP_NOTPROP))
3519          return FALSE;          return FALSE;
3520        break;        break;
3521    #endif
       /* The class comparisons work only when the class is the second item  
       of the pair, because there are at present no possessive forms of the  
       class opcodes. Note also that the "code" variable that is used below  
       points after the second item, and that the pointer for the first item  
       is not available, so even if there were possessive forms of the class  
       opcodes, the correct comparison could not be done. */  
3522    
3523        case OP_NCLASS:        case OP_NCLASS:
3524        if (chr > 255) return FALSE;        if (chr > 255) return FALSE;
3525        /* Fall through */        /* Fall through */
3526    
3527        case OP_CLASS:        case OP_CLASS:
       if (list_ptr != list) return FALSE;   /* Class is first opcode */  
3528        if (chr > 255) break;        if (chr > 255) break;
3529        if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)        class_bitset = (pcre_uint8 *)
3530          return FALSE;          ((list_ptr == list ? code : base_end) - list_ptr[2]);
3531          if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3532        break;        break;
3533    
3534  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3535        case OP_XCLASS:        case OP_XCLASS:
3536        if (list_ptr != list) return FALSE;   /* Class is first opcode */        if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3537        if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))            list_ptr[2] + LINK_SIZE, utf)) return FALSE;
         return FALSE;  
3538        break;        break;
3539  #endif  #endif
3540    
# Line 3257  auto_possessify(pcre_uchar *code, BOOL u Line 3576  auto_possessify(pcre_uchar *code, BOOL u
3576  {  {
3577  register pcre_uchar c;  register pcre_uchar c;
3578  const pcre_uchar *end;  const pcre_uchar *end;
3579    pcre_uchar *repeat_opcode;
3580  pcre_uint32 list[8];  pcre_uint32 list[8];
3581    
3582  for (;;)  for (;;)
# Line 3270  for (;;) Line 3590  for (;;)
3590        get_chr_property_list(code, utf, cd->fcc, list) : NULL;        get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3591      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3592    
3593      if (end != NULL && compare_opcodes(end, utf, cd, list))      if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3594        {        {
3595        switch(c)        switch(c)
3596          {          {
# Line 3309  for (;;) Line 3629  for (;;)
3629        }        }
3630      c = *code;      c = *code;
3631      }      }
3632      else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3633        {
3634    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3635        if (c == OP_XCLASS)
3636          repeat_opcode = code + GET(code, 1);
3637        else
3638    #endif
3639          repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3640    
3641        c = *repeat_opcode;
3642        if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3643          {
3644          /* end must not be NULL. */
3645          end = get_chr_property_list(code, utf, cd->fcc, list);
3646    
3647          list[1] = (c & 1) == 0;
3648    
3649          if (compare_opcodes(end, utf, cd, list, end))
3650            {
3651            switch (c)
3652              {
3653              case OP_CRSTAR:
3654              case OP_CRMINSTAR:
3655              *repeat_opcode = OP_CRPOSSTAR;
3656              break;
3657    
3658              case OP_CRPLUS:
3659              case OP_CRMINPLUS:
3660              *repeat_opcode = OP_CRPOSPLUS;
3661              break;
3662    
3663              case OP_CRQUERY:
3664              case OP_CRMINQUERY:
3665              *repeat_opcode = OP_CRPOSQUERY;
3666              break;
3667    
3668              case OP_CRRANGE:
3669              case OP_CRMINRANGE:
3670              *repeat_opcode = OP_CRPOSRANGE;
3671              break;
3672              }
3673            }
3674          }
3675        c = *code;
3676        }
3677    
3678    switch(c)    switch(c)
3679      {      {
# Line 3335  for (;;) Line 3700  for (;;)
3700        code += 2;        code += 2;
3701      break;      break;
3702    
3703    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3704      case OP_XCLASS:      case OP_XCLASS:
3705      code += GET(code, 1);      code += GET(code, 1);
3706      break;      break;
3707    #endif
3708    
3709      case OP_MARK:      case OP_MARK:
3710      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
# Line 3446  class, but [abc[:x\]pqr:]] is (so that a Line 3813  class, but [abc[:x\]pqr:]] is (so that a
3813  below handles the special case of \], but does not try to do any other escape  below handles the special case of \], but does not try to do any other escape
3814  processing. This makes it different from Perl for cases such as [:l\ower:]  processing. This makes it different from Perl for cases such as [:l\ower:]
3815  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3816  "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3817  I think.  I think.
3818    
3819  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
# Line 3738  add_to_class(pcre_uint8 *classbits, pcre Line 4105  add_to_class(pcre_uint8 *classbits, pcre
4105    compile_data *cd, pcre_uint32 start, pcre_uint32 end)    compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4106  {  {
4107  pcre_uint32 c;  pcre_uint32 c;
4108    pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4109  int n8 = 0;  int n8 = 0;
4110    
4111  /* If caseless matching is required, scan the range and process alternate  /* If caseless matching is required, scan the range and process alternate
# Line 3781  if ((options & PCRE_CASELESS) != 0) Line 4149  if ((options & PCRE_CASELESS) != 0)
4149    
4150    /* Not UTF-mode, or no UCP */    /* Not UTF-mode, or no UCP */
4151    
4152    for (c = start; c <= end && c < 256; c++)    for (c = start; c <= classbits_end; c++)
4153      {      {
4154      SETBIT(classbits, cd->fcc[c]);      SETBIT(classbits, cd->fcc[c]);
4155      n8++;      n8++;
# Line 3806  in all cases. */ Line 4174  in all cases. */
4174    
4175  #endif /* COMPILE_PCRE[8|16] */  #endif /* COMPILE_PCRE[8|16] */
4176    
4177  /* If all characters are less than 256, use the bit map. Otherwise use extra  /* Use the bitmap for characters < 256. Otherwise use extra data. */
 data. */  
4178    
4179  if (end < 0x100)  for (c = start; c <= classbits_end; c++)
4180    {    {
4181    for (c = start; c <= end; c++)    /* Regardless of start, c will always be <= 255. */
4182      {    SETBIT(classbits, c);
4183      n8++;    n8++;
     SETBIT(classbits, c);  
     }  
4184    }    }
4185    
4186  else  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4187    {  if (start <= 0xff) start = 0xff + 1;
4188    
4189    if (end >= start) {
4190    pcre_uchar *uchardata = *uchardptr;    pcre_uchar *uchardata = *uchardptr;
4191    
4192  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
# Line 3861  else Line 4228  else
4228    
4229    *uchardptr = uchardata;   /* Update extra data pointer */    *uchardptr = uchardata;   /* Update extra data pointer */
4230    }    }
4231    #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4232    
4233  return n8;    /* Number of 8-bit characters */  return n8;    /* Number of 8-bit characters */
4234  }  }
# Line 4082  for (;; ptr++) Line 4450  for (;; ptr++)
4450    BOOL reset_bracount;    BOOL reset_bracount;
4451    int class_has_8bitchar;    int class_has_8bitchar;
4452    int class_one_char;    int class_one_char;
4453    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4454      BOOL xclass_has_prop;
4455    #endif
4456    int newoptions;    int newoptions;
4457    int recno;    int recno;
4458    int refsign;    int refsign;
# Line 4199  for (;; ptr++) Line 4570  for (;; ptr++)
4570          }          }
4571        goto NORMAL_CHAR;        goto NORMAL_CHAR;
4572        }        }
4573        /* Control does not reach here. */
4574      }      }
4575    
4576    /* Fill in length of a previous callout, except when the next thing is    /* In extended mode, skip white space and comments. We need a loop in order
4577    a quantifier. */    to check for more white space and more comments after a comment. */
   
   is_quantifier =  
     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||  
     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));  
   
   if (!is_quantifier && previous_callout != NULL &&  
        after_manual_callout-- <= 0)  
     {  
     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */  
       complete_callout(previous_callout, ptr, cd);  
     previous_callout = NULL;  
     }  
   
   /* In extended mode, skip white space and comments. */  
4578    
4579    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
4580      {      {
4581      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;      for (;;)
     if (c == CHAR_NUMBER_SIGN)  
4582        {        {
4583          while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4584          if (c != CHAR_NUMBER_SIGN) break;
4585        ptr++;        ptr++;
4586        while (*ptr != CHAR_NULL)        while (*ptr != CHAR_NULL)
4587          {          {
4588          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4589              {                          /* IS_NEWLINE sets cd->nllen. */
4590              ptr += cd->nllen;
4591              break;
4592              }
4593          ptr++;          ptr++;
4594  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4595          if (utf) FORWARDCHAR(ptr);          if (utf) FORWARDCHAR(ptr);
4596  #endif  #endif
4597          }          }
4598        if (*ptr != CHAR_NULL) continue;        c = *ptr;     /* Either NULL or the char after a newline */
   
       /* Else fall through to handle end of string */  
       c = 0;  
4599        }        }
4600      }      }
4601    
4602    /* No auto callout for quantifiers. */    /* See if the next thing is a quantifier. */
4603    
4604      is_quantifier =
4605        c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4606        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4607    
4608      /* Fill in length of a previous callout, except when the next thing is a
4609      quantifier or when processing a property substitution string in UCP mode. */
4610    
4611      if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4612           after_manual_callout-- <= 0)
4613        {
4614        if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4615          complete_callout(previous_callout, ptr, cd);
4616        previous_callout = NULL;
4617        }
4618    
4619      /* Create auto callout, except for quantifiers, or while processing property
4620      strings that are substituted for \w etc in UCP mode. */
4621    
4622    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4623      {      {
4624      previous_callout = code;      previous_callout = code;
4625      code = auto_callout(code, ptr, cd);      code = auto_callout(code, ptr, cd);
4626      }      }
4627    
4628      /* Process the next pattern item. */
4629    
4630    switch(c)    switch(c)
4631      {      {
4632      /* ===================================================================*/      /* ===================================================================*/
4633      case 0:                        /* The branch terminates at string end */      case CHAR_NULL:                /* The branch terminates at string end */
4634      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
4635      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
4636      *firstcharptr = firstchar;      *firstcharptr = firstchar;
# Line 4328  for (;; ptr++) Line 4708  for (;; ptr++)
4708        }        }
4709      goto NORMAL_CHAR;      goto NORMAL_CHAR;
4710    
4711        /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4712        used for "start of word" and "end of word". As these are otherwise illegal
4713        sequences, we don't break anything by recognizing them. They are replaced
4714        by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4715        erroneous and are handled by the normal code below. */
4716    
4717      case CHAR_LEFT_SQUARE_BRACKET:      case CHAR_LEFT_SQUARE_BRACKET:
4718        if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4719          {
4720          nestptr = ptr + 7;
4721          ptr = sub_start_of_word - 1;
4722          continue;
4723          }
4724    
4725        if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4726          {
4727          nestptr = ptr + 7;
4728          ptr = sub_end_of_word - 1;
4729          continue;
4730          }
4731    
4732        /* Handle a real character class. */
4733    
4734      previous = code;      previous = code;
4735    
4736      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
# Line 4385  for (;; ptr++) Line 4787  for (;; ptr++)
4787    
4788      should_flip_negation = FALSE;      should_flip_negation = FALSE;
4789    
4790        /* Extended class (xclass) will be used when characters > 255
4791        might match. */
4792    
4793    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4794        xclass = FALSE;
4795        class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4796        class_uchardata_base = class_uchardata;   /* Save the start */
4797    #endif
4798    
4799      /* For optimization purposes, we track some properties of the class:      /* For optimization purposes, we track some properties of the class:
4800      class_has_8bitchar will be non-zero if the class contains at least one <      class_has_8bitchar will be non-zero if the class contains at least one <
4801      256 character; class_one_char will be 1 if the class contains just one      256 character; class_one_char will be 1 if the class contains just one
4802      character. */      character; xclass_has_prop will be TRUE if unicode property checks
4803        are present in the class. */
4804    
4805      class_has_8bitchar = 0;      class_has_8bitchar = 0;
4806      class_one_char = 0;      class_one_char = 0;
4807    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4808        xclass_has_prop = FALSE;
4809    #endif
4810    
4811      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
4812      temporary bit of memory, in case the class contains fewer than two      temporary bit of memory, in case the class contains fewer than two
# Line 4400  for (;; ptr++) Line 4815  for (;; ptr++)
4815    
4816      memset(classbits, 0, 32 * sizeof(pcre_uint8));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
4817    
 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  
     xclass = FALSE;  
     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */  
     class_uchardata_base = class_uchardata;   /* Save the start */  
 #endif  
   
4818      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
4819      means that an initial ] is taken as a data character. At the start of the      means that an initial ] is taken as a data character. At the start of the
4820      loop, c contains the first byte of the character. */      loop, c contains the first byte of the character. */
# Line 4493  for (;; ptr++) Line 4902  for (;; ptr++)
4902            posix_class = 0;            posix_class = 0;
4903    
4904          /* When PCRE_UCP is set, some of the POSIX classes are converted to          /* When PCRE_UCP is set, some of the POSIX classes are converted to
4905          different escape sequences that use Unicode properties. */          different escape sequences that use Unicode properties \p or \P. Others
4906            that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4907            directly. */
4908    
4909  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4910          if ((options & PCRE_UCP) != 0)          if ((options & PCRE_UCP) != 0)
4911            {            {
4912              unsigned int ptype = 0;
4913            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4914    
4915              /* The posix_substitutes table specifies which POSIX classes can be
4916              converted to \p or \P items. */
4917    
4918            if (posix_substitutes[pc] != NULL)            if (posix_substitutes[pc] != NULL)
4919              {              {
4920              nestptr = tempptr + 1;              nestptr = tempptr + 1;
4921              ptr = posix_substitutes[pc] - 1;              ptr = posix_substitutes[pc] - 1;
4922              continue;              continue;
4923              }              }
4924    
4925              /* There are three other classes that generate special property calls
4926              that are recognized only in an XCLASS. */
4927    
4928              else switch(posix_class)
4929                {
4930                case PC_GRAPH:
4931                ptype = PT_PXGRAPH;
4932                /* Fall through */
4933                case PC_PRINT:
4934                if (ptype == 0) ptype = PT_PXPRINT;
4935                /* Fall through */
4936                case PC_PUNCT:
4937                if (ptype == 0) ptype = PT_PXPUNCT;
4938                *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4939                *class_uchardata++ = ptype;
4940                *class_uchardata++ = 0;
4941                xclass_has_prop = TRUE;
4942                ptr = tempptr + 1;
4943                continue;
4944    
4945                /* For all other POSIX classes, no special action is taken in UCP
4946                mode. Fall through to the non_UCP case. */
4947    
4948                default:
4949                break;
4950                }
4951            }            }
4952  #endif  #endif
4953          /* In the non-UCP case, we build the bit map for the POSIX class in a          /* In the non-UCP case, or when UCP makes no difference, we build the
4954          chunk of local store because we may be adding and subtracting from it,          bit map for the POSIX class in a chunk of local store because we may be
4955          and we don't want to subtract bits that may be in the main map already.          adding and subtracting from it, and we don't want to subtract bits that
4956          At the end we or the result into the bit map that is being built. */          may be in the main map already. At the end we or the result into the
4957            bit map that is being built. */
4958    
4959          posix_class *= 3;          posix_class *= 3;
4960    
# Line 4631  for (;; ptr++) Line 5075  for (;; ptr++)
5075              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5076              previously set by something earlier in the character class.              previously set by something earlier in the character class.
5077              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5078              we could just adjust the appropriate bit. From PCRE 8.34 we no              we could just adjust the appropriate bit. From PCRE 8.34 we no
5079              longer treat \s and \S specially. */              longer treat \s and \S specially. */
5080    
5081              case ESC_s:              case ESC_s:
# Line 4677  for (;; ptr++) Line 5121  for (;; ptr++)
5121                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
5122                *class_uchardata++ = ptype;                *class_uchardata++ = ptype;
5123                *class_uchardata++ = pdata;                *class_uchardata++ = pdata;
5124                  xclass_has_prop = TRUE;
5125                class_has_8bitchar--;                /* Undo! */                class_has_8bitchar--;                /* Undo! */
5126                continue;                continue;
5127                }                }
# Line 4762  for (;; ptr++) Line 5207  for (;; ptr++)
5207  #endif  #endif
5208          d = *ptr;  /* Not UTF-8 mode */          d = *ptr;  /* Not UTF-8 mode */
5209    
5210          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape
5211          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          sequence, but not any of the other escapes. Perl treats a hyphen as a
5212          in such circumstances. */          literal in such circumstances. However, in Perl's warning mode, a
5213            warning is given, so PCRE now faults it as it is almost certainly a
5214            mistake on the user's part. */
5215    
5216          if (!inescq && d == CHAR_BACKSLASH)          if (!inescq)
5217            {            {
5218            int descape;            if (d == CHAR_BACKSLASH)
5219            descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);              {
5220            if (*errorcodeptr != 0) goto FAILED;              int descape;
5221                descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5222                if (*errorcodeptr != 0) goto FAILED;
5223    
5224            /* \b is backspace; any other special means the '-' was literal. */              /* 0 means a character was put into d; \b is backspace; any other
5225                special causes an error. */
5226    
5227            if (descape != 0)              if (descape != 0)
             {  
             if (descape == ESC_b) d = CHAR_BS; else  
5228                {                {
5229                ptr = oldptr;                if (descape == ESC_b) d = CHAR_BS; else
5230                goto CLASS_SINGLE_CHARACTER;  /* A few lines below */                  {
5231                    *errorcodeptr = ERR83;
5232                    goto FAILED;
5233                    }
5234                }                }
5235              }              }
5236    
5237              /* A hyphen followed by a POSIX class is treated in the same way. */
5238    
5239              else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5240                       (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5241                        ptr[1] == CHAR_EQUALS_SIGN) &&
5242                       check_posix_syntax(ptr, &tempptr))
5243                {
5244                *errorcodeptr = ERR83;
5245                goto FAILED;
5246                }
5247            }            }
5248    
5249          /* Check that the two values are in the correct order. Optimize          /* Check that the two values are in the correct order. Optimize
# Line 4954  for (;; ptr++) Line 5416  for (;; ptr++)
5416        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
5417        code += LINK_SIZE;        code += LINK_SIZE;
5418        *code = negate_class? XCL_NOT:0;        *code = negate_class? XCL_NOT:0;
5419          if (xclass_has_prop) *code |= XCL_HASPROP;
5420    
5421        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
5422        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
# Line 4963  for (;; ptr++) Line 5426  for (;; ptr++)
5426          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
5427          memmove(code + (32 / sizeof(pcre_uchar)), code,          memmove(code + (32 / sizeof(pcre_uchar)), code,
5428            IN_UCHARS(class_uchardata - code));            IN_UCHARS(class_uchardata - code));
5429            if (negate_class && !xclass_has_prop)
5430              for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5431          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
5432          code = class_uchardata + (32 / sizeof(pcre_uchar));          code = class_uchardata + (32 / sizeof(pcre_uchar));
5433          }          }
# Line 5045  for (;; ptr++) Line 5510  for (;; ptr++)
5510    
5511      tempcode = previous;      tempcode = previous;
5512    
5513        /* Before checking for a possessive quantifier, we must skip over
5514        whitespace and comments in extended mode because Perl allows white space at
5515        this point. */
5516    
5517        if ((options & PCRE_EXTENDED) != 0)
5518          {
5519          const pcre_uchar *p = ptr + 1;
5520          for (;;)
5521            {
5522            while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5523            if (*p != CHAR_NUMBER_SIGN) break;
5524            p++;
5525            while (*p != CHAR_NULL)
5526              {
5527              if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5528                {                        /* IS_NEWLINE sets cd->nllen. */
5529                p += cd->nllen;
5530                break;
5531                }
5532              p++;
5533    #ifdef SUPPORT_UTF
5534              if (utf) FORWARDCHAR(p);
5535    #endif
5536              }           /* Loop for comment characters */
5537            }             /* Loop for multiple comments */
5538          ptr = p - 1;    /* Character before the next significant one. */
5539          }
5540    
5541      /* If the next character is '+', we have a possessive quantifier. This      /* If the next character is '+', we have a possessive quantifier. This
5542      implies greediness, whatever the setting of the PCRE_UNGREEDY option.      implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5543      If the next character is '?' this is a minimizing repeat, by default,      If the next character is '?' this is a minimizing repeat, by default,
# Line 5337  for (;; ptr++) Line 5830  for (;; ptr++)
5830      opcodes such as BRA and CBRA, as this is the place where they get converted      opcodes such as BRA and CBRA, as this is the place where they get converted
5831      into the more special varieties such as BRAPOS and SBRA. A test for >=      into the more special varieties such as BRAPOS and SBRA. A test for >=
5832      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5833      ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow      ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5834      repetition of assertions, but now it does, for Perl compatibility. */      Originally, PCRE did not allow repetition of assertions, but now it does,
5835        for Perl compatibility. */
5836    
5837      else if (*previous >= OP_ASSERT && *previous <= OP_COND)      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5838        {        {
# Line 5356  for (;; ptr++) Line 5850  for (;; ptr++)
5850        /* There is no sense in actually repeating assertions. The only potential        /* There is no sense in actually repeating assertions. The only potential
5851        use of repetition is in cases when the assertion is optional. Therefore,        use of repetition is in cases when the assertion is optional. Therefore,
5852        if the minimum is greater than zero, just ignore the repeat. If the        if the minimum is greater than zero, just ignore the repeat. If the
5853        maximum is not not zero or one, set it to 1. */        maximum is not zero or one, set it to 1. */
5854    
5855        if (*previous < OP_ONCE)    /* Assertion */        if (*previous < OP_ONCE)    /* Assertion */
5856          {          {
# Line 5729  for (;; ptr++) Line 6223  for (;; ptr++)
6223        goto FAILED;        goto FAILED;
6224        }        }
6225    
6226      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', possessive_quantifier is
6227      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,      TRUE. For some opcodes, there are special alternative opcodes for this
6228      there are special alternative opcodes for this case. For anything else, we      case. For anything else, we wrap the entire repeated item inside OP_ONCE
6229      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'      brackets. Logically, the '+' notation is just syntactic sugar, taken from
6230      notation is just syntactic sugar, taken from Sun's Java package, but the      Sun's Java package, but the special opcodes can optimize it.
     special opcodes can optimize it.  
6231    
6232      Some (but not all) possessively repeated subpatterns have already been      Some (but not all) possessively repeated subpatterns have already been
6233      completely handled in the code just above. For them, possessive_quantifier      completely handled in the code just above. For them, possessive_quantifier
6234      is always FALSE at this stage.      is always FALSE at this stage. Note that the repeated item starts at
6235        tempcode, not at previous, which might be the first part of a string whose
6236      Note that the repeated item starts at tempcode, not at previous, which      (former) last char we repeated. */
     might be the first part of a string whose (former) last char we repeated.  
   
     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But  
     an 'upto' may follow. We skip over an 'exact' item, and then test the  
     length of what remains before proceeding. */  
6237    
6238      if (possessive_quantifier)      if (possessive_quantifier)
6239        {        {
6240        int len;        int len;
6241    
6242        if (*tempcode == OP_TYPEEXACT)        /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6243          However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6244          {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6245          remains is greater than zero, there's a further opcode that can be
6246          handled. If not, do nothing, leaving the EXACT alone. */
6247    
6248          switch(*tempcode)
6249            {
6250            case OP_TYPEEXACT:
6251          tempcode += PRIV(OP_lengths)[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
6252            ((tempcode[1 + IMM2_SIZE] == OP_PROP            ((tempcode[1 + IMM2_SIZE] == OP_PROP
6253            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6254            break;
6255    
6256        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)          /* CHAR opcodes are used for exacts whose count is 1. */
6257          {  
6258            case OP_CHAR:
6259            case OP_CHARI:
6260            case OP_NOT:
6261            case OP_NOTI:
6262            case OP_EXACT:
6263            case OP_EXACTI:
6264            case OP_NOTEXACT:
6265            case OP_NOTEXACTI:
6266          tempcode += PRIV(OP_lengths)[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
6267  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6268          if (utf && HAS_EXTRALEN(tempcode[-1]))          if (utf && HAS_EXTRALEN(tempcode[-1]))
6269            tempcode += GET_EXTRALEN(tempcode[-1]);            tempcode += GET_EXTRALEN(tempcode[-1]);
6270  #endif  #endif
6271            break;
6272    
6273            /* For the class opcodes, the repeat operator appears at the end;
6274            adjust tempcode to point to it. */
6275    
6276            case OP_CLASS:
6277            case OP_NCLASS:
6278            tempcode += 1 + 32/sizeof(pcre_uchar);
6279            break;
6280    
6281    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6282            case OP_XCLASS:
6283            tempcode += GET(tempcode, 1);
6284            break;
6285    #endif
6286          }          }
6287    
6288          /* If tempcode is equal to code (which points to the end of the repeated
6289          item), it means we have skipped an EXACT item but there is no following
6290          QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6291          all other cases, tempcode will be pointing to the repeat opcode, and will
6292          be less than code, so the value of len will be greater than 0. */
6293    
6294        len = (int)(code - tempcode);        len = (int)(code - tempcode);
6295          if (len > 0)
6296            {
6297            unsigned int repcode = *tempcode;
6298    
6299            /* There is a table for possessifying opcodes, all of which are less
6300            than OP_CALLOUT. A zero entry means there is no possessified version.
6301            */
6302    
6303            if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6304              *tempcode = opcode_possessify[repcode];
6305    
6306            /* For an opcode without a special possessified version, wrap the item in
6307            ONCE brackets. Because we are moving code along, we must ensure that any
6308            pending recursive references are updated. */
6309    
6310            else
6311              {
6312              *code = OP_END;
6313              adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6314              memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6315              code += 1 + LINK_SIZE;
6316              len += 1 + LINK_SIZE;
6317              tempcode[0] = OP_ONCE;
6318              *code++ = OP_KET;
6319              PUTINC(code, 0, len);
6320              PUT(tempcode, 1, len);
6321              }
6322            }
6323    
6324    #ifdef NEVER
6325        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
6326          {          {
6327          case OP_STAR:  *tempcode = OP_POSSTAR; break;          case OP_STAR:  *tempcode = OP_POSSTAR; break;
# Line 5793  for (;; ptr++) Line 6349  for (;; ptr++)
6349          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6350          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6351    
6352            case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6353            case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6354            case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6355            case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6356    
6357          /* Because we are moving code along, we must ensure that any          /* Because we are moving code along, we must ensure that any
6358          pending recursive references are updated. */          pending recursive references are updated. */
6359    
# Line 5808  for (;; ptr++) Line 6369  for (;; ptr++)
6369          PUT(tempcode, 1, len);          PUT(tempcode, 1, len);
6370          break;          break;
6371          }          }
6372    #endif
6373        }        }
6374    
6375      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 5996  for (;; ptr++) Line 6558  for (;; ptr++)
6558          tempptr = ptr;          tempptr = ptr;
6559    
6560          /* A condition can be an assertion, a number (referring to a numbered          /* A condition can be an assertion, a number (referring to a numbered
6561          group), a name (referring to a named group), or 'R', referring to          group's having been set), a name (referring to a named group), or 'R',
6562          recursion. R<digits> and R&name are also permitted for recursion tests.          referring to recursion. R<digits> and R&name are also permitted for
6563            recursion tests.
6564          There are several syntaxes for testing a named group: (?(name)) is used  
6565          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).          There are ways of testing a named group: (?(name)) is used by Python;
6566            Perl 5.10 onwards uses (?(<name>) or (?('name')).
6567          There are two unfortunate ambiguities, caused by history. (a) 'R' can  
6568          be the recursive thing or the name 'R' (and similarly for 'R' followed          There is one unfortunate ambiguity, caused by history. 'R' can be the
6569          by digits), and (b) a number could be a name that consists of digits.          recursive thing or the name 'R' (and similarly for 'R' followed by
6570          In both cases, we look for a name first; if not found, we try the other          digits). We look for a name first; if not found, we try the other case.
         cases.  
6571    
6572          For compatibility with auto-callouts, we allow a callout to be          For compatibility with auto-callouts, we allow a callout to be
6573          specified before a condition that is an assertion. First, check for the          specified before a condition that is an assertion. First, check for the
# Line 6039  for (;; ptr++) Line 6600  for (;; ptr++)
6600    
6601          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
6602    
6603          if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)          ptr++;
6604            if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6605            {            {
6606            terminator = -1;            terminator = -1;
6607            ptr += 2;            ptr += 2;
# Line 6048  for (;; ptr++) Line 6610  for (;; ptr++)
6610    
6611          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6612          syntax (?(<name>) or (?('name'), and also allow for the original PCRE          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6613          syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may          syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
         consist entirely of digits, there is scope for ambiguity. */  
6614    
6615          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (*ptr == CHAR_LESS_THAN_SIGN)
6616            {            {
6617            terminator = CHAR_GREATER_THAN_SIGN;            terminator = CHAR_GREATER_THAN_SIGN;
6618            ptr++;            ptr++;
6619            }            }
6620          else if (ptr[1] == CHAR_APOSTROPHE)          else if (*ptr == CHAR_APOSTROPHE)
6621            {            {
6622            terminator = CHAR_APOSTROPHE;            terminator = CHAR_APOSTROPHE;
6623            ptr++;            ptr++;
# Line 6064  for (;; ptr++) Line 6625  for (;; ptr++)
6625          else          else
6626            {            {
6627            terminator = CHAR_NULL;            terminator = CHAR_NULL;
6628            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6629                else if (IS_DIGIT(*ptr)) refsign = 0;
6630            }            }
   
         /* When a name is one of a number of duplicates, a different opcode is  
         used and it needs more memory. Unfortunately we cannot tell whether a  
         name is a duplicate in the first pass, so we have to allow for more  
         memory except when we know it is a relative numerical reference. */  
   
         if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;  
6631    
6632          /* We now expect to read a name (possibly all digits); any thing else          /* Handle a number */
         is an error. In the case of all digits, also get it as a number. */  
6633    
6634          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (refsign >= 0)
6635            {            {
6636            ptr += 1;  /* To get the right offset */            recno = 0;
6637            *errorcodeptr = ERR28;            while (IS_DIGIT(*ptr))
6638            goto FAILED;              {
6639                recno = recno * 10 + (int)(*ptr - CHAR_0);
6640                ptr++;
6641                }
6642            }            }
6643    
6644          recno = 0;          /* Otherwise we expect to read a name; anything else is an error. When
6645          name = ++ptr;          a name is one of a number of duplicates, a different opcode is used and
6646          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)          it needs more memory. Unfortunately we cannot tell whether a name is a
6647            duplicate in the first pass, so we have to allow for more memory. */
6648    
6649            else
6650            {            {
6651            if (recno >= 0)            if (IS_DIGIT(*ptr))
6652              recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;              {
6653            ptr++;              *errorcodeptr = ERR84;
6654                goto FAILED;
6655                }
6656              if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6657                {
6658                *errorcodeptr = ERR28;   /* Assertion expected */
6659                goto FAILED;
6660                }
6661              name = ptr++;
6662              while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6663                {
6664                ptr++;
6665                }
6666              namelen = (int)(ptr - name);
6667              if (lengthptr != NULL) *lengthptr += IMM2_SIZE;
6668            }            }
         namelen = (int)(ptr - name);  
6669    
6670          /* Check the terminator */          /* Check the terminator */
6671    
6672          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6673              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6674            {            {
6675            ptr--;      /* Error offset */            ptr--;                  /* Error offset */
6676            *errorcodeptr = ERR26;            *errorcodeptr = ERR26;  /* Malformed number or name */
6677            goto FAILED;            goto FAILED;
6678            }            }
6679    
# Line 6109  for (;; ptr++) Line 6682  for (;; ptr++)
6682          if (lengthptr != NULL) break;          if (lengthptr != NULL) break;
6683    
6684          /* In the real compile we do the work of looking for the actual          /* In the real compile we do the work of looking for the actual
6685          reference. If the string started with "+" or "-" we require the rest to          reference. If refsign is not negative, it means we have a number in
6686          be digits, in which case recno will be set. */          recno. */
6687    
6688          if (refsign > 0)          if (refsign >= 0)
6689            {            {
6690            if (recno <= 0)            if (recno <= 0)
6691              {              {
6692              *errorcodeptr = ERR58;              *errorcodeptr = ERR35;
6693              goto FAILED;              goto FAILED;
6694              }              }
6695            recno = (refsign == CHAR_MINUS)?            if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6696              cd->bracount - recno + 1 : recno +cd->bracount;              cd->bracount - recno + 1 : recno + cd->bracount;
6697            if (recno <= 0 || recno > cd->final_bracount)            if (recno <= 0 || recno > cd->final_bracount)
6698              {              {
6699              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 6130  for (;; ptr++) Line 6703  for (;; ptr++)
6703            break;            break;
6704            }            }
6705    
6706          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise look for the name. */
6707          name. */  
   
6708          slot = cd->name_table;          slot = cd->name_table;
6709          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
6710            {            {
# Line 6140  for (;; ptr++) Line 6712  for (;; ptr++)
6712            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6713            }            }
6714    
6715          /* Found the named subpattern. If the name is duplicated, add one to          /* Found the named subpattern. If the name is duplicated, add one to
6716          the opcode to change CREF/RREF into DNCREF/DNRREF and insert          the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6717          appropriate data values. Otherwise, just insert the unique subpattern          appropriate data values. Otherwise, just insert the unique subpattern
6718          number. */          number. */
6719    
6720          if (i < cd->names_found)          if (i < cd->names_found)
6721            {            {
6722            int offset = i++;            int offset = i++;
6723            int count = 1;            int count = 1;
6724            recno = GET2(slot, 0);   /* Number from first found */            recno = GET2(slot, 0);   /* Number from first found */
6725            for (; i < cd->names_found; i++)            for (; i < cd->names_found; i++)
6726              {              {
6727              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6728              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6729              count++;              count++;
6730              }              }
6731            if (count > 1)            if (count > 1)
6732              {              {
6733              PUT2(code, 2+LINK_SIZE, offset);              PUT2(code, 2+LINK_SIZE, offset);
6734              PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);              PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6735              skipbytes += IMM2_SIZE;              skipbytes += IMM2_SIZE;
6736              code[1+LINK_SIZE]++;              code[1+LINK_SIZE]++;
6737              }              }
6738            else  /* Not a duplicated name */            else  /* Not a duplicated name */
6739              {              {
6740              PUT2(code, 2+LINK_SIZE, recno);              PUT2(code, 2+LINK_SIZE, recno);
6741              }              }
6742            }            }
6743    
6744          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
6745          after the opening parenthesis [e.g. (?(abc)...] and in this case there          after the opening parenthesis [e.g. (?(abc)...] and in this case there
6746          are some further alternatives to try. For the cases where terminator !=          are some further alternatives to try. For the cases where terminator !=
6747          0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have          CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6748          now checked all the possibilities, so give an error. */          we have now checked all the possibilities, so give an error. */
6749    
6750          else if (terminator != CHAR_NULL)          else if (terminator != CHAR_NULL)
6751            {            {
# Line 6210  for (;; ptr++) Line 6782  for (;; ptr++)
6782            skipbytes = 1;            skipbytes = 1;
6783            }            }
6784    
6785          /* Check for the "name" actually being a subpattern number. We are          /* Reference to an unidentified subpattern. */
         in the second pass here, so final_bracount is set. */  
   
         else if (recno > 0 && recno <= cd->final_bracount)  
           {  
           PUT2(code, 2+LINK_SIZE, recno);  
           }  
   
         /* Either an unidentified subpattern, or a reference to (?(0) */  
6786    
6787          else          else
6788            {            {
6789            *errorcodeptr = (recno == 0)? ERR35: ERR15;            *errorcodeptr = ERR15;
6790            goto FAILED;            goto FAILED;
6791            }            }
6792          break;          break;
# Line 6235  for (;; ptr++) Line 6799  for (;; ptr++)
6799          ptr++;          ptr++;
6800          break;          break;
6801    
6802            /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6803            thing to do, but Perl allows all assertions to be quantified, and when
6804            they contain capturing parentheses there may be a potential use for
6805            this feature. Not that that applies to a quantified (?!) but we allow
6806            it for uniformity. */
6807    
6808          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
6809          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6810          ptr++;          ptr++;
6811          if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */          if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6812                 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6813                (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6814            {            {
6815            *code++ = OP_FAIL;            *code++ = OP_FAIL;
6816            previous = NULL;            previous = NULL;
# Line 6335  for (;; ptr++) Line 6906  for (;; ptr++)
6906          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6907            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6908          name = ++ptr;          name = ++ptr;
6909            if (IS_DIGIT(*ptr))
6910              {
6911              *errorcodeptr = ERR84;   /* Group name must start with non-digit */
6912              goto FAILED;
6913              }
6914          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6915          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6916    
# Line 6449  for (;; ptr++) Line 7024  for (;; ptr++)
7024    
7025          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
7026          name = ++ptr;          name = ++ptr;
7027            if (IS_DIGIT(*ptr))
7028              {
7029              *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7030              goto FAILED;
7031              }
7032          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7033          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
7034    
# Line 6841  for (;; ptr++) Line 7421  for (;; ptr++)
7421        skipbytes = IMM2_SIZE;        skipbytes = IMM2_SIZE;
7422        }        }
7423    
7424      /* Process nested bracketed regex. Assertions used not to be repeatable,      /* Process nested bracketed regex. First check for parentheses nested too
7425      but this was changed for Perl compatibility, so all kinds can now be      deeply. */
7426      repeated. We copy code into a non-register variable (tempcode) in order to  
7427      be able to pass its address because some compilers complain otherwise. */      if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7428          {
7429          *errorcodeptr = ERR82;
7430          goto FAILED;
7431          }
7432    
7433        /* Assertions used not to be repeatable, but this was changed for Perl
7434        compatibility, so all kinds can now be repeated. We copy code into a
7435        non-register variable (tempcode) in order to be able to pass its address
7436        because some compilers complain otherwise. */
7437    
7438      previous = code;                      /* For handling repetition */      previous = code;                      /* For handling repetition */
7439      *code = bravalue;      *code = bravalue;
# Line 6875  for (;; ptr++) Line 7464  for (;; ptr++)
7464           ))           ))
7465        goto FAILED;        goto FAILED;
7466    
7467        cd->parens_depth -= 1;
7468    
7469      /* If this was an atomic group and there are no capturing groups within it,      /* If this was an atomic group and there are no capturing groups within it,
7470      generate OP_ONCE_NC instead of OP_ONCE. */      generate OP_ONCE_NC instead of OP_ONCE. */
7471    
# Line 7089  for (;; ptr++) Line 7680  for (;; ptr++)
7680        if (escape == ESC_g)        if (escape == ESC_g)
7681          {          {
7682          const pcre_uchar *p;          const pcre_uchar *p;
7683            pcre_uint32 cf;
7684    
7685          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
7686          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7687            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7688    
7689          /* These two statements stop the compiler for warning about possibly          /* These two statements stop the compiler for warning about possibly
7690          unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In          unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7691          fact, because we actually check for a number below, the paths that          fact, because we do the check for a number below, the paths that
7692          would actually be in error are never taken. */          would actually be in error are never taken. */
7693    
7694          skipbytes = 0;          skipbytes = 0;
7695          reset_bracount = FALSE;          reset_bracount = FALSE;
7696    
7697          /* Test for a name */          /* If it's not a signed or unsigned number, treat it as a name. */
7698    
7699          if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)          cf = ptr[1];
7700            if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7701            {            {
           BOOL is_a_number = TRUE;  
           for (p = ptr + 1; *p != CHAR_NULL && *p != (pcre_uchar)terminator; p++)  
             {  
             if (!MAX_255(*p)) { is_a_number = FALSE; break; }  
             if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;  
             if ((cd->ctypes[*p] & ctype_word) == 0) break;  
             }  
           if (*p != (pcre_uchar)terminator)  
             {  
             *errorcodeptr = ERR57;  
             break;  
             }  
           if (is_a_number)  
             {  
             ptr++;  
             goto HANDLE_NUMERICAL_RECURSION;  
             }  
7702            is_recurse = TRUE;            is_recurse = TRUE;
7703            goto NAMED_REF_OR_RECURSE;            goto NAMED_REF_OR_RECURSE;
7704            }            }
7705    
7706          /* Test a signed number in angle brackets or quotes. */          /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7707            or a digit. */
7708    
7709          p = ptr + 2;          p = ptr + 2;
7710          while (IS_DIGIT(*p)) p++;          while (IS_DIGIT(*p)) p++;
# Line 7266  for (;; ptr++) Line 7844  for (;; ptr++)
7844    
7845      /* ===================================================================*/      /* ===================================================================*/
7846      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
7847      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in a UTF mode, it may be a
7848      multi-byte literal character. */      multi-unit literal character. */
7849    
7850      default:      default:
7851      NORMAL_CHAR:      NORMAL_CHAR:
# Line 8255  PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. * Line 8833  PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. *
8833      { skipatstart += 6; options |= PCRE_UTF8; continue; }      { skipatstart += 6; options |= PCRE_UTF8; continue; }
8834    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
8835      { skipatstart += 6; options |= PCRE_UCP; continue; }      { skipatstart += 6; options |= PCRE_UCP; continue; }
8836      else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
8837        { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
8838    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
8839      { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }      { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
8840    
# Line 8449  cd->named_group_list_size = NAMED_GROUP_ Line 9029  cd->named_group_list_size = NAMED_GROUP_
9029  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
9030  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9031  cd->req_varyopt = 0;  cd->req_varyopt = 0;
9032    cd->parens_depth = 0;
9033  cd->assert_depth = 0;  cd->assert_depth = 0;
9034  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
9035  cd->external_options = options;  cd->external_options = options;
# Line 8463  outside can help speed up starting point Line 9044  outside can help speed up starting point
9044  ptr += skipatstart;  ptr += skipatstart;
9045  code = cworkspace;  code = cworkspace;
9046  *code = OP_BRA;  *code = OP_BRA;
9047    
9048  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9049    FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,    FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9050    cd, &length);    cd, &length);
# Line 8534  field; this time it's used for rememberi Line 9116  field; this time it's used for rememberi
9116  */  */
9117    
9118  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9119    cd->parens_depth = 0;
9120  cd->assert_depth = 0;  cd->assert_depth = 0;
9121  cd->bracount = 0;  cd->bracount = 0;
9122  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
# Line 8639  if (errorcode == 0 && re->top_backref > Line 9222  if (errorcode == 0 && re->top_backref >
9222  /* Unless disabled, check whether single character iterators can be  /* Unless disabled, check whether single character iterators can be
9223  auto-possessified. The function overwrites the appropriate opcode values. */  auto-possessified. The function overwrites the appropriate opcode values. */
9224    
9225  if ((options & PCRE_NO_AUTO_POSSESSIFY) == 0)  if ((options & PCRE_NO_AUTO_POSSESS) == 0)
9226    auto_possessify((pcre_uchar *)codestart, utf, cd);    auto_possessify((pcre_uchar *)codestart, utf, cd);
9227    
9228  /* If there were any lookbehind assertions that contained OP_RECURSE  /* If there were any lookbehind assertions that contained OP_RECURSE
# Line 8863  return (pcre32 *)re; Line 9446  return (pcre32 *)re;
9446  }  }
9447    
9448  /* End of pcre_compile.c */  /* End of pcre_compile.c */
9449    

Legend:
Removed from v.1365  
changed lines
  Added in v.1414

  ViewVC Help
Powered by ViewVC 1.1.5