/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1363 by ph10, Tue Oct 1 16:54:40 2013 UTC revision 1393 by ph10, Fri Nov 8 16:37:21 2013 UTC
# Line 264  static const int verbcount = sizeof(verb Line 264  static const int verbcount = sizeof(verb
264  now all in a single string, to reduce the number of relocations when a shared  now all in a single string, to reduce the number of relocations when a shared
265  library is dynamically loaded. The list of lengths is terminated by a zero  library is dynamically loaded. The list of lengths is terminated by a zero
266  length entry. The first three must be alpha, lower, upper, as this is assumed  length entry. The first three must be alpha, lower, upper, as this is assumed
267  for handling case independence. */  for handling case independence. The indices for graph, print, and punct are
268    needed, so identify them. */
269    
270  static const char posix_names[] =  static const char posix_names[] =
271    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
# Line 275  static const char posix_names[] = Line 276  static const char posix_names[] =
276  static const pcre_uint8 posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
277    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
278    
279    #define PC_GRAPH  8
280    #define PC_PRINT  9
281    #define PC_PUNCT 10
282    
283    
284  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
285  base map, with an optional addition or removal of another map. Then, for some  base map, with an optional addition or removal of another map. Then, for some
286  classes, there is some additional tweaking: for [:blank:] the vertical space  classes, there is some additional tweaking: for [:blank:] the vertical space
# Line 302  static const int posix_class_maps[] = { Line 308  static const int posix_class_maps[] = {
308    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
309  };  };
310    
311  /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class  /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
312  substitutes must be in the order of the names, defined above, and there are  Unicode property escapes. */
 both positive and negative cases. NULL means no substitute. */  
313    
314  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
315  static const pcre_uchar string_PNd[]  = {  static const pcre_uchar string_PNd[]  = {
# Line 329  static const pcre_uchar string_pXwd[] = Line 334  static const pcre_uchar string_pXwd[] =
334  static const pcre_uchar *substitutes[] = {  static const pcre_uchar *substitutes[] = {
335    string_PNd,           /* \D */    string_PNd,           /* \D */
336    string_pNd,           /* \d */    string_pNd,           /* \d */
337    string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */    string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
338    string_pXsp,          /* \s */    string_pXsp,          /* \s */   /* space and POSIX space are the same. */
339    string_PXwd,          /* \W */    string_PXwd,          /* \W */
340    string_pXwd           /* \w */    string_pXwd           /* \w */
341  };  };
342    
343    /* The POSIX class substitutes must be in the order of the POSIX class names,
344    defined above, and there are both positive and negative cases. NULL means no
345    general substitute of a Unicode property escape (\p or \P). However, for some
346    POSIX classes (e.g. graph, print, punct) a special property code is compiled
347    directly. */
348    
349  static const pcre_uchar string_pL[] =   {  static const pcre_uchar string_pL[] =   {
350    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
351    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
# Line 382  static const pcre_uchar *posix_substitut Line 393  static const pcre_uchar *posix_substitut
393    NULL,                 /* graph */    NULL,                 /* graph */
394    NULL,                 /* print */    NULL,                 /* print */
395    NULL,                 /* punct */    NULL,                 /* punct */
396    string_pXps,          /* space */    /* NOTE: Xps is POSIX space */    string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
397    string_pXwd,          /* word */    string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
398    NULL,                 /* xdigit */    NULL,                 /* xdigit */
399    /* Negated cases */    /* Negated cases */
400    string_PL,            /* ^alpha */    string_PL,            /* ^alpha */
# Line 397  static const pcre_uchar *posix_substitut Line 408  static const pcre_uchar *posix_substitut
408    NULL,                 /* ^graph */    NULL,                 /* ^graph */
409    NULL,                 /* ^print */    NULL,                 /* ^print */
410    NULL,                 /* ^punct */    NULL,                 /* ^punct */
411    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
412    string_PXwd,          /* ^word */    string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
413    NULL                  /* ^xdigit */    NULL                  /* ^xdigit */
414  };  };
415  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
# Line 462  static const char error_texts[] = Line 473  static const char error_texts[] =
473    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
474    "this version of PCRE is compiled without UTF support\0"    "this version of PCRE is compiled without UTF support\0"
475    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
476    "character value in \\x{...} sequence is too large\0"    "character value in \\x{} or \\o{} is too large\0"
477    /* 35 */    /* 35 */
478    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
479    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
# Line 516  static const char error_texts[] = Line 527  static const char error_texts[] =
527    "character value in \\u.... sequence is too large\0"    "character value in \\u.... sequence is too large\0"
528    "invalid UTF-32 string\0"    "invalid UTF-32 string\0"
529    "setting UTF is disabled by the application\0"    "setting UTF is disabled by the application\0"
530      "non-hex character in \\x{} (closing brace missing?)\0"
531      /* 80 */
532      "non-octal character in \\o{} (closing brace missing?)\0"
533      "missing opening brace after \\o\0"
534      "parentheses are too deeply nested\0"
535      "invalid range in character class\0"
536    ;    ;
537    
538  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 773  static const pcre_uint8 posspropstab[3][ Line 790  static const pcre_uint8 posspropstab[3][
790    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
791  };  };
792    
793    /* This table is used when converting repeating opcodes into possessified
794    versions as a result of an explicit possessive quantifier such as ++. A zero
795    value means there is no possessified version - in those cases the item in
796    question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
797    because all relevant opcodes are less than that. */
798    
799    static const pcre_uint8 opcode_possessify[] = {
800      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
801      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
802    
803      0,                       /* NOTI */
804      OP_POSSTAR, 0,           /* STAR, MINSTAR */
805      OP_POSPLUS, 0,           /* PLUS, MINPLUS */
806      OP_POSQUERY, 0,          /* QUERY, MINQUERY */
807      OP_POSUPTO, 0,           /* UPTO, MINUPTO */
808      0,                       /* EXACT */
809      0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
810    
811      OP_POSSTARI, 0,          /* STARI, MINSTARI */
812      OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
813      OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
814      OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
815      0,                       /* EXACTI */
816      0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
817    
818      OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
819      OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
820      OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
821      OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
822      0,                       /* NOTEXACT */
823      0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
824    
825      OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
826      OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
827      OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
828      OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
829      0,                       /* NOTEXACTI */
830      0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
831    
832      OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
833      OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
834      OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
835      OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
836      0,                       /* TYPEEXACT */
837      0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
838    
839      OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
840      OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
841      OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
842      OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
843      0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
844    
845      0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
846      0, 0,                    /* REF, REFI */
847      0, 0,                    /* DNREF, DNREFI */
848      0, 0                     /* RECURSE, CALLOUT */
849    };
850    
851    
852    
853  /*************************************************  /*************************************************
# Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 954  return (*p == CHAR_RIGHT_CURLY_BRACKET);
954  *************************************************/  *************************************************/
955    
956  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
957  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
958  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
959  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
960  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
961  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
962    
963  Arguments:  Arguments:
964    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
965    chptr          points to the data character    chptr          points to a returned data character
966    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
967    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
968    options        the options bits    options        the options bits
# Line 1092  else Line 1166  else
1166      break;      break;
1167    
1168      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1169      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1170      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1171        recommended to avoid the ambiguities in the old syntax.
1172    
1173      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1174      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1175      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1176      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1177      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1178      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1179      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1180    
1181        Inside a character class, \ followed by a digit is always either a literal
1182        8 or 9 or an octal number. */
1183    
1184      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1185      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1128  else Line 1206  else
1206          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1207          break;          break;
1208          }          }
1209        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1210          {          {
1211          escape = -s;          escape = -s;
1212          break;          break;
# Line 1136  else Line 1214  else
1214        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1215        }        }
1216    
1217      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1218      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1219      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1220        changed so as not to insert the binary zero. */
1221    
1222      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1223        {  
1224        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1225    
1226      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1227      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1162  else Line 1238  else
1238  #endif  #endif
1239      break;      break;
1240    
1241      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1242      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.      specifying character codes in octal. The only supported form is \o{ddd}. */
1243      If not, { is treated as a data character. */  
1244        case CHAR_o:
1245        if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1246          {
1247          ptr += 2;
1248          c = 0;
1249          overflow = FALSE;
1250          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1251            {
1252            register pcre_uint32 cc = *ptr++;
1253            if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1254    #ifdef COMPILE_PCRE32
1255            if (c >= 0x20000000l) { overflow = TRUE; break; }
1256    #endif
1257            c = (c << 3) + cc - CHAR_0 ;
1258    #if defined COMPILE_PCRE8
1259            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1260    #elif defined COMPILE_PCRE16
1261            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1262    #elif defined COMPILE_PCRE32
1263            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1264    #endif
1265            }
1266          if (overflow)
1267            {
1268            while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1269            *errorcodeptr = ERR34;
1270            }
1271          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1272            {
1273            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1274            }
1275          else *errorcodeptr = ERR80;
1276          }
1277        break;
1278    
1279        /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1280        digits. Otherwise it is a literal lowercase letter x. */
1281    
1282      case CHAR_x:      case CHAR_x:
1283      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1284        {        {
       /* In JavaScript, \x must be followed by two hexadecimal numbers.  
       Otherwise it is a lowercase x letter. */  
1285        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1286          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1287          {          {
# Line 1187  else Line 1298  else
1298  #endif  #endif
1299            }            }
1300          }          }
1301        break;        }    /* End JavaScript handling */
       }  
1302    
1303      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1304        {      greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1305        const pcre_uchar *pt = ptr + 2;      digits. If not, { used to be treated as a data character. However, Perl
1306        seems to read hex digits up to the first non-such, and ignore the rest, so
1307        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1308        now gives an error. */
1309    
1310        c = 0;      else
1311        overflow = FALSE;        {
1312        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1313          {          {
1314          register pcre_uint32 cc = *pt++;          ptr += 2;
1315          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          c = 0;
1316            overflow = FALSE;
1317            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1318              {
1319              register pcre_uint32 cc = *ptr++;
1320              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1321    
1322  #ifdef COMPILE_PCRE32  #ifdef COMPILE_PCRE32
1323          if (c >= 0x10000000l) { overflow = TRUE; break; }            if (c >= 0x10000000l) { overflow = TRUE; break; }
1324  #endif  #endif
1325    
1326  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1327          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1328          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1329  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1330          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1331          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1332  #endif  #endif
1333    
1334  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
1335          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1336  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
1337          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1338  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
1339          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1340  #endif  #endif
1341          }            }
1342    
1343        if (overflow)          if (overflow)
1344          {            {
1345          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1346          *errorcodeptr = ERR34;            *errorcodeptr = ERR34;
1347          }            }
1348    
1349        if (*pt == CHAR_RIGHT_CURLY_BRACKET)          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1350          {            {
1351          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1352          ptr = pt;            }
         break;  
         }  
1353    
1354        /* If the sequence of hex digits does not end with '}', then we don't          /* If the sequence of hex digits does not end with '}', give an error.
1355        recognize this construct; fall through to the normal \x handling. */          We used just to recognize this construct and fall through to the normal
1356        }          \x handling, but nowadays Perl gives an error, which seems much more
1357            sensible, so we do too. */
1358    
1359      /* Read just a single-byte hex-defined char */          else *errorcodeptr = ERR79;
1360            }   /* End of \x{} processing */
1361    
1362      c = 0;        /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1363      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)  
1364        {        else
1365        pcre_uint32 cc;                          /* Some compilers don't like */          {
1366        cc = *(++ptr);                           /* ++ in initializers */          c = 0;
1367            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1368              {
1369              pcre_uint32 cc;                          /* Some compilers don't like */
1370              cc = *(++ptr);                           /* ++ in initializers */
1371  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1372        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1373        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1374  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1375        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */            if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1376        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1377  #endif  #endif
1378        }            }
1379            }     /* End of \xdd handling */
1380          }       /* End of Perl-style \x handling */
1381      break;      break;
1382    
1383      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
# Line 1524  for (;;) Line 1648  for (;;)
1648    
1649      case OP_CALLOUT:      case OP_CALLOUT:
1650      case OP_CREF:      case OP_CREF:
1651      case OP_NCREF:      case OP_DNCREF:
1652      case OP_RREF:      case OP_RREF:
1653      case OP_NRREF:      case OP_DNRREF:
1654      case OP_DEF:      case OP_DEF:
1655      code += PRIV(OP_lengths)[*code];      code += PRIV(OP_lengths)[*code];
1656      break;      break;
# Line 1663  for (;;) Line 1787  for (;;)
1787      case OP_COMMIT:      case OP_COMMIT:
1788      case OP_CREF:      case OP_CREF:
1789      case OP_DEF:      case OP_DEF:
1790        case OP_DNCREF:
1791        case OP_DNRREF:
1792      case OP_DOLL:      case OP_DOLL:
1793      case OP_DOLLM:      case OP_DOLLM:
1794      case OP_EOD:      case OP_EOD:
1795      case OP_EODN:      case OP_EODN:
1796      case OP_FAIL:      case OP_FAIL:
     case OP_NCREF:  
     case OP_NRREF:  
1797      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1798      case OP_PRUNE:      case OP_PRUNE:
1799      case OP_REVERSE:      case OP_REVERSE:
# Line 1764  for (;;) Line 1888  for (;;)
1888    
1889      switch (*cc)      switch (*cc)
1890        {        {
       case OP_CRPLUS:  
       case OP_CRMINPLUS:  
1891        case OP_CRSTAR:        case OP_CRSTAR:
1892        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1893          case OP_CRPLUS:
1894          case OP_CRMINPLUS:
1895        case OP_CRQUERY:        case OP_CRQUERY:
1896        case OP_CRMINQUERY:        case OP_CRMINQUERY:
1897          case OP_CRPOSSTAR:
1898          case OP_CRPOSPLUS:
1899          case OP_CRPOSQUERY:
1900        return -1;        return -1;
1901    
1902        case OP_CRRANGE:        case OP_CRRANGE:
1903        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1904          case OP_CRPOSRANGE:
1905        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1906        branchlength += (int)GET2(cc,1);        branchlength += (int)GET2(cc,1);
1907        cc += 1 + 2 * IMM2_SIZE;        cc += 1 + 2 * IMM2_SIZE;
# Line 2366  for (code = first_significant_code(code Line 2494  for (code = first_significant_code(code
2494        case OP_CRMINSTAR:        case OP_CRMINSTAR:
2495        case OP_CRQUERY:        case OP_CRQUERY:
2496        case OP_CRMINQUERY:        case OP_CRMINQUERY:
2497          case OP_CRPOSSTAR:
2498          case OP_CRPOSQUERY:
2499        break;        break;
2500    
2501        default:                   /* Non-repeat => class must match */        default:                   /* Non-repeat => class must match */
2502        case OP_CRPLUS:            /* These repeats aren't empty */        case OP_CRPLUS:            /* These repeats aren't empty */
2503        case OP_CRMINPLUS:        case OP_CRMINPLUS:
2504          case OP_CRPOSPLUS:
2505        return FALSE;        return FALSE;
2506    
2507        case OP_CRRANGE:        case OP_CRRANGE:
2508        case OP_CRMINRANGE:        case OP_CRMINRANGE:
2509          case OP_CRPOSRANGE:
2510        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2511        break;        break;
2512        }        }
# Line 2650  switch(ptype) Line 2782  switch(ptype)
2782    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2783            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2784    
2785    case PT_SPACE:    /* Perl space */    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2786    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    means that Perl space and POSIX space are now identical. PCRE was changed
2787            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)    at release 8.34. */
           == negated;  
2788    
2789      case PT_SPACE:    /* Perl space */
2790    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2791    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    switch(c)
2792            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||      {
2793            c == CHAR_FF || c == CHAR_CR)      HSPACE_CASES:
2794            == negated;      VSPACE_CASES:
2795        return negated;
2796    
2797        default:
2798        return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2799        }
2800      break;  /* Control never reaches here */
2801    
2802    case PT_WORD:    case PT_WORD:
2803    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
# Line 2818  switch(c) Line 2956  switch(c)
2956      return code + 2;      return code + 2;
2957      }      }
2958    
2959    /* Convert only if we have anough space. */    /* Convert only if we have enough space. */
2960    
2961    clist_src = PRIV(ucd_caseless_sets) + code[1];    clist_src = PRIV(ucd_caseless_sets) + code[1];
2962    clist_dest = list + 2;    clist_dest = list + 2;
2963    code += 2;    code += 2;
2964    
2965    do {    do {
      /* Early return if there is not enough space. */  
2966       if (clist_dest >= list + 8)       if (clist_dest >= list + 8)
2967         {         {
2968           /* Early return if there is not enough space. This should never
2969           happen, since all clists are shorter than 5 characters now. */
2970         list[2] = code[0];         list[2] = code[0];
2971         list[3] = code[1];         list[3] = code[1];
2972         return code;         return code;
2973         }         }
2974       *clist_dest++ = *clist_src;       *clist_dest++ = *clist_src;
2975       }       }
2976     while(*clist_src++ != NOTACHAR);    while(*clist_src++ != NOTACHAR);
2977    
2978    /* Enough space to store all characters. */    /* All characters are stored. The terminating NOTACHAR
2979      is copied from the clist itself. */
2980    
2981    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2982    return code;    return code;
# Line 2846  switch(c) Line 2986  switch(c)
2986    case OP_CLASS:    case OP_CLASS:
2987  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2988    case OP_XCLASS:    case OP_XCLASS:
   
2989    if (c == OP_XCLASS)    if (c == OP_XCLASS)
2990      end = code + GET(code, 0);      end = code + GET(code, 0) - 1;
2991    else    else
2992  #endif  #endif
2993      end = code + 32 / sizeof(pcre_uchar);      end = code + 32 / sizeof(pcre_uchar);
# Line 2859  switch(c) Line 2998  switch(c)
2998      case OP_CRMINSTAR:      case OP_CRMINSTAR:
2999      case OP_CRQUERY:      case OP_CRQUERY:
3000      case OP_CRMINQUERY:      case OP_CRMINQUERY:
3001        case OP_CRPOSSTAR:
3002        case OP_CRPOSQUERY:
3003      list[1] = TRUE;      list[1] = TRUE;
3004      end++;      end++;
3005      break;      break;
3006    
3007        case OP_CRPLUS:
3008        case OP_CRMINPLUS:
3009        case OP_CRPOSPLUS:
3010        end++;
3011        break;
3012    
3013      case OP_CRRANGE:      case OP_CRRANGE:
3014      case OP_CRMINRANGE:      case OP_CRMINRANGE:
3015        case OP_CRPOSRANGE:
3016      list[1] = (GET2(end, 1) == 0);      list[1] = (GET2(end, 1) == 0);
3017      end += 1 + 2 * IMM2_SIZE;      end += 1 + 2 * IMM2_SIZE;
3018      break;      break;
# Line 2895  Returns:      TRUE if the auto-possessif Line 3043  Returns:      TRUE if the auto-possessif
3043    
3044  static BOOL  static BOOL
3045  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3046    const pcre_uint32* base_list)    const pcre_uint32 *base_list, const pcre_uchar *base_end)
3047  {  {
3048  pcre_uchar c;  pcre_uchar c;
3049  pcre_uint32 list[8];  pcre_uint32 list[8];
3050  const pcre_uint32* chr_ptr;  const pcre_uint32 *chr_ptr;
3051  const pcre_uint32* ochr_ptr;  const pcre_uint32 *ochr_ptr;
3052  const pcre_uint32* list_ptr;  const pcre_uint32 *list_ptr;
3053    const pcre_uchar *next_code;
3054    const pcre_uint8 *class_bitset;
3055    const pcre_uint32 *set1, *set2, *set_end;
3056  pcre_uint32 chr;  pcre_uint32 chr;
3057    BOOL accepted, invert_bits;
3058    
3059    /* Note: the base_list[1] contains whether the current opcode has greedy
3060    (represented by a non-zero value) quantifier. This is different from
3061    other character type lists, which store here whether the character iterator
3062    matches an empty string (also represented by a non-zero value). */
3063    
3064  for(;;)  for(;;)
3065    {    {
3066      /* All operations move the code pointer forward.
3067      Therefore infinite recursions are not possible. */
3068    
3069    c = *code;    c = *code;
3070    
3071    /* Skip over callouts */    /* Skip over callouts */
# Line 2925  for(;;) Line 3085  for(;;)
3085    switch(c)    switch(c)
3086      {      {
3087      case OP_END:      case OP_END:
3088      /* TRUE only in greedy case. The non-greedy case could be replaced by an      case OP_KETRPOS:
3089      OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses      /* TRUE only in greedy case. The non-greedy case could be replaced by
3090      more memory, which we cannot get at this stage.) */      an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3091        uses more memory, which we cannot get at this stage.) */
3092    
3093      return base_list[1] != 0;      return base_list[1] != 0;
3094    
3095      case OP_KET:      case OP_KET:
3096      /* If the bracket is capturing, and referenced by an OP_RECURSE, the      /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3097      non-greedy case cannot be converted to a possessive form. We do not test      it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3098      the bracket type at the moment, but we might do it in the future to improve      cannot be converted to a possessive form. */
     this condition. (But note that recursive calls are always atomic.) */  
3099    
3100      if (base_list[1] == 0) return FALSE;      if (base_list[1] == 0) return FALSE;
3101    
3102        switch(*(code - GET(code, 1)))
3103          {
3104          case OP_ASSERT:
3105          case OP_ASSERT_NOT:
3106          case OP_ASSERTBACK:
3107          case OP_ASSERTBACK_NOT:
3108          case OP_ONCE:
3109          case OP_ONCE_NC:
3110          /* Atomic sub-patterns and assertions can always auto-possessify their
3111          last iterator. */
3112          return TRUE;
3113          }
3114    
3115        code += PRIV(OP_lengths)[c];
3116        continue;
3117    
3118        case OP_ONCE:
3119        case OP_ONCE_NC:
3120        case OP_BRA:
3121        case OP_CBRA:
3122        next_code = code + GET(code, 1);
3123        code += PRIV(OP_lengths)[c];
3124    
3125        while (*next_code == OP_ALT)
3126          {
3127          if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3128          code = next_code + 1 + LINK_SIZE;
3129          next_code += GET(next_code, 1);
3130          }
3131        continue;
3132    
3133        case OP_BRAZERO:
3134        case OP_BRAMINZERO:
3135    
3136        next_code = code + 1;
3137        if (*next_code != OP_BRA && *next_code != OP_CBRA
3138            && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3139    
3140        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3141    
3142        /* The bracket content will be checked by the
3143        OP_BRA/OP_CBRA case above. */
3144        next_code += 1 + LINK_SIZE;
3145        if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3146          return FALSE;
3147    
3148      code += PRIV(OP_lengths)[c];      code += PRIV(OP_lengths)[c];
3149      continue;      continue;
3150      }      }
# Line 2961  for(;;) Line 3168  for(;;)
3168      list_ptr = base_list;      list_ptr = base_list;
3169      }      }
3170    
3171      /* Character bitsets can also be compared to certain opcodes. */
3172    
3173      else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3174    #ifdef COMPILE_PCRE8
3175          /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3176          || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3177    #endif
3178          )
3179        {
3180    #ifdef COMPILE_PCRE8
3181        if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3182    #else
3183        if (base_list[0] == OP_CLASS)
3184    #endif
3185          {
3186          set1 = (pcre_uint32 *)(base_end - base_list[2]);
3187          list_ptr = list;
3188          }
3189        else
3190          {
3191          set1 = (pcre_uint32 *)(code - list[2]);
3192          list_ptr = base_list;
3193          }
3194    
3195        invert_bits = FALSE;
3196        switch(list_ptr[0])
3197          {
3198          case OP_CLASS:
3199          case OP_NCLASS:
3200          set2 = (pcre_uint32 *)
3201            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3202          break;
3203    
3204          /* OP_XCLASS cannot be supported here, because its bitset
3205          is not necessarily complete. E.g: [a-\0x{200}] is stored
3206          as a character range, and the appropriate bits are not set. */
3207    
3208          case OP_NOT_DIGIT:
3209            invert_bits = TRUE;
3210            /* Fall through */
3211          case OP_DIGIT:
3212            set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
3213            break;
3214    
3215          case OP_NOT_WHITESPACE:
3216            invert_bits = TRUE;
3217            /* Fall through */
3218          case OP_WHITESPACE:
3219            set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
3220            break;
3221    
3222          case OP_NOT_WORDCHAR:
3223            invert_bits = TRUE;
3224            /* Fall through */
3225          case OP_WORDCHAR:
3226            set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
3227            break;
3228    
3229          default:
3230          return FALSE;
3231          }
3232    
3233        /* Compare 4 bytes to improve speed. */
3234        set_end = set1 + (32 / 4);
3235        if (invert_bits)
3236          {
3237          do
3238            {
3239            if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3240            }
3241          while (set1 < set_end);
3242          }
3243        else
3244          {
3245          do
3246            {
3247            if ((*set1++ & *set2++) != 0) return FALSE;
3248            }
3249          while (set1 < set_end);
3250          }
3251    
3252        if (list[1] == 0) return TRUE;
3253        /* Might be an empty repeat. */
3254        continue;
3255        }
3256    
3257    /* Some property combinations also acceptable. Unicode property opcodes are    /* Some property combinations also acceptable. Unicode property opcodes are
3258    processed specially; the rest can be handled with a lookup table. */    processed specially; the rest can be handled with a lookup table. */
3259    
# Line 2968  for(;;) Line 3261  for(;;)
3261      {      {
3262      pcre_uint32 leftop, rightop;      pcre_uint32 leftop, rightop;
3263    
     if (list[1] != 0) return FALSE;   /* Must match at least one character */  
3264      leftop = base_list[0];      leftop = base_list[0];
3265      rightop = list[0];      rightop = list[0];
3266    
3267  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3268        accepted = FALSE; /* Always set in non-unicode case. */
3269      if (leftop == OP_PROP || leftop == OP_NOTPROP)      if (leftop == OP_PROP || leftop == OP_NOTPROP)
3270        {        {
3271        if (rightop == OP_EOD) return TRUE;        if (rightop == OP_EOD)
3272        if (rightop == OP_PROP || rightop == OP_NOTPROP)          accepted = TRUE;
3273          else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3274          {          {
3275          int n;          int n;
3276          const pcre_uint8 *p;          const pcre_uint8 *p;
# Line 2997  for(;;) Line 3291  for(;;)
3291          n = propposstab[base_list[2]][list[2]];          n = propposstab[base_list[2]][list[2]];
3292          switch(n)          switch(n)
3293            {            {
3294            case 0: return FALSE;            case 0: break;
3295            case 1: return bothprop;            case 1: accepted = bothprop; break;
3296            case 2: return (base_list[3] == list[3]) != same;            case 2: accepted = (base_list[3] == list[3]) != same; break;
3297            case 3: return !same;            case 3: accepted = !same; break;
3298    
3299            case 4:  /* Left general category, right particular category */            case 4:  /* Left general category, right particular category */
3300            return risprop && catposstab[base_list[3]][list[3]] == same;            accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3301              break;
3302    
3303            case 5:  /* Right general category, left particular category */            case 5:  /* Right general category, left particular category */
3304            return lisprop && catposstab[list[3]][base_list[3]] == same;            accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3305              break;
3306    
3307            /* This code is logically tricky. Think hard before fiddling with it.            /* This code is logically tricky. Think hard before fiddling with it.
3308            The posspropstab table has four entries per row. Each row relates to            The posspropstab table has four entries per row. Each row relates to
3309            one of PCRE's special properties such as ALNUM or SPACE or WORD.            one of PCRE's special properties such as ALNUM or SPACE or WORD.
3310            Only WORD actually needs all four entries, but using repeats for the            Only WORD actually needs all four entries, but using repeats for the
3311            others means they can all use the same code below.            others means they can all use the same code below.
3312    
3313            The first two entries in each row are Unicode general categories, and            The first two entries in each row are Unicode general categories, and
3314            apply always, because all the characters they include are part of the            apply always, because all the characters they include are part of the
3315            PCRE character set. The third and fourth entries are a general and a            PCRE character set. The third and fourth entries are a general and a
# Line 3023  for(;;) Line 3319  for(;;)
3319            category contains more characters than the specials that are defined            category contains more characters than the specials that are defined
3320            for the property being tested against. Therefore, it cannot be used            for the property being tested against. Therefore, it cannot be used
3321            in a NOTPROP case.            in a NOTPROP case.
3322    
3323            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3324            Underscore is covered by ucp_P or ucp_Po. */            Underscore is covered by ucp_P or ucp_Po. */
3325    
# Line 3031  for(;;) Line 3327  for(;;)
3327            case 7:  /* Left space vs right general category */            case 7:  /* Left space vs right general category */
3328            case 8:  /* Left word vs right general category */            case 8:  /* Left word vs right general category */
3329            p = posspropstab[n-6];            p = posspropstab[n-6];
3330            return risprop && lisprop ==            accepted = risprop && lisprop ==
3331              (list[3] != p[0] &&              (list[3] != p[0] &&
3332               list[3] != p[1] &&               list[3] != p[1] &&
3333              (list[3] != p[2] || !lisprop));              (list[3] != p[2] || !lisprop));
3334              break;
3335    
3336            case 9:   /* Right alphanum vs left general category */            case 9:   /* Right alphanum vs left general category */
3337            case 10:  /* Right space vs left general category */            case 10:  /* Right space vs left general category */
3338            case 11:  /* Right word vs left general category */            case 11:  /* Right word vs left general category */
3339            p = posspropstab[n-9];            p = posspropstab[n-9];
3340            return lisprop && risprop ==            accepted = lisprop && risprop ==
3341              (base_list[3] != p[0] &&              (base_list[3] != p[0] &&
3342               base_list[3] != p[1] &&               base_list[3] != p[1] &&
3343              (base_list[3] != p[2] || !risprop));              (base_list[3] != p[2] || !risprop));
3344              break;
3345    
3346            case 12:  /* Left alphanum vs right particular category */            case 12:  /* Left alphanum vs right particular category */
3347            case 13:  /* Left space vs right particular category */            case 13:  /* Left space vs right particular category */
3348            case 14:  /* Left word vs right particular category */            case 14:  /* Left word vs right particular category */
3349            p = posspropstab[n-12];            p = posspropstab[n-12];
3350            return risprop && lisprop ==            accepted = risprop && lisprop ==
3351              (catposstab[p[0]][list[3]] &&              (catposstab[p[0]][list[3]] &&
3352               catposstab[p[1]][list[3]] &&               catposstab[p[1]][list[3]] &&
3353              (list[3] != p[3] || !lisprop));              (list[3] != p[3] || !lisprop));
3354              break;
3355    
3356            case 15:  /* Right alphanum vs left particular category */            case 15:  /* Right alphanum vs left particular category */
3357            case 16:  /* Right space vs left particular category */            case 16:  /* Right space vs left particular category */
3358            case 17:  /* Right word vs left particular category */            case 17:  /* Right word vs left particular category */
3359            p = posspropstab[n-15];            p = posspropstab[n-15];
3360            return lisprop && risprop ==            accepted = lisprop && risprop ==
3361              (catposstab[p[0]][base_list[3]] &&              (catposstab[p[0]][base_list[3]] &&
3362               catposstab[p[1]][base_list[3]] &&               catposstab[p[1]][base_list[3]] &&
3363              (base_list[3] != p[3] || !risprop));              (base_list[3] != p[3] || !risprop));
3364              break;
3365            }            }
3366          }          }
       return FALSE;  
3367        }        }
3368    
3369      else      else
3370  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
3371    
3372      return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&      accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3373             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3374             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3375    
3376        if (!accepted)
3377          return FALSE;
3378    
3379        if (list[1] == 0) return TRUE;
3380        /* Might be an empty repeat. */
3381        continue;
3382      }      }
3383    
3384    /* Control reaches here only if one of the items is a small character list.    /* Control reaches here only if one of the items is a small character list.
# Line 3186  for(;;) Line 3492  for(;;)
3492        case OP_EOD:    /* Can always possessify before \z */        case OP_EOD:    /* Can always possessify before \z */
3493        break;        break;
3494    
3495    #ifdef SUPPORT_UCP
3496        case OP_PROP:        case OP_PROP:
3497        case OP_NOTPROP:        case OP_NOTPROP:
3498        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3499              list_ptr[0] == OP_NOTPROP))              list_ptr[0] == OP_NOTPROP))
3500          return FALSE;          return FALSE;
3501        break;        break;
3502    #endif
       /* The class comparisons work only when the class is the second item  
       of the pair, because there are at present no possessive forms of the  
       class opcodes. Note also that the "code" variable that is used below  
       points after the second item, and that the pointer for the first item  
       is not available, so even if there were possessive forms of the class  
       opcodes, the correct comparison could not be done. */  
3503    
3504        case OP_NCLASS:        case OP_NCLASS:
3505        if (chr > 255) return FALSE;        if (chr > 255) return FALSE;
3506        /* Fall through */        /* Fall through */
3507    
3508        case OP_CLASS:        case OP_CLASS:
       if (list_ptr != list) return FALSE;   /* Class is first opcode */  
3509        if (chr > 255) break;        if (chr > 255) break;
3510        if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)        class_bitset = (pcre_uint8 *)
3511          return FALSE;          ((list_ptr == list ? code : base_end) - list_ptr[2]);
3512          if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3513        break;        break;
3514    
3515  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3516        case OP_XCLASS:        case OP_XCLASS:
3517        if (list_ptr != list) return FALSE;   /* Class is first opcode */        if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3518        if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))            list_ptr[2] + LINK_SIZE, utf)) return FALSE;
         return FALSE;  
3519        break;        break;
3520  #endif  #endif
3521    
# Line 3257  auto_possessify(pcre_uchar *code, BOOL u Line 3557  auto_possessify(pcre_uchar *code, BOOL u
3557  {  {
3558  register pcre_uchar c;  register pcre_uchar c;
3559  const pcre_uchar *end;  const pcre_uchar *end;
3560    pcre_uchar *repeat_opcode;
3561  pcre_uint32 list[8];  pcre_uint32 list[8];
3562    
3563  for (;;)  for (;;)
# Line 3270  for (;;) Line 3571  for (;;)
3571        get_chr_property_list(code, utf, cd->fcc, list) : NULL;        get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3572      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3573    
3574      if (end != NULL && compare_opcodes(end, utf, cd, list))      if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3575        {        {
3576        switch(c)        switch(c)
3577          {          {
# Line 3309  for (;;) Line 3610  for (;;)
3610        }        }
3611      c = *code;      c = *code;
3612      }      }
3613      else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3614        {
3615    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3616        if (c == OP_XCLASS)
3617          repeat_opcode = code + GET(code, 1);
3618        else
3619    #endif
3620          repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3621    
3622        c = *repeat_opcode;
3623        if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3624          {
3625          /* end must not be NULL. */
3626          end = get_chr_property_list(code, utf, cd->fcc, list);
3627    
3628          list[1] = (c & 1) == 0;
3629    
3630          if (compare_opcodes(end, utf, cd, list, end))
3631            {
3632            switch (c)
3633              {
3634              case OP_CRSTAR:
3635              case OP_CRMINSTAR:
3636              *repeat_opcode = OP_CRPOSSTAR;
3637              break;
3638    
3639              case OP_CRPLUS:
3640              case OP_CRMINPLUS:
3641              *repeat_opcode = OP_CRPOSPLUS;
3642              break;
3643    
3644              case OP_CRQUERY:
3645              case OP_CRMINQUERY:
3646              *repeat_opcode = OP_CRPOSQUERY;
3647              break;
3648    
3649              case OP_CRRANGE:
3650              case OP_CRMINRANGE:
3651              *repeat_opcode = OP_CRPOSRANGE;
3652              break;
3653              }
3654            }
3655          }
3656        c = *code;
3657        }
3658    
3659    switch(c)    switch(c)
3660      {      {
# Line 3335  for (;;) Line 3681  for (;;)
3681        code += 2;        code += 2;
3682      break;      break;
3683    
3684    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3685      case OP_XCLASS:      case OP_XCLASS:
3686      code += GET(code, 1);      code += GET(code, 1);
3687      break;      break;
3688    #endif
3689    
3690      case OP_MARK:      case OP_MARK:
3691      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
# Line 3446  class, but [abc[:x\]pqr:]] is (so that a Line 3794  class, but [abc[:x\]pqr:]] is (so that a
3794  below handles the special case of \], but does not try to do any other escape  below handles the special case of \], but does not try to do any other escape
3795  processing. This makes it different from Perl for cases such as [:l\ower:]  processing. This makes it different from Perl for cases such as [:l\ower:]
3796  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3797  "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3798  I think.  I think.
3799    
3800  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
# Line 4201  for (;; ptr++) Line 4549  for (;; ptr++)
4549        }        }
4550      }      }
4551    
   /* Fill in length of a previous callout, except when the next thing is  
   a quantifier. */  
   
4552    is_quantifier =    is_quantifier =
4553      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4554      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4555    
4556    if (!is_quantifier && previous_callout != NULL &&    /* Fill in length of a previous callout, except when the next thing is a
4557      quantifier or when processing a property substitution string in UCP mode. */
4558    
4559      if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4560         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
4561      {      {
4562      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
# Line 4239  for (;; ptr++) Line 4587  for (;; ptr++)
4587        }        }
4588      }      }
4589    
4590    /* No auto callout for quantifiers. */    /* No auto callout for quantifiers, or while processing property strings that
4591      are substituted for \w etc in UCP mode. */
4592    
4593    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4594      {      {
4595      previous_callout = code;      previous_callout = code;
4596      code = auto_callout(code, ptr, cd);      code = auto_callout(code, ptr, cd);
# Line 4493  for (;; ptr++) Line 4842  for (;; ptr++)
4842            posix_class = 0;            posix_class = 0;
4843    
4844          /* When PCRE_UCP is set, some of the POSIX classes are converted to          /* When PCRE_UCP is set, some of the POSIX classes are converted to
4845          different escape sequences that use Unicode properties. */          different escape sequences that use Unicode properties \p or \P. Others
4846            that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4847            directly. */
4848    
4849  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4850          if ((options & PCRE_UCP) != 0)          if ((options & PCRE_UCP) != 0)
4851            {            {
4852              unsigned int ptype = 0;
4853            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4854    
4855              /* The posix_substitutes table specifies which POSIX classes can be
4856              converted to \p or \P items. */
4857    
4858            if (posix_substitutes[pc] != NULL)            if (posix_substitutes[pc] != NULL)
4859              {              {
4860              nestptr = tempptr + 1;              nestptr = tempptr + 1;
4861              ptr = posix_substitutes[pc] - 1;              ptr = posix_substitutes[pc] - 1;
4862              continue;              continue;
4863              }              }
4864    
4865              /* There are three other classes that generate special property calls
4866              that are recognized only in an XCLASS. */
4867    
4868              else switch(posix_class)
4869                {
4870                case PC_GRAPH:
4871                ptype = PT_PXGRAPH;
4872                /* Fall through */
4873                case PC_PRINT:
4874                if (ptype == 0) ptype = PT_PXPRINT;
4875                /* Fall through */
4876                case PC_PUNCT:
4877                if (ptype == 0) ptype = PT_PXPUNCT;
4878                *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4879                *class_uchardata++ = ptype;
4880                *class_uchardata++ = 0;
4881                ptr = tempptr + 1;
4882                continue;
4883    
4884                /* For all other POSIX classes, no special action is taken in UCP
4885                mode. Fall through to the non-UCP case. */
4886    
4887                default:
4888                break;
4889                }
4890            }            }
4891  #endif  #endif
4892          /* In the non-UCP case, we build the bit map for the POSIX class in a          /* In the non-UCP case, or when UCP makes no difference, we build the
4893          chunk of local store because we may be adding and subtracting from it,          bit map for the POSIX class in a chunk of local store because we may be
4894          and we don't want to subtract bits that may be in the main map already.          adding and subtracting from it, and we don't want to subtract bits that
4895          At the end we or the result into the bit map that is being built. */          may be in the main map already. At the end we or the result into the
4896            bit map that is being built. */
4897    
4898          posix_class *= 3;          posix_class *= 3;
4899    
# Line 4627  for (;; ptr++) Line 5010  for (;; ptr++)
5010              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5011              continue;              continue;
5012    
5013              /* Perl 5.004 onwards omits VT from \s, but we must preserve it              /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5014              if it was previously set by something earlier in the character              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5015              class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and              previously set by something earlier in the character class.
5016              EBCDIC, so we lazily just adjust the appropriate bit. */              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5017                we could just adjust the appropriate bit. From PCRE 8.34 we no
5018                longer treat \s and \S specially. */
5019    
5020              case ESC_s:              case ESC_s:
5021              classbits[0] |= cbits[cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
             classbits[1] |= cbits[cbit_space+1] & ~0x08;  
             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];  
5022              continue;              continue;
5023    
5024              case ESC_S:              case ESC_S:
5025              should_flip_negation = TRUE;              should_flip_negation = TRUE;
5026              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */  
5027              continue;              continue;
5028    
5029              /* The rest apply in both UCP and non-UCP cases. */              /* The rest apply in both UCP and non-UCP cases. */
# Line 4762  for (;; ptr++) Line 5144  for (;; ptr++)
5144          else          else
5145  #endif  #endif
5146          d = *ptr;  /* Not UTF-8 mode */          d = *ptr;  /* Not UTF-8 mode */
5147    
5148          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape
5149          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          sequence, but not any of the other escapes. Perl treats a hyphen as a
5150          in such circumstances. */          literal in such circumstances. However, in Perl's warning mode, a
5151            warning is given, so PCRE now faults it as it is almost certainly a
5152          if (!inescq && d == CHAR_BACKSLASH)          mistake on the user's part. */
5153            {  
5154            int descape;          if (!inescq)
5155            descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);            {
5156            if (*errorcodeptr != 0) goto FAILED;            if (d == CHAR_BACKSLASH)
   
           /* \b is backspace; any other special means the '-' was literal. */  
   
           if (descape != 0)  
5157              {              {
5158              if (descape == ESC_b) d = CHAR_BS; else              int descape;
5159                descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5160                if (*errorcodeptr != 0) goto FAILED;
5161    
5162                /* 0 means a character was put into d; \b is backspace; any other
5163                special causes an error. */
5164    
5165                if (descape != 0)
5166                {                {
5167                ptr = oldptr;                if (descape == ESC_b) d = CHAR_BS; else
5168                goto CLASS_SINGLE_CHARACTER;  /* A few lines below */                  {
5169                    *errorcodeptr = ERR83;
5170                    goto FAILED;
5171                    }
5172                }                }
5173              }              }
5174            }  
5175              /* A hyphen followed by a POSIX class is treated in the same way. */
5176    
5177              else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5178                       (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5179                        ptr[1] == CHAR_EQUALS_SIGN) &&
5180                       check_posix_syntax(ptr, &tempptr))
5181                {
5182                *errorcodeptr = ERR83;
5183                goto FAILED;
5184                }
5185              }
5186    
5187          /* Check that the two values are in the correct order. Optimize          /* Check that the two values are in the correct order. Optimize
5188          one-character ranges. */          one-character ranges. */
# Line 5338  for (;; ptr++) Line 5737  for (;; ptr++)
5737      opcodes such as BRA and CBRA, as this is the place where they get converted      opcodes such as BRA and CBRA, as this is the place where they get converted
5738      into the more special varieties such as BRAPOS and SBRA. A test for >=      into the more special varieties such as BRAPOS and SBRA. A test for >=
5739      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5740      ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow      ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5741      repetition of assertions, but now it does, for Perl compatibility. */      Originally, PCRE did not allow repetition of assertions, but now it does,
5742        for Perl compatibility. */
5743    
5744      else if (*previous >= OP_ASSERT && *previous <= OP_COND)      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5745        {        {
# Line 5357  for (;; ptr++) Line 5757  for (;; ptr++)
5757        /* There is no sense in actually repeating assertions. The only potential        /* There is no sense in actually repeating assertions. The only potential
5758        use of repetition is in cases when the assertion is optional. Therefore,        use of repetition is in cases when the assertion is optional. Therefore,
5759        if the minimum is greater than zero, just ignore the repeat. If the        if the minimum is greater than zero, just ignore the repeat. If the
5760        maximum is not not zero or one, set it to 1. */        maximum is not zero or one, set it to 1. */
5761    
5762        if (*previous < OP_ONCE)    /* Assertion */        if (*previous < OP_ONCE)    /* Assertion */
5763          {          {
# Line 5730  for (;; ptr++) Line 6130  for (;; ptr++)
6130        goto FAILED;        goto FAILED;
6131        }        }
6132    
6133      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', possessive_quantifier is
6134      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,      TRUE. For some opcodes, there are special alternative opcodes for this
6135      there are special alternative opcodes for this case. For anything else, we      case. For anything else, we wrap the entire repeated item inside OP_ONCE
6136      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'      brackets. Logically, the '+' notation is just syntactic sugar, taken from
6137      notation is just syntactic sugar, taken from Sun's Java package, but the      Sun's Java package, but the special opcodes can optimize it.
     special opcodes can optimize it.  
6138    
6139      Some (but not all) possessively repeated subpatterns have already been      Some (but not all) possessively repeated subpatterns have already been
6140      completely handled in the code just above. For them, possessive_quantifier      completely handled in the code just above. For them, possessive_quantifier
6141      is always FALSE at this stage.      is always FALSE at this stage. Note that the repeated item starts at
6142        tempcode, not at previous, which might be the first part of a string whose
6143      Note that the repeated item starts at tempcode, not at previous, which      (former) last char we repeated. */
     might be the first part of a string whose (former) last char we repeated.  
   
     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But  
     an 'upto' may follow. We skip over an 'exact' item, and then test the  
     length of what remains before proceeding. */  
6144    
6145      if (possessive_quantifier)      if (possessive_quantifier)
6146        {        {
6147        int len;        int len;
6148    
6149        if (*tempcode == OP_TYPEEXACT)        /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6150          However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6151          {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6152          remains is greater than zero, there's a further opcode that can be
6153          handled. If not, do nothing, leaving the EXACT alone. */
6154    
6155          switch(*tempcode)
6156            {
6157            case OP_TYPEEXACT:
6158          tempcode += PRIV(OP_lengths)[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
6159            ((tempcode[1 + IMM2_SIZE] == OP_PROP            ((tempcode[1 + IMM2_SIZE] == OP_PROP
6160            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6161            break;
6162    
6163        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)          /* CHAR opcodes are used for exacts whose count is 1. */
6164          {  
6165            case OP_CHAR:
6166            case OP_CHARI:
6167            case OP_NOT:
6168            case OP_NOTI:
6169            case OP_EXACT:
6170            case OP_EXACTI:
6171            case OP_NOTEXACT:
6172            case OP_NOTEXACTI:
6173          tempcode += PRIV(OP_lengths)[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
6174  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6175          if (utf && HAS_EXTRALEN(tempcode[-1]))          if (utf && HAS_EXTRALEN(tempcode[-1]))
6176            tempcode += GET_EXTRALEN(tempcode[-1]);            tempcode += GET_EXTRALEN(tempcode[-1]);
6177  #endif  #endif
6178            break;
6179    
6180            /* For the class opcodes, the repeat operator appears at the end;
6181            adjust tempcode to point to it. */
6182    
6183            case OP_CLASS:
6184            case OP_NCLASS:
6185            tempcode += 1 + 32/sizeof(pcre_uchar);
6186            break;
6187    
6188    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6189            case OP_XCLASS:
6190            tempcode += GET(tempcode, 1);
6191            break;
6192    #endif
6193          }          }
6194    
6195          /* If tempcode is equal to code (which points to the end of the repeated
6196          item), it means we have skipped an EXACT item but there is no following
6197          QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6198          all other cases, tempcode will be pointing to the repeat opcode, and will
6199          be less than code, so the value of len will be greater than 0. */
6200    
6201        len = (int)(code - tempcode);        len = (int)(code - tempcode);
6202          if (len > 0)
6203            {
6204            unsigned int repcode = *tempcode;
6205    
6206            /* There is a table for possessifying opcodes, all of which are less
6207            than OP_CALLOUT. A zero entry means there is no possessified version.
6208            */
6209    
6210            if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6211              *tempcode = opcode_possessify[repcode];
6212    
6213            /* For opcode without a special possessified version, wrap the item in
6214            ONCE brackets. Because we are moving code along, we must ensure that any
6215            pending recursive references are updated. */
6216    
6217            else
6218              {
6219              *code = OP_END;
6220              adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6221              memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6222              code += 1 + LINK_SIZE;
6223              len += 1 + LINK_SIZE;
6224              tempcode[0] = OP_ONCE;
6225              *code++ = OP_KET;
6226              PUTINC(code, 0, len);
6227              PUT(tempcode, 1, len);
6228              }
6229            }
6230    
6231    #ifdef NEVER
6232        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
6233          {          {
6234          case OP_STAR:  *tempcode = OP_POSSTAR; break;          case OP_STAR:  *tempcode = OP_POSSTAR; break;
# Line 5794  for (;; ptr++) Line 6256  for (;; ptr++)
6256          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6257          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6258    
6259            case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6260            case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6261            case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6262            case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6263    
6264          /* Because we are moving code along, we must ensure that any          /* Because we are moving code along, we must ensure that any
6265          pending recursive references are updated. */          pending recursive references are updated. */
6266    
# Line 5809  for (;; ptr++) Line 6276  for (;; ptr++)
6276          PUT(tempcode, 1, len);          PUT(tempcode, 1, len);
6277          break;          break;
6278          }          }
6279    #endif
6280        }        }
6281    
6282      /* In all case we no longer have a previous item. We also set the      /* In all case we no longer have a previous item. We also set the
# Line 6031  for (;; ptr++) Line 6499  for (;; ptr++)
6499                 tempptr[2] == CHAR_LESS_THAN_SIGN))                 tempptr[2] == CHAR_LESS_THAN_SIGN))
6500            break;            break;
6501    
6502          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6503          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6504    
6505          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
6506          skipbytes = 1+IMM2_SIZE;          skipbytes = 1+IMM2_SIZE;
# Line 6048  for (;; ptr++) Line 6516  for (;; ptr++)
6516            }            }
6517    
6518          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6519          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6520            syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6521            consist entirely of digits, there is scope for ambiguity. */
6522    
6523          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6524            {            {
# Line 6066  for (;; ptr++) Line 6536  for (;; ptr++)
6536            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6537            }            }
6538    
6539          /* We now expect to read a name; any thing else is an error */          /* When a name is one of a number of duplicates, a different opcode is
6540            used and it needs more memory. Unfortunately we cannot tell whether a
6541            name is a duplicate in the first pass, so we have to allow for more
6542            memory except when we know it is a relative numerical reference. */
6543    
6544            if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6545    
6546            /* We now expect to read a name (possibly all digits); any thing else
6547            is an error. In the case of all digits, also get it as a number. */
6548    
6549          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6550            {            {
# Line 6075  for (;; ptr++) Line 6553  for (;; ptr++)
6553            goto FAILED;            goto FAILED;
6554            }            }
6555    
         /* Read the name, but also get it as a number if it's all digits */  
   
6556          recno = 0;          recno = 0;
6557          name = ++ptr;          name = ++ptr;
6558          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
# Line 6087  for (;; ptr++) Line 6563  for (;; ptr++)
6563            }            }
6564          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6565    
6566            /* Check the terminator */
6567    
6568          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6569              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6570            {            {
# Line 6122  for (;; ptr++) Line 6600  for (;; ptr++)
6600            }            }
6601    
6602          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
6603          name. If we find a name, add one to the opcode to change OP_CREF or          name. */
         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,  
         except they record that the reference was originally to a name. The  
         information is used to check duplicate names. */  
6604    
6605          slot = cd->name_table;          slot = cd->name_table;
6606          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 6134  for (;; ptr++) Line 6609  for (;; ptr++)
6609            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6610            }            }
6611    
6612          /* Found the named subpattern */          /* Found the named subpattern. If the name is duplicated, add one to
6613            the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6614            appropriate data values. Otherwise, just insert the unique subpattern
6615            number. */
6616    
6617          if (i < cd->names_found)          if (i < cd->names_found)
6618            {            {
6619            recno = GET2(slot, 0);            int offset = i++;
6620            PUT2(code, 2+LINK_SIZE, recno);            int count = 1;
6621            code[1+LINK_SIZE]++;            recno = GET2(slot, 0);   /* Number from first found */
6622              for (; i < cd->names_found; i++)
6623                {
6624                slot += cd->name_entry_size;
6625                if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6626                count++;
6627                }
6628              if (count > 1)
6629                {
6630                PUT2(code, 2+LINK_SIZE, offset);
6631                PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6632                skipbytes += IMM2_SIZE;
6633                code[1+LINK_SIZE]++;
6634                }
6635              else  /* Not a duplicated name */
6636                {
6637                PUT2(code, 2+LINK_SIZE, recno);
6638                }
6639            }            }
6640    
6641          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
# Line 6209  for (;; ptr++) Line 6704  for (;; ptr++)
6704          ptr++;          ptr++;
6705          break;          break;
6706    
6707            /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6708            thing to do, but Perl allows all assertions to be quantified, and when
6709            they contain capturing parentheses there may be a potential use for
6710            this feature. Not that that applies to a quantified (?!) but we allow
6711            it for uniformity. */
6712    
6713          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
6714          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6715          ptr++;          ptr++;
6716          if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */          if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6717                 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6718                (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6719            {            {
6720            *code++ = OP_FAIL;            *code++ = OP_FAIL;
6721            previous = NULL;            previous = NULL;
# Line 6815  for (;; ptr++) Line 7317  for (;; ptr++)
7317        skipbytes = IMM2_SIZE;        skipbytes = IMM2_SIZE;
7318        }        }
7319    
7320      /* Process nested bracketed regex. Assertions used not to be repeatable,      /* Process nested bracketed regex. First check for parentheses nested too
7321      but this was changed for Perl compatibility, so all kinds can now be      deeply. */
7322      repeated. We copy code into a non-register variable (tempcode) in order to  
7323      be able to pass its address because some compilers complain otherwise. */      if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7324          {
7325          *errorcodeptr = ERR82;
7326          goto FAILED;
7327          }
7328    
7329        /* Assertions used not to be repeatable, but this was changed for Perl
7330        compatibility, so all kinds can now be repeated. We copy code into a
7331        non-register variable (tempcode) in order to be able to pass its address
7332        because some compilers complain otherwise. */
7333    
7334      previous = code;                      /* For handling repetition */      previous = code;                      /* For handling repetition */
7335      *code = bravalue;      *code = bravalue;
# Line 6848  for (;; ptr++) Line 7359  for (;; ptr++)
7359             &length_prevgroup              /* Pre-compile phase */             &length_prevgroup              /* Pre-compile phase */
7360           ))           ))
7361        goto FAILED;        goto FAILED;
7362    
7363        cd->parens_depth -= 1;
7364    
7365      /* If this was an atomic group and there are no capturing groups within it,      /* If this was an atomic group and there are no capturing groups within it,
7366      generate OP_ONCE_NC instead of OP_ONCE. */      generate OP_ONCE_NC instead of OP_ONCE. */
# Line 7830  do { Line 8343  do {
8343       switch (*scode)       switch (*scode)
8344         {         {
8345         case OP_CREF:         case OP_CREF:
8346         case OP_NCREF:         case OP_DNCREF:
8347         case OP_RREF:         case OP_RREF:
8348         case OP_NRREF:         case OP_DNRREF:
8349         case OP_DEF:         case OP_DEF:
8350         return FALSE;         return FALSE;
8351    
# Line 8423  cd->named_group_list_size = NAMED_GROUP_ Line 8936  cd->named_group_list_size = NAMED_GROUP_
8936  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
8937  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
8938  cd->req_varyopt = 0;  cd->req_varyopt = 0;
8939    cd->parens_depth = 0;
8940  cd->assert_depth = 0;  cd->assert_depth = 0;
8941  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
8942  cd->external_options = options;  cd->external_options = options;
# Line 8508  field; this time it's used for rememberi Line 9022  field; this time it's used for rememberi
9022  */  */
9023    
9024  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9025    cd->parens_depth = 0;
9026  cd->assert_depth = 0;  cd->assert_depth = 0;
9027  cd->bracount = 0;  cd->bracount = 0;
9028  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
# Line 8837  return (pcre32 *)re; Line 9352  return (pcre32 *)re;
9352  }  }
9353    
9354  /* End of pcre_compile.c */  /* End of pcre_compile.c */
9355    

Legend:
Removed from v.1363  
changed lines
  Added in v.1393

  ViewVC Help
Powered by ViewVC 1.1.5