/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 1363 by ph10, Tue Oct 1 16:54:40 2013 UTC revision 1380 by ph10, Tue Oct 15 16:49:12 2013 UTC
# Line 462  static const char error_texts[] = Line 462  static const char error_texts[] =
462    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
463    "this version of PCRE is compiled without UTF support\0"    "this version of PCRE is compiled without UTF support\0"
464    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
465    "character value in \\x{...} sequence is too large\0"    "character value in \\x{} or \\o{} is too large\0"
466    /* 35 */    /* 35 */
467    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
468    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
# Line 516  static const char error_texts[] = Line 516  static const char error_texts[] =
516    "character value in \\u.... sequence is too large\0"    "character value in \\u.... sequence is too large\0"
517    "invalid UTF-32 string\0"    "invalid UTF-32 string\0"
518    "setting UTF is disabled by the application\0"    "setting UTF is disabled by the application\0"
519      "non-hex character in \\x{} (closing brace missing?)\0"
520      /* 80 */
521      "non-octal character in \\o{} (closing brace missing?)\0"
522      "missing opening brace after \\o\0"
523    ;    ;
524    
525  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 773  static const pcre_uint8 posspropstab[3][ Line 777  static const pcre_uint8 posspropstab[3][
777    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
778  };  };
779    
780    /* This table is used when converting repeating opcodes into possessified
781    versions as a result of an explicit possessive quantifier such as ++. A zero
782    value means there is no possessified version - in those cases the item in
783    question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
784    because all relevant opcodes are less than that. */
785    
786    static const pcre_uint8 opcode_possessify[] = {
787      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
788      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
789    
790      0,                       /* NOTI */
791      OP_POSSTAR, 0,           /* STAR, MINSTAR */
792      OP_POSPLUS, 0,           /* PLUS, MINPLUS */
793      OP_POSQUERY, 0,          /* QUERY, MINQUERY */
794      OP_POSUPTO, 0,           /* UPTO, MINUPTO */
795      0,                       /* EXACT */
796      0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
797    
798      OP_POSSTARI, 0,          /* STARI, MINSTARI */
799      OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
800      OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
801      OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
802      0,                       /* EXACTI */
803      0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
804    
805      OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
806      OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
807      OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
808      OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
809      0,                       /* NOTEXACT */
810      0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
811    
812      OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
813      OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
814      OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
815      OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
816      0,                       /* NOTEXACTI */
817      0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
818    
819      OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
820      OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
821      OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
822      OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
823      0,                       /* TYPEEXACT */
824      0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
825    
826      OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
827      OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
828      OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
829      OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
830      0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
831    
832      0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
833      0, 0,                    /* REF, REFI */
834      0, 0,                    /* DNREF, DNREFI */
835      0, 0                     /* RECURSE, CALLOUT */
836    };
837    
838    
839    
840  /*************************************************  /*************************************************
# Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 941  return (*p == CHAR_RIGHT_CURLY_BRACKET);
941  *************************************************/  *************************************************/
942    
943  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
944  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
945  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
946  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
947  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
948  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
949    
950  Arguments:  Arguments:
951    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
952    chptr          points to the data character    chptr          points to a returned data character
953    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
954    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
955    options        the options bits    options        the options bits
# Line 1092  else Line 1153  else
1153      break;      break;
1154    
1155      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1156      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1157      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1158        recommended to avoid the ambiguities in the old syntax.
1159    
1160      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1161      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1162      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1163      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1164      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1165      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1166      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1167    
1168        Inside a character class, \ followed by a digit is always either a literal
1169        8 or 9 or an octal number. */
1170    
1171      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1172      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1128  else Line 1193  else
1193          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1194          break;          break;
1195          }          }
1196        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1197          {          {
1198          escape = -s;          escape = -s;
1199          break;          break;
# Line 1136  else Line 1201  else
1201        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1202        }        }
1203    
1204      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1205      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1206      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1207        changed so as not to insert the binary zero. */
1208    
1209      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1210        {  
1211        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1212    
1213      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1214      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1162  else Line 1225  else
1225  #endif  #endif
1226      break;      break;
1227    
1228      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1229      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.      specifying character codes in octal. The only supported form is \o{ddd}. */
1230      If not, { is treated as a data character. */  
1231        case CHAR_o:
1232        if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1233          {
1234          ptr += 2;
1235          c = 0;
1236          overflow = FALSE;
1237          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1238            {
1239            register pcre_uint32 cc = *ptr++;
1240            if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1241    #ifdef COMPILE_PCRE32
1242            if (c >= 0x20000000l) { overflow = TRUE; break; }
1243    #endif
1244            c = (c << 3) + cc - CHAR_0 ;
1245    #if defined COMPILE_PCRE8
1246            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1247    #elif defined COMPILE_PCRE16
1248            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1249    #elif defined COMPILE_PCRE32
1250            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1251    #endif
1252            }
1253          if (overflow)
1254            {
1255            while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1256            *errorcodeptr = ERR34;
1257            }
1258          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1259            {
1260            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1261            }
1262          else *errorcodeptr = ERR80;
1263          }
1264        break;
1265    
1266        /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1267        digits. Otherwise it is a lowercase x letter. */
1268    
1269      case CHAR_x:      case CHAR_x:
1270      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1271        {        {
       /* In JavaScript, \x must be followed by two hexadecimal numbers.  
       Otherwise it is a lowercase x letter. */  
1272        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1273          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1274          {          {
# Line 1187  else Line 1285  else
1285  #endif  #endif
1286            }            }
1287          }          }
1288        break;        }    /* End JavaScript handling */
       }  
1289    
1290      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1291        {      greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1292        const pcre_uchar *pt = ptr + 2;      digits. If not, { used to be treated as a data character. However, Perl
1293        seems to read hex digits up to the first non-such, and ignore the rest, so
1294        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1295        now gives an error. */
1296    
1297        c = 0;      else
1298        overflow = FALSE;        {
1299        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1300          {          {
1301          register pcre_uint32 cc = *pt++;          ptr += 2;
1302          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          c = 0;
1303            overflow = FALSE;
1304            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1305              {
1306              register pcre_uint32 cc = *ptr++;
1307              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1308    
1309  #ifdef COMPILE_PCRE32  #ifdef COMPILE_PCRE32
1310          if (c >= 0x10000000l) { overflow = TRUE; break; }            if (c >= 0x10000000l) { overflow = TRUE; break; }
1311  #endif  #endif
1312    
1313  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1314          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1315          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1316  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1317          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1318          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1319  #endif  #endif
1320    
1321  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
1322          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1323  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
1324          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1325  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
1326          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1327  #endif  #endif
1328          }            }
1329    
1330        if (overflow)          if (overflow)
1331          {            {
1332          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1333          *errorcodeptr = ERR34;            *errorcodeptr = ERR34;
1334          }            }
1335    
1336        if (*pt == CHAR_RIGHT_CURLY_BRACKET)          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1337          {            {
1338          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1339          ptr = pt;            }
         break;  
         }  
1340    
1341        /* If the sequence of hex digits does not end with '}', then we don't          /* If the sequence of hex digits does not end with '}', give an error.
1342        recognize this construct; fall through to the normal \x handling. */          We used just to recognize this construct and fall through to the normal
1343        }          \x handling, but nowadays Perl gives an error, which seems much more
1344            sensible, so we do too. */
1345    
1346      /* Read just a single-byte hex-defined char */          else *errorcodeptr = ERR79;
1347            }   /* End of \x{} processing */
1348    
1349      c = 0;        /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1350      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)  
1351        {        else
1352        pcre_uint32 cc;                          /* Some compilers don't like */          {
1353        cc = *(++ptr);                           /* ++ in initializers */          c = 0;
1354            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1355              {
1356              pcre_uint32 cc;                          /* Some compilers don't like */
1357              cc = *(++ptr);                           /* ++ in initializers */
1358  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1359        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1360        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1361  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1362        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */            if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1363        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1364  #endif  #endif
1365        }            }
1366            }     /* End of \xdd handling */
1367          }       /* End of Perl-style \x handling */
1368      break;      break;
1369    
1370      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
# Line 1524  for (;;) Line 1635  for (;;)
1635    
1636      case OP_CALLOUT:      case OP_CALLOUT:
1637      case OP_CREF:      case OP_CREF:
1638      case OP_NCREF:      case OP_DNCREF:
1639      case OP_RREF:      case OP_RREF:
1640      case OP_NRREF:      case OP_DNRREF:
1641      case OP_DEF:      case OP_DEF:
1642      code += PRIV(OP_lengths)[*code];      code += PRIV(OP_lengths)[*code];
1643      break;      break;
# Line 1663  for (;;) Line 1774  for (;;)
1774      case OP_COMMIT:      case OP_COMMIT:
1775      case OP_CREF:      case OP_CREF:
1776      case OP_DEF:      case OP_DEF:
1777        case OP_DNCREF:
1778        case OP_DNRREF:
1779      case OP_DOLL:      case OP_DOLL:
1780      case OP_DOLLM:      case OP_DOLLM:
1781      case OP_EOD:      case OP_EOD:
1782      case OP_EODN:      case OP_EODN:
1783      case OP_FAIL:      case OP_FAIL:
     case OP_NCREF:  
     case OP_NRREF:  
1784      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1785      case OP_PRUNE:      case OP_PRUNE:
1786      case OP_REVERSE:      case OP_REVERSE:
# Line 1764  for (;;) Line 1875  for (;;)
1875    
1876      switch (*cc)      switch (*cc)
1877        {        {
       case OP_CRPLUS:  
       case OP_CRMINPLUS:  
1878        case OP_CRSTAR:        case OP_CRSTAR:
1879        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1880          case OP_CRPLUS:
1881          case OP_CRMINPLUS:
1882        case OP_CRQUERY:        case OP_CRQUERY:
1883        case OP_CRMINQUERY:        case OP_CRMINQUERY:
1884          case OP_CRPOSSTAR:
1885          case OP_CRPOSPLUS:
1886          case OP_CRPOSQUERY:
1887        return -1;        return -1;
1888    
1889        case OP_CRRANGE:        case OP_CRRANGE:
1890        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1891          case OP_CRPOSRANGE:
1892        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1893        branchlength += (int)GET2(cc,1);        branchlength += (int)GET2(cc,1);
1894        cc += 1 + 2 * IMM2_SIZE;        cc += 1 + 2 * IMM2_SIZE;
# Line 2366  for (code = first_significant_code(code Line 2481  for (code = first_significant_code(code
2481        case OP_CRMINSTAR:        case OP_CRMINSTAR:
2482        case OP_CRQUERY:        case OP_CRQUERY:
2483        case OP_CRMINQUERY:        case OP_CRMINQUERY:
2484          case OP_CRPOSSTAR:
2485          case OP_CRPOSQUERY:
2486        break;        break;
2487    
2488        default:                   /* Non-repeat => class must match */        default:                   /* Non-repeat => class must match */
2489        case OP_CRPLUS:            /* These repeats aren't empty */        case OP_CRPLUS:            /* These repeats aren't empty */
2490        case OP_CRMINPLUS:        case OP_CRMINPLUS:
2491          case OP_CRPOSPLUS:
2492        return FALSE;        return FALSE;
2493    
2494        case OP_CRRANGE:        case OP_CRRANGE:
2495        case OP_CRMINRANGE:        case OP_CRMINRANGE:
2496          case OP_CRPOSRANGE:
2497        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2498        break;        break;
2499        }        }
# Line 2650  switch(ptype) Line 2769  switch(ptype)
2769    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2770            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2771    
2772    case PT_SPACE:    /* Perl space */    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2773    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    means that Perl space and POSIX space are now identical. PCRE was changed
2774            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)    at release 8.34. */
           == negated;  
2775    
2776      case PT_SPACE:    /* Perl space */
2777    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2778    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    switch(c)
2779            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||      {
2780            c == CHAR_FF || c == CHAR_CR)      HSPACE_CASES:
2781            == negated;      VSPACE_CASES:
2782        return negated;
2783    
2784        default:
2785        return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2786        }
2787      break;  /* Control never reaches here */
2788    
2789    case PT_WORD:    case PT_WORD:
2790    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
# Line 2818  switch(c) Line 2943  switch(c)
2943      return code + 2;      return code + 2;
2944      }      }
2945    
2946    /* Convert only if we have anough space. */    /* Convert only if we have enough space. */
2947    
2948    clist_src = PRIV(ucd_caseless_sets) + code[1];    clist_src = PRIV(ucd_caseless_sets) + code[1];
2949    clist_dest = list + 2;    clist_dest = list + 2;
2950    code += 2;    code += 2;
2951    
2952    do {    do {
      /* Early return if there is not enough space. */  
2953       if (clist_dest >= list + 8)       if (clist_dest >= list + 8)
2954         {         {
2955           /* Early return if there is not enough space. This should never
2956           happen, since all clists are shorter than 5 characters now. */
2957         list[2] = code[0];         list[2] = code[0];
2958         list[3] = code[1];         list[3] = code[1];
2959         return code;         return code;
2960         }         }
2961       *clist_dest++ = *clist_src;       *clist_dest++ = *clist_src;
2962       }       }
2963     while(*clist_src++ != NOTACHAR);    while(*clist_src++ != NOTACHAR);
2964    
2965    /* Enough space to store all characters. */    /* All characters are stored. The terminating NOTACHAR
2966      is copied from the clist itself. */
2967    
2968    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2969    return code;    return code;
# Line 2859  switch(c) Line 2986  switch(c)
2986      case OP_CRMINSTAR:      case OP_CRMINSTAR:
2987      case OP_CRQUERY:      case OP_CRQUERY:
2988      case OP_CRMINQUERY:      case OP_CRMINQUERY:
2989        case OP_CRPOSSTAR:
2990        case OP_CRPOSQUERY:
2991      list[1] = TRUE;      list[1] = TRUE;
2992      end++;      end++;
2993      break;      break;
2994    
2995        case OP_CRPLUS:
2996        case OP_CRMINPLUS:
2997        case OP_CRPOSPLUS:
2998        end++;
2999        break;
3000    
3001      case OP_CRRANGE:      case OP_CRRANGE:
3002      case OP_CRMINRANGE:      case OP_CRMINRANGE:
3003        case OP_CRPOSRANGE:
3004      list[1] = (GET2(end, 1) == 0);      list[1] = (GET2(end, 1) == 0);
3005      end += 1 + 2 * IMM2_SIZE;      end += 1 + 2 * IMM2_SIZE;
3006      break;      break;
# Line 2895  Returns:      TRUE if the auto-possessif Line 3031  Returns:      TRUE if the auto-possessif
3031    
3032  static BOOL  static BOOL
3033  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3034    const pcre_uint32* base_list)    const pcre_uint32* base_list, const pcre_uchar *base_end)
3035  {  {
3036  pcre_uchar c;  pcre_uchar c;
3037  pcre_uint32 list[8];  pcre_uint32 list[8];
3038  const pcre_uint32* chr_ptr;  const pcre_uint32* chr_ptr;
3039  const pcre_uint32* ochr_ptr;  const pcre_uint32* ochr_ptr;
3040  const pcre_uint32* list_ptr;  const pcre_uint32* list_ptr;
3041    const pcre_uchar *next_code;
3042    const pcre_uint8 *class_bits;
3043  pcre_uint32 chr;  pcre_uint32 chr;
3044    
3045    /* Note: base_list[1] records whether the current opcode has a greedy
3046    quantifier (represented by a non-zero value). This is different from
3047    other character type lists, which store here whether the character iterator
3048    matches an empty string (also represented by a non-zero value). */
3049    
3050  for(;;)  for(;;)
3051    {    {
3052    c = *code;    c = *code;
# Line 2925  for(;;) Line 3068  for(;;)
3068    switch(c)    switch(c)
3069      {      {
3070      case OP_END:      case OP_END:
3071      /* TRUE only in greedy case. The non-greedy case could be replaced by an      case OP_KETRPOS:
3072      OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses      /* TRUE only in greedy case. The non-greedy case could be replaced by
3073      more memory, which we cannot get at this stage.) */      an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3074        uses more memory, which we cannot get at this stage.) */
3075    
3076      return base_list[1] != 0;      return base_list[1] != 0;
3077    
3078      case OP_KET:      case OP_KET:
3079      /* If the bracket is capturing, and referenced by an OP_RECURSE, the      /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3080      non-greedy case cannot be converted to a possessive form. We do not test      it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3081      the bracket type at the moment, but we might do it in the future to improve      cannot be converted to a possessive form. */
     this condition. (But note that recursive calls are always atomic.) */  
3082    
3083      if (base_list[1] == 0) return FALSE;      if (base_list[1] == 0) return FALSE;
3084    
3085        switch(*(code - GET(code, 1)))
3086          {
3087          case OP_ASSERT:
3088          case OP_ASSERT_NOT:
3089          case OP_ASSERTBACK:
3090          case OP_ASSERTBACK_NOT:
3091          case OP_ONCE:
3092          case OP_ONCE_NC:
3093          /* Atomic sub-patterns and assertions can always auto-possessify their
3094          last iterator. */
3095          return TRUE;
3096          }
3097    
3098        code += PRIV(OP_lengths)[c];
3099        continue;
3100    
3101        case OP_ONCE:
3102        case OP_ONCE_NC:
3103        case OP_BRA:
3104        case OP_CBRA:
3105        next_code = code;
3106        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3107    
3108        /* We do not support repeated brackets, because they can lead to
3109        infinite recursion. */
3110    
3111        if (*next_code != OP_KET) return FALSE;
3112    
3113        next_code = code + GET(code, 1);
3114        code += PRIV(OP_lengths)[c];
3115    
3116        while (*next_code == OP_ALT)
3117          {
3118          if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3119          code = next_code + 1 + LINK_SIZE;
3120          next_code += GET(next_code, 1);
3121          }
3122        continue;
3123    
3124        case OP_BRAZERO:
3125        case OP_BRAMINZERO:
3126    
3127        next_code = code + 1;
3128        if (*next_code != OP_BRA && *next_code != OP_CBRA)
3129          return FALSE;
3130    
3131        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3132    
3133        /* We do not support repeated brackets, because they can lead to
3134        infinite recursion. */
3135        if (*next_code != OP_KET) return FALSE;
3136    
3137        /* The bracket content will be checked by the
3138        OP_BRA/OP_CBRA case above. */
3139        next_code += 1 + LINK_SIZE;
3140        if (!compare_opcodes(next_code, utf, cd, base_list, base_end)) return FALSE;
3141    
3142      code += PRIV(OP_lengths)[c];      code += PRIV(OP_lengths)[c];
3143      continue;      continue;
3144      }      }
# Line 3011  for(;;) Line 3212  for(;;)
3212            /* This code is logically tricky. Think hard before fiddling with it.            /* This code is logically tricky. Think hard before fiddling with it.
3213            The posspropstab table has four entries per row. Each row relates to            The posspropstab table has four entries per row. Each row relates to
3214            one of PCRE's special properties such as ALNUM or SPACE or WORD.            one of PCRE's special properties such as ALNUM or SPACE or WORD.
3215            Only WORD actually needs all four entries, but using repeats for the            Only WORD actually needs all four entries, but using repeats for the
3216            others means they can all use the same code below.            others means they can all use the same code below.
3217    
3218            The first two entries in each row are Unicode general categories, and            The first two entries in each row are Unicode general categories, and
3219            apply always, because all the characters they include are part of the            apply always, because all the characters they include are part of the
3220            PCRE character set. The third and fourth entries are a general and a            PCRE character set. The third and fourth entries are a general and a
# Line 3023  for(;;) Line 3224  for(;;)
3224            category contains more characters than the specials that are defined            category contains more characters than the specials that are defined
3225            for the property being tested against. Therefore, it cannot be used            for the property being tested against. Therefore, it cannot be used
3226            in a NOTPROP case.            in a NOTPROP case.
3227    
3228            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3229            Underscore is covered by ucp_P or ucp_Po. */            Underscore is covered by ucp_P or ucp_Po. */
3230    
# Line 3193  for(;;) Line 3394  for(;;)
3394          return FALSE;          return FALSE;
3395        break;        break;
3396    
       /* The class comparisons work only when the class is the second item  
       of the pair, because there are at present no possessive forms of the  
       class opcodes. Note also that the "code" variable that is used below  
       points after the second item, and that the pointer for the first item  
       is not available, so even if there were possessive forms of the class  
       opcodes, the correct comparison could not be done. */  
   
3397        case OP_NCLASS:        case OP_NCLASS:
3398        if (chr > 255) return FALSE;        if (chr > 255) return FALSE;
3399        /* Fall through */        /* Fall through */
3400    
3401        case OP_CLASS:        case OP_CLASS:
       if (list_ptr != list) return FALSE;   /* Class is first opcode */  
3402        if (chr > 255) break;        if (chr > 255) break;
3403        if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)        class_bits = (pcre_uint8 *)((list_ptr == list ? code : base_end) - list_ptr[2]);
3404          if ((class_bits[chr >> 3] & (1 << (chr & 7))) != 0)
3405          return FALSE;          return FALSE;
3406        break;        break;
3407    
3408  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3409        case OP_XCLASS:        case OP_XCLASS:
3410        if (list_ptr != list) return FALSE;   /* Class is first opcode */        if (list_ptr != list) return FALSE;   /* Class is first opcode */
3411        if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))        if (PRIV(xclass)(chr, code - list_ptr[2] + LINK_SIZE, utf))
3412          return FALSE;          return FALSE;
3413        break;        break;
3414  #endif  #endif
# Line 3255  Returns:      nothing Line 3449  Returns:      nothing
3449  static void  static void
3450  auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)  auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3451  {  {
3452  register pcre_uchar c;  register pcre_uchar c, d;
3453  const pcre_uchar *end;  const pcre_uchar *end;
3454    pcre_uchar *repeat_code;
3455  pcre_uint32 list[8];  pcre_uint32 list[8];
3456    
3457  for (;;)  for (;;)
# Line 3270  for (;;) Line 3465  for (;;)
3465        get_chr_property_list(code, utf, cd->fcc, list) : NULL;        get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3466      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3467    
3468      if (end != NULL && compare_opcodes(end, utf, cd, list))      if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3469        {        {
3470        switch(c)        switch(c)
3471          {          {
# Line 3309  for (;;) Line 3504  for (;;)
3504        }        }
3505      c = *code;      c = *code;
3506      }      }
3507      else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3508        {
3509    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3510        if (c == OP_XCLASS)
3511          repeat_code = code + 1 + GET(code, 1);
3512        else
3513    #endif
3514          repeat_code = code + 1 + (32 / sizeof(pcre_uchar));
3515    
3516        d = *repeat_code;
3517        if (d >= OP_CRSTAR && d <= OP_CRMINRANGE)
3518          {
3519          /* end must not be NULL. */
3520          end = get_chr_property_list(code, utf, cd->fcc, list);
3521    
3522          list[1] = d == OP_CRSTAR || d == OP_CRPLUS || d == OP_CRQUERY ||
3523            d == OP_CRRANGE;
3524    
3525          if (compare_opcodes(end, utf, cd, list, end))
3526            {
3527            switch (d)
3528              {
3529              case OP_CRSTAR:
3530              *repeat_code = OP_CRPOSSTAR;
3531              break;
3532    
3533              case OP_CRPLUS:
3534              *repeat_code = OP_CRPOSPLUS;
3535              break;
3536    
3537              case OP_CRQUERY:
3538              *repeat_code = OP_CRPOSQUERY;
3539              break;
3540    
3541              case OP_CRRANGE:
3542              *repeat_code = OP_CRPOSRANGE;
3543              break;
3544              }
3545            }
3546          }
3547        }
3548    
3549    switch(c)    switch(c)
3550      {      {
# Line 3335  for (;;) Line 3571  for (;;)
3571        code += 2;        code += 2;
3572      break;      break;
3573    
3574    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3575      case OP_XCLASS:      case OP_XCLASS:
3576      code += GET(code, 1);      code += GET(code, 1);
3577      break;      break;
3578    #endif
3579    
3580      case OP_MARK:      case OP_MARK:
3581      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
# Line 4201  for (;; ptr++) Line 4439  for (;; ptr++)
4439        }        }
4440      }      }
4441    
   /* Fill in length of a previous callout, except when the next thing is  
   a quantifier. */  
   
4442    is_quantifier =    is_quantifier =
4443      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4444      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4445    
4446    if (!is_quantifier && previous_callout != NULL &&    /* Fill in length of a previous callout, except when the next thing is a
4447      quantifier or when processing a property substitution string in UCP mode. */
4448    
4449      if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4450         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
4451      {      {
4452      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
# Line 4239  for (;; ptr++) Line 4477  for (;; ptr++)
4477        }        }
4478      }      }
4479    
4480    /* No auto callout for quantifiers. */    /* No auto callout for quantifiers, or while processing property strings that
4481      are substituted for \w etc in UCP mode. */
4482    
4483    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4484      {      {
4485      previous_callout = code;      previous_callout = code;
4486      code = auto_callout(code, ptr, cd);      code = auto_callout(code, ptr, cd);
# Line 4627  for (;; ptr++) Line 4866  for (;; ptr++)
4866              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4867              continue;              continue;
4868    
4869              /* Perl 5.004 onwards omits VT from \s, but we must preserve it              /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4870              if it was previously set by something earlier in the character              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4871              class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and              previously set by something earlier in the character class.
4872              EBCDIC, so we lazily just adjust the appropriate bit. */              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4873                we could just adjust the appropriate bit. From PCRE 8.34 we no
4874                longer treat \s and \S specially. */
4875    
4876              case ESC_s:              case ESC_s:
4877              classbits[0] |= cbits[cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
             classbits[1] |= cbits[cbit_space+1] & ~0x08;  
             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];  
4878              continue;              continue;
4879    
4880              case ESC_S:              case ESC_S:
4881              should_flip_negation = TRUE;              should_flip_negation = TRUE;
4882              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */  
4883              continue;              continue;
4884    
4885              /* The rest apply in both UCP and non-UCP cases. */              /* The rest apply in both UCP and non-UCP cases. */
# Line 5730  for (;; ptr++) Line 5968  for (;; ptr++)
5968        goto FAILED;        goto FAILED;
5969        }        }
5970    
5971      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', possessive_quantifier is
5972      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,      TRUE. For some opcodes, there are special alternative opcodes for this
5973      there are special alternative opcodes for this case. For anything else, we      case. For anything else, we wrap the entire repeated item inside OP_ONCE
5974      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'      brackets. Logically, the '+' notation is just syntactic sugar, taken from
5975      notation is just syntactic sugar, taken from Sun's Java package, but the      Sun's Java package, but the special opcodes can optimize it.
     special opcodes can optimize it.  
5976    
5977      Some (but not all) possessively repeated subpatterns have already been      Some (but not all) possessively repeated subpatterns have already been
5978      completely handled in the code just above. For them, possessive_quantifier      completely handled in the code just above. For them, possessive_quantifier
5979      is always FALSE at this stage.      is always FALSE at this stage. Note that the repeated item starts at
5980        tempcode, not at previous, which might be the first part of a string whose
5981      Note that the repeated item starts at tempcode, not at previous, which      (former) last char we repeated. */
     might be the first part of a string whose (former) last char we repeated.  
   
     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But  
     an 'upto' may follow. We skip over an 'exact' item, and then test the  
     length of what remains before proceeding. */  
5982    
5983      if (possessive_quantifier)      if (possessive_quantifier)
5984        {        {
5985        int len;        int len;
5986    
5987        if (*tempcode == OP_TYPEEXACT)        /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
5988          However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
5989          {5,}, or {5,10}). We skip over an EXACT item; if the length of what
5990          remains is greater than zero, there's a further opcode that can be
5991          handled. If not, do nothing, leaving the EXACT alone. */
5992    
5993          switch(*tempcode)
5994            {
5995            case OP_TYPEEXACT:
5996          tempcode += PRIV(OP_lengths)[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
5997            ((tempcode[1 + IMM2_SIZE] == OP_PROP            ((tempcode[1 + IMM2_SIZE] == OP_PROP
5998            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5999            break;
6000    
6001        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)          /* CHAR opcodes are used for exacts whose count is 1. */
6002          {  
6003            case OP_CHAR:
6004            case OP_CHARI:
6005            case OP_NOT:
6006            case OP_NOTI:
6007            case OP_EXACT:
6008            case OP_EXACTI:
6009            case OP_NOTEXACT:
6010            case OP_NOTEXACTI:
6011          tempcode += PRIV(OP_lengths)[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
6012  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6013          if (utf && HAS_EXTRALEN(tempcode[-1]))          if (utf && HAS_EXTRALEN(tempcode[-1]))
6014            tempcode += GET_EXTRALEN(tempcode[-1]);            tempcode += GET_EXTRALEN(tempcode[-1]);
6015  #endif  #endif
6016            break;
6017    
6018            /* For the class opcodes, the repeat operator appears at the end;
6019            adjust tempcode to point to it. */
6020    
6021            case OP_CLASS:
6022            case OP_NCLASS:
6023            tempcode += 1 + 32/sizeof(pcre_uchar);
6024            break;
6025    
6026    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6027            case OP_XCLASS:
6028            tempcode += GET(tempcode, 1);
6029            break;
6030    #endif
6031          }          }
6032    
6033          /* If tempcode is equal to code (which points to the end of the repeated
6034          item), it means we have skipped an EXACT item but there is no following
6035          QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6036          all other cases, tempcode will be pointing to the repeat opcode, and will
6037          be less than code, so the value of len will be greater than 0. */
6038    
6039        len = (int)(code - tempcode);        len = (int)(code - tempcode);
6040          if (len > 0)
6041            {
6042            unsigned int repcode = *tempcode;
6043    
6044            /* There is a table for possessifying opcodes, all of which are less
6045            than OP_CALLOUT. A zero entry means there is no possessified version.
6046            */
6047    
6048            if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6049              *tempcode = opcode_possessify[repcode];
6050    
6051            /* For an opcode without a special possessified version, wrap the item in
6052            ONCE brackets. Because we are moving code along, we must ensure that any
6053            pending recursive references are updated. */
6054    
6055            else
6056              {
6057              *code = OP_END;
6058              adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6059              memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6060              code += 1 + LINK_SIZE;
6061              len += 1 + LINK_SIZE;
6062              tempcode[0] = OP_ONCE;
6063              *code++ = OP_KET;
6064              PUTINC(code, 0, len);
6065              PUT(tempcode, 1, len);
6066              }
6067            }
6068    
6069    #ifdef NEVER
6070        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
6071          {          {
6072          case OP_STAR:  *tempcode = OP_POSSTAR; break;          case OP_STAR:  *tempcode = OP_POSSTAR; break;
# Line 5794  for (;; ptr++) Line 6094  for (;; ptr++)
6094          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6095          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6096    
6097            case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6098            case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6099            case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6100            case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6101    
6102          /* Because we are moving code along, we must ensure that any          /* Because we are moving code along, we must ensure that any
6103          pending recursive references are updated. */          pending recursive references are updated. */
6104    
# Line 5809  for (;; ptr++) Line 6114  for (;; ptr++)
6114          PUT(tempcode, 1, len);          PUT(tempcode, 1, len);
6115          break;          break;
6116          }          }
6117    #endif
6118        }        }
6119    
6120      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 6031  for (;; ptr++) Line 6337  for (;; ptr++)
6337                 tempptr[2] == CHAR_LESS_THAN_SIGN))                 tempptr[2] == CHAR_LESS_THAN_SIGN))
6338            break;            break;
6339    
6340          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6341          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6342    
6343          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
6344          skipbytes = 1+IMM2_SIZE;          skipbytes = 1+IMM2_SIZE;
# Line 6048  for (;; ptr++) Line 6354  for (;; ptr++)
6354            }            }
6355    
6356          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6357          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6358            syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6359            consist entirely of digits, there is scope for ambiguity. */
6360    
6361          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6362            {            {
# Line 6066  for (;; ptr++) Line 6374  for (;; ptr++)
6374            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6375            }            }
6376    
6377          /* We now expect to read a name; any thing else is an error */          /* When a name is one of a number of duplicates, a different opcode is
6378            used and it needs more memory. Unfortunately we cannot tell whether a
6379            name is a duplicate in the first pass, so we have to allow for more
6380            memory except when we know it is a relative numerical reference. */
6381    
6382            if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6383    
6384          /* We now expect to read a name (possibly all digits); anything else
6385            is an error. In the case of all digits, also get it as a number. */
6386    
6387          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6388            {            {
# Line 6075  for (;; ptr++) Line 6391  for (;; ptr++)
6391            goto FAILED;            goto FAILED;
6392            }            }
6393    
         /* Read the name, but also get it as a number if it's all digits */  
   
6394          recno = 0;          recno = 0;
6395          name = ++ptr;          name = ++ptr;
6396          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
# Line 6087  for (;; ptr++) Line 6401  for (;; ptr++)
6401            }            }
6402          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6403    
6404            /* Check the terminator */
6405    
6406          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6407              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6408            {            {
# Line 6122  for (;; ptr++) Line 6438  for (;; ptr++)
6438            }            }
6439    
6440          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
6441          name. If we find a name, add one to the opcode to change OP_CREF or          name. */
         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,  
         except they record that the reference was originally to a name. The  
         information is used to check duplicate names. */  
6442    
6443          slot = cd->name_table;          slot = cd->name_table;
6444          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 6134  for (;; ptr++) Line 6447  for (;; ptr++)
6447            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6448            }            }
6449    
6450          /* Found the named subpattern */          /* Found the named subpattern. If the name is duplicated, add one to
6451            the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6452            appropriate data values. Otherwise, just insert the unique subpattern
6453            number. */
6454    
6455          if (i < cd->names_found)          if (i < cd->names_found)
6456            {            {
6457            recno = GET2(slot, 0);            int offset = i++;
6458            PUT2(code, 2+LINK_SIZE, recno);            int count = 1;
6459            code[1+LINK_SIZE]++;            recno = GET2(slot, 0);   /* Number from first found */
6460              for (; i < cd->names_found; i++)
6461                {
6462                slot += cd->name_entry_size;
6463                if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6464                count++;
6465                }
6466              if (count > 1)
6467                {
6468                PUT2(code, 2+LINK_SIZE, offset);
6469                PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6470                skipbytes += IMM2_SIZE;
6471                code[1+LINK_SIZE]++;
6472                }
6473              else  /* Not a duplicated name */
6474                {
6475                PUT2(code, 2+LINK_SIZE, recno);
6476                }
6477            }            }
6478    
6479          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
# Line 7830  do { Line 8163  do {
8163       switch (*scode)       switch (*scode)
8164         {         {
8165         case OP_CREF:         case OP_CREF:
8166         case OP_NCREF:         case OP_DNCREF:
8167         case OP_RREF:         case OP_RREF:
8168         case OP_NRREF:         case OP_DNRREF:
8169         case OP_DEF:         case OP_DEF:
8170         return FALSE;         return FALSE;
8171    
# Line 8837  return (pcre32 *)re; Line 9170  return (pcre32 *)re;
9170  }  }
9171    
9172  /* End of pcre_compile.c */  /* End of pcre_compile.c */
9173    

Legend:
Removed from v.1363  
changed lines
  Added in v.1380

  ViewVC Help
Powered by ViewVC 1.1.5