/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 1365 by ph10, Sun Oct 6 18:33:56 2013 UTC revision 1384 by zherczeg, Fri Oct 25 17:37:50 2013 UTC
# Line 462  static const char error_texts[] = Line 462  static const char error_texts[] =
462    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
463    "this version of PCRE is compiled without UTF support\0"    "this version of PCRE is compiled without UTF support\0"
464    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
465    "character value in \\x{...} sequence is too large\0"    "character value in \\x{} or \\o{} is too large\0"
466    /* 35 */    /* 35 */
467    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
468    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
# Line 516  static const char error_texts[] = Line 516  static const char error_texts[] =
516    "character value in \\u.... sequence is too large\0"    "character value in \\u.... sequence is too large\0"
517    "invalid UTF-32 string\0"    "invalid UTF-32 string\0"
518    "setting UTF is disabled by the application\0"    "setting UTF is disabled by the application\0"
519      "non-hex character in \\x{} (closing brace missing?)\0"
520      /* 80 */
521      "non-octal character in \\o{} (closing brace missing?)\0"
522      "missing opening brace after \\o\0"
523    ;    ;
524    
525  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 773  static const pcre_uint8 posspropstab[3][ Line 777  static const pcre_uint8 posspropstab[3][
777    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
778  };  };
779    
780    /* This table is used when converting repeating opcodes into possessified
781    versions as a result of an explicit possessive quantifier such as ++. A zero
782    value means there is no possessified version - in those cases the item in
783    question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
784    because all relevant opcodes are less than that. */
785    
786    static const pcre_uint8 opcode_possessify[] = {
787      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
788      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
789    
790      0,                       /* NOTI */
791      OP_POSSTAR, 0,           /* STAR, MINSTAR */
792      OP_POSPLUS, 0,           /* PLUS, MINPLUS */
793      OP_POSQUERY, 0,          /* QUERY, MINQUERY */
794      OP_POSUPTO, 0,           /* UPTO, MINUPTO */
795      0,                       /* EXACT */
796      0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
797    
798      OP_POSSTARI, 0,          /* STARI, MINSTARI */
799      OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
800      OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
801      OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
802      0,                       /* EXACTI */
803      0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
804    
805      OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
806      OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
807      OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
808      OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
809      0,                       /* NOTEXACT */
810      0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
811    
812      OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
813      OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
814      OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
815      OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
816      0,                       /* NOTEXACTI */
817      0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
818    
819      OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
820      OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
821      OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
822      OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
823      0,                       /* TYPEEXACT */
824      0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
825    
826      OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
827      OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
828      OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
829      OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
830      0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
831    
832      0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
833      0, 0,                    /* REF, REFI */
834      0, 0,                    /* DNREF, DNREFI */
835      0, 0                     /* RECURSE, CALLOUT */
836    };
837    
838    
839    
840  /*************************************************  /*************************************************
# Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 941  return (*p == CHAR_RIGHT_CURLY_BRACKET);
941  *************************************************/  *************************************************/
942    
943  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
944  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
945  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
946  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
947  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
948  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
949    
950  Arguments:  Arguments:
951    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
952    chptr          points to the data character    chptr          points to a returned data character
953    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
954    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
955    options        the options bits    options        the options bits
# Line 1092  else Line 1153  else
1153      break;      break;
1154    
1155      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1156      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1157      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1158        recommended to avoid the ambiguities in the old syntax.
1159    
1160      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1161      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1162      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1163      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1164      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1165      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1166      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1167    
1168        Inside a character class, \ followed by a digit is always either a literal
1169        8 or 9 or an octal number. */
1170    
1171      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1172      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1128  else Line 1193  else
1193          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1194          break;          break;
1195          }          }
1196        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1197          {          {
1198          escape = -s;          escape = -s;
1199          break;          break;
# Line 1136  else Line 1201  else
1201        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1202        }        }
1203    
1204      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1205      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1206      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1207        changed so as not to insert the binary zero. */
1208    
1209      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1210        {  
1211        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1212    
1213      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1214      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1162  else Line 1225  else
1225  #endif  #endif
1226      break;      break;
1227    
1228      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1229      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.      specifying character codes in octal. The only supported form is \o{ddd}. */
1230      If not, { is treated as a data character. */  
1231        case CHAR_o:
1232        if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1233          {
1234          ptr += 2;
1235          c = 0;
1236          overflow = FALSE;
1237          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1238            {
1239            register pcre_uint32 cc = *ptr++;
1240            if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1241    #ifdef COMPILE_PCRE32
1242            if (c >= 0x20000000l) { overflow = TRUE; break; }
1243    #endif
1244            c = (c << 3) + cc - CHAR_0 ;
1245    #if defined COMPILE_PCRE8
1246            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1247    #elif defined COMPILE_PCRE16
1248            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1249    #elif defined COMPILE_PCRE32
1250            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1251    #endif
1252            }
1253          if (overflow)
1254            {
1255            while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1256            *errorcodeptr = ERR34;
1257            }
1258          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1259            {
1260            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1261            }
1262          else *errorcodeptr = ERR80;
1263          }
1264        break;
1265    
1266        /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1267        numbers. Otherwise it is a lowercase x letter. */
1268    
1269      case CHAR_x:      case CHAR_x:
1270      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1271        {        {
       /* In JavaScript, \x must be followed by two hexadecimal numbers.  
       Otherwise it is a lowercase x letter. */  
1272        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1273          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1274          {          {
# Line 1187  else Line 1285  else
1285  #endif  #endif
1286            }            }
1287          }          }
1288        break;        }    /* End JavaScript handling */
       }  
1289    
1290      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1291        {      greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1292        const pcre_uchar *pt = ptr + 2;      digits. If not, { used to be treated as a data character. However, Perl
1293        seems to read hex digits up to the first non-such, and ignore the rest, so
1294        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1295        now gives an error. */
1296    
1297        c = 0;      else
1298        overflow = FALSE;        {
1299        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1300          {          {
1301          register pcre_uint32 cc = *pt++;          ptr += 2;
1302          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          c = 0;
1303            overflow = FALSE;
1304            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1305              {
1306              register pcre_uint32 cc = *ptr++;
1307              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1308    
1309  #ifdef COMPILE_PCRE32  #ifdef COMPILE_PCRE32
1310          if (c >= 0x10000000l) { overflow = TRUE; break; }            if (c >= 0x10000000l) { overflow = TRUE; break; }
1311  #endif  #endif
1312    
1313  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1314          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1315          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1316  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1317          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1318          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1319  #endif  #endif
1320    
1321  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
1322          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1323  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
1324          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1325  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
1326          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1327  #endif  #endif
1328          }            }
1329    
1330        if (overflow)          if (overflow)
1331          {            {
1332          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1333          *errorcodeptr = ERR34;            *errorcodeptr = ERR34;
1334          }            }
1335    
1336        if (*pt == CHAR_RIGHT_CURLY_BRACKET)          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1337          {            {
1338          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1339          ptr = pt;            }
         break;  
         }  
1340    
1341        /* If the sequence of hex digits does not end with '}', then we don't          /* If the sequence of hex digits does not end with '}', give an error.
1342        recognize this construct; fall through to the normal \x handling. */          We used just to recognize this construct and fall through to the normal
1343        }          \x handling, but nowadays Perl gives an error, which seems much more
1344            sensible, so we do too. */
1345    
1346      /* Read just a single-byte hex-defined char */          else *errorcodeptr = ERR79;
1347            }   /* End of \x{} processing */
1348    
1349      c = 0;        /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1350      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)  
1351        {        else
1352        pcre_uint32 cc;                          /* Some compilers don't like */          {
1353        cc = *(++ptr);                           /* ++ in initializers */          c = 0;
1354            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1355              {
1356              pcre_uint32 cc;                          /* Some compilers don't like */
1357              cc = *(++ptr);                           /* ++ in initializers */
1358  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1359        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1360        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1361  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1362        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */            if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1363        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1364  #endif  #endif
1365        }            }
1366            }     /* End of \xdd handling */
1367          }       /* End of Perl-style \x handling */
1368      break;      break;
1369    
1370      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
# Line 1764  for (;;) Line 1875  for (;;)
1875    
1876      switch (*cc)      switch (*cc)
1877        {        {
       case OP_CRPLUS:  
       case OP_CRMINPLUS:  
1878        case OP_CRSTAR:        case OP_CRSTAR:
1879        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1880          case OP_CRPLUS:
1881          case OP_CRMINPLUS:
1882        case OP_CRQUERY:        case OP_CRQUERY:
1883        case OP_CRMINQUERY:        case OP_CRMINQUERY:
1884          case OP_CRPOSSTAR:
1885          case OP_CRPOSPLUS:
1886          case OP_CRPOSQUERY:
1887        return -1;        return -1;
1888    
1889        case OP_CRRANGE:        case OP_CRRANGE:
1890        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1891          case OP_CRPOSRANGE:
1892        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1893        branchlength += (int)GET2(cc,1);        branchlength += (int)GET2(cc,1);
1894        cc += 1 + 2 * IMM2_SIZE;        cc += 1 + 2 * IMM2_SIZE;
# Line 2366  for (code = first_significant_code(code Line 2481  for (code = first_significant_code(code
2481        case OP_CRMINSTAR:        case OP_CRMINSTAR:
2482        case OP_CRQUERY:        case OP_CRQUERY:
2483        case OP_CRMINQUERY:        case OP_CRMINQUERY:
2484          case OP_CRPOSSTAR:
2485          case OP_CRPOSQUERY:
2486        break;        break;
2487    
2488        default:                   /* Non-repeat => class must match */        default:                   /* Non-repeat => class must match */
2489        case OP_CRPLUS:            /* These repeats aren't empty */        case OP_CRPLUS:            /* These repeats aren't empty */
2490        case OP_CRMINPLUS:        case OP_CRMINPLUS:
2491          case OP_CRPOSPLUS:
2492        return FALSE;        return FALSE;
2493    
2494        case OP_CRRANGE:        case OP_CRRANGE:
2495        case OP_CRMINRANGE:        case OP_CRMINRANGE:
2496          case OP_CRPOSRANGE:
2497        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2498        break;        break;
2499        }        }
# Line 2653  switch(ptype) Line 2772  switch(ptype)
2772    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2773    means that Perl space and POSIX space are now identical. PCRE was changed    means that Perl space and POSIX space are now identical. PCRE was changed
2774    at release 8.34. */    at release 8.34. */
2775    
2776    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2777    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2778    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    switch(c)
2779            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||      {
2780            c == CHAR_FF || c == CHAR_CR)      HSPACE_CASES:
2781            == negated;      VSPACE_CASES:
2782        return negated;
2783    
2784        default:
2785        return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2786        }
2787      break;  /* Control never reaches here */
2788    
2789    case PT_WORD:    case PT_WORD:
2790    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
# Line 2818  switch(c) Line 2943  switch(c)
2943      return code + 2;      return code + 2;
2944      }      }
2945    
2946    /* Convert only if we have anough space. */    /* Convert only if we have enough space. */
2947    
2948    clist_src = PRIV(ucd_caseless_sets) + code[1];    clist_src = PRIV(ucd_caseless_sets) + code[1];
2949    clist_dest = list + 2;    clist_dest = list + 2;
2950    code += 2;    code += 2;
2951    
2952    do {    do {
      /* Early return if there is not enough space. */  
2953       if (clist_dest >= list + 8)       if (clist_dest >= list + 8)
2954         {         {
2955           /* Early return if there is not enough space. This should never
2956           happen, since all clists are shorter than 5 characters now. */
2957         list[2] = code[0];         list[2] = code[0];
2958         list[3] = code[1];         list[3] = code[1];
2959         return code;         return code;
2960         }         }
2961       *clist_dest++ = *clist_src;       *clist_dest++ = *clist_src;
2962       }       }
2963     while(*clist_src++ != NOTACHAR);    while(*clist_src++ != NOTACHAR);
2964    
2965    /* Enough space to store all characters. */    /* All characters are stored. The terminating NOTACHAR
2966      is copied from the clist itself. */
2967    
2968    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2969    return code;    return code;
# Line 2848  switch(c) Line 2975  switch(c)
2975    case OP_XCLASS:    case OP_XCLASS:
2976    
2977    if (c == OP_XCLASS)    if (c == OP_XCLASS)
2978      end = code + GET(code, 0);      end = code + GET(code, 0) - 1;
2979    else    else
2980  #endif  #endif
2981      end = code + 32 / sizeof(pcre_uchar);      end = code + 32 / sizeof(pcre_uchar);
# Line 2859  switch(c) Line 2986  switch(c)
2986      case OP_CRMINSTAR:      case OP_CRMINSTAR:
2987      case OP_CRQUERY:      case OP_CRQUERY:
2988      case OP_CRMINQUERY:      case OP_CRMINQUERY:
2989        case OP_CRPOSSTAR:
2990        case OP_CRPOSQUERY:
2991      list[1] = TRUE;      list[1] = TRUE;
2992      end++;      end++;
2993      break;      break;
2994    
2995        case OP_CRPLUS:
2996        case OP_CRMINPLUS:
2997        case OP_CRPOSPLUS:
2998        end++;
2999        break;
3000    
3001      case OP_CRRANGE:      case OP_CRRANGE:
3002      case OP_CRMINRANGE:      case OP_CRMINRANGE:
3003        case OP_CRPOSRANGE:
3004      list[1] = (GET2(end, 1) == 0);      list[1] = (GET2(end, 1) == 0);
3005      end += 1 + 2 * IMM2_SIZE;      end += 1 + 2 * IMM2_SIZE;
3006      break;      break;
# Line 2895  Returns:      TRUE if the auto-possessif Line 3031  Returns:      TRUE if the auto-possessif
3031    
3032  static BOOL  static BOOL
3033  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3034    const pcre_uint32* base_list)    const pcre_uint32 *base_list, const pcre_uchar *base_end)
3035  {  {
3036  pcre_uchar c;  pcre_uchar c;
3037  pcre_uint32 list[8];  pcre_uint32 list[8];
3038  const pcre_uint32* chr_ptr;  const pcre_uint32 *chr_ptr;
3039  const pcre_uint32* ochr_ptr;  const pcre_uint32 *ochr_ptr;
3040  const pcre_uint32* list_ptr;  const pcre_uint32 *list_ptr;
3041    const pcre_uchar *next_code;
3042    const pcre_uint8 *class_bitset;
3043    const pcre_uint32 *set1, *set2, *set_end;
3044  pcre_uint32 chr;  pcre_uint32 chr;
3045    BOOL accepted, invert_bits;
3046    
3047    /* Note: base_list[1] records whether the current opcode has a greedy
3048    quantifier (represented by a non-zero value). This is different from
3049    other character type lists, which store here whether the character iterator
3050    can match an empty string (also represented by a non-zero value). */
3051    
3052  for(;;)  for(;;)
3053    {    {
3054      /* All operations move the code pointer forward.
3055      Therefore infinite recursions are not possible. */
3056    
3057    c = *code;    c = *code;
3058    
3059    /* Skip over callouts */    /* Skip over callouts */
# Line 2925  for(;;) Line 3073  for(;;)
3073    switch(c)    switch(c)
3074      {      {
3075      case OP_END:      case OP_END:
3076      /* TRUE only in greedy case. The non-greedy case could be replaced by an      case OP_KETRPOS:
3077      OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses      /* TRUE only in greedy case. The non-greedy case could be replaced by
3078      more memory, which we cannot get at this stage.) */      an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3079        uses more memory, which we cannot get at this stage.) */
3080    
3081      return base_list[1] != 0;      return base_list[1] != 0;
3082    
3083      case OP_KET:      case OP_KET:
3084      /* If the bracket is capturing, and referenced by an OP_RECURSE, the      /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3085      non-greedy case cannot be converted to a possessive form. We do not test      it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3086      the bracket type at the moment, but we might do it in the future to improve      cannot be converted to a possessive form. */
     this condition. (But note that recursive calls are always atomic.) */  
3087    
3088      if (base_list[1] == 0) return FALSE;      if (base_list[1] == 0) return FALSE;
3089    
3090        switch(*(code - GET(code, 1)))
3091          {
3092          case OP_ASSERT:
3093          case OP_ASSERT_NOT:
3094          case OP_ASSERTBACK:
3095          case OP_ASSERTBACK_NOT:
3096          case OP_ONCE:
3097          case OP_ONCE_NC:
3098          /* Atomic sub-patterns and assertions can always auto-possessify their
3099          last iterator. */
3100          return TRUE;
3101          }
3102    
3103        code += PRIV(OP_lengths)[c];
3104        continue;
3105    
3106        case OP_ONCE:
3107        case OP_ONCE_NC:
3108        case OP_BRA:
3109        case OP_CBRA:
3110        next_code = code + GET(code, 1);
3111        code += PRIV(OP_lengths)[c];
3112    
3113        while (*next_code == OP_ALT)
3114          {
3115          if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3116          code = next_code + 1 + LINK_SIZE;
3117          next_code += GET(next_code, 1);
3118          }
3119        continue;
3120    
3121        case OP_BRAZERO:
3122        case OP_BRAMINZERO:
3123    
3124        next_code = code + 1;
3125        if (*next_code != OP_BRA && *next_code != OP_CBRA
3126            && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3127    
3128        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3129    
3130        /* The bracket content will be checked by the
3131        OP_BRA/OP_CBRA case above. */
3132        next_code += 1 + LINK_SIZE;
3133        if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3134          return FALSE;
3135    
3136      code += PRIV(OP_lengths)[c];      code += PRIV(OP_lengths)[c];
3137      continue;      continue;
3138      }      }
# Line 2961  for(;;) Line 3156  for(;;)
3156      list_ptr = base_list;      list_ptr = base_list;
3157      }      }
3158    
3159      /* Character bitsets can also be compared to certain opcodes. */
3160    
3161      else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3162    #ifdef COMPILE_PCRE8
3163          /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3164          || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3165    #endif
3166          )
3167        {
3168    #ifdef COMPILE_PCRE8
3169        if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3170    #else
3171        if (base_list[0] == OP_CLASS)
3172    #endif
3173          {
3174          set1 = (pcre_uint32 *)(base_end - base_list[2]);
3175          list_ptr = list;
3176          }
3177        else
3178          {
3179          set1 = (pcre_uint32 *)(code - list[2]);
3180          list_ptr = base_list;
3181          }
3182    
3183        invert_bits = FALSE;
3184        switch(list_ptr[0])
3185          {
3186          case OP_CLASS:
3187          case OP_NCLASS:
3188          set2 = (pcre_uint32 *)
3189            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3190          break;
3191    
3192          /* OP_XCLASS cannot be supported here, because its bitset
3193          is not necessarily complete. E.g. [a-\x{200}] is stored
3194          as a character range, and the appropriate bits are not set. */
3195    
3196          case OP_NOT_DIGIT:
3197            invert_bits = TRUE;
3198            /* Fall through */
3199          case OP_DIGIT:
3200            set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
3201            break;
3202    
3203          case OP_NOT_WHITESPACE:
3204            invert_bits = TRUE;
3205            /* Fall through */
3206          case OP_WHITESPACE:
3207            set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
3208            break;
3209    
3210          case OP_NOT_WORDCHAR:
3211            invert_bits = TRUE;
3212            /* Fall through */
3213          case OP_WORDCHAR:
3214            set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
3215            break;
3216    
3217          default:
3218          return FALSE;
3219          }
3220    
3221        /* Compare 4 bytes to improve speed. */
3222        set_end = set1 + (32 / 4);
3223        if (invert_bits)
3224          {
3225          do
3226            {
3227            if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3228            }
3229          while (set1 < set_end);
3230          }
3231        else
3232          {
3233          do
3234            {
3235            if ((*set1++ & *set2++) != 0) return FALSE;
3236            }
3237          while (set1 < set_end);
3238          }
3239    
3240        if (list[1] == 0) return TRUE;
3241        /* Might be an empty repeat. */
3242        continue;
3243        }
3244    
3245    /* Some property combinations are also acceptable. Unicode property opcodes are    /* Some property combinations are also acceptable. Unicode property opcodes are
3246    processed specially; the rest can be handled with a lookup table. */    processed specially; the rest can be handled with a lookup table. */
3247    
# Line 2968  for(;;) Line 3249  for(;;)
3249      {      {
3250      pcre_uint32 leftop, rightop;      pcre_uint32 leftop, rightop;
3251    
     if (list[1] != 0) return FALSE;   /* Must match at least one character */  
3252      leftop = base_list[0];      leftop = base_list[0];
3253      rightop = list[0];      rightop = list[0];
3254    
3255  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3256        accepted = FALSE; /* Always set in non-unicode case. */
3257      if (leftop == OP_PROP || leftop == OP_NOTPROP)      if (leftop == OP_PROP || leftop == OP_NOTPROP)
3258        {        {
3259        if (rightop == OP_EOD) return TRUE;        if (rightop == OP_EOD)
3260        if (rightop == OP_PROP || rightop == OP_NOTPROP)          accepted = TRUE;
3261          else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3262          {          {
3263          int n;          int n;
3264          const pcre_uint8 *p;          const pcre_uint8 *p;
# Line 2997  for(;;) Line 3279  for(;;)
3279          n = propposstab[base_list[2]][list[2]];          n = propposstab[base_list[2]][list[2]];
3280          switch(n)          switch(n)
3281            {            {
3282            case 0: return FALSE;            case 0: break;
3283            case 1: return bothprop;            case 1: accepted = bothprop; break;
3284            case 2: return (base_list[3] == list[3]) != same;            case 2: accepted = (base_list[3] == list[3]) != same; break;
3285            case 3: return !same;            case 3: accepted = !same; break;
3286    
3287            case 4:  /* Left general category, right particular category */            case 4:  /* Left general category, right particular category */
3288            return risprop && catposstab[base_list[3]][list[3]] == same;            accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3289              break;
3290    
3291            case 5:  /* Right general category, left particular category */            case 5:  /* Right general category, left particular category */
3292            return lisprop && catposstab[list[3]][base_list[3]] == same;            accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3293              break;
3294    
3295            /* This code is logically tricky. Think hard before fiddling with it.            /* This code is logically tricky. Think hard before fiddling with it.
3296            The posspropstab table has four entries per row. Each row relates to            The posspropstab table has four entries per row. Each row relates to
3297            one of PCRE's special properties such as ALNUM or SPACE or WORD.            one of PCRE's special properties such as ALNUM or SPACE or WORD.
3298            Only WORD actually needs all four entries, but using repeats for the            Only WORD actually needs all four entries, but using repeats for the
3299            others means they can all use the same code below.            others means they can all use the same code below.
3300    
3301            The first two entries in each row are Unicode general categories, and            The first two entries in each row are Unicode general categories, and
3302            apply always, because all the characters they include are part of the            apply always, because all the characters they include are part of the
3303            PCRE character set. The third and fourth entries are a general and a            PCRE character set. The third and fourth entries are a general and a
# Line 3023  for(;;) Line 3307  for(;;)
3307            category contains more characters than the specials that are defined            category contains more characters than the specials that are defined
3308            for the property being tested against. Therefore, it cannot be used            for the property being tested against. Therefore, it cannot be used
3309            in a NOTPROP case.            in a NOTPROP case.
3310    
3311            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3312            Underscore is covered by ucp_P or ucp_Po. */            Underscore is covered by ucp_P or ucp_Po. */
3313    
# Line 3031  for(;;) Line 3315  for(;;)
3315            case 7:  /* Left space vs right general category */            case 7:  /* Left space vs right general category */
3316            case 8:  /* Left word vs right general category */            case 8:  /* Left word vs right general category */
3317            p = posspropstab[n-6];            p = posspropstab[n-6];
3318            return risprop && lisprop ==            accepted = risprop && lisprop ==
3319              (list[3] != p[0] &&              (list[3] != p[0] &&
3320               list[3] != p[1] &&               list[3] != p[1] &&
3321              (list[3] != p[2] || !lisprop));              (list[3] != p[2] || !lisprop));
3322              break;
3323    
3324            case 9:   /* Right alphanum vs left general category */            case 9:   /* Right alphanum vs left general category */
3325            case 10:  /* Right space vs left general category */            case 10:  /* Right space vs left general category */
3326            case 11:  /* Right word vs left general category */            case 11:  /* Right word vs left general category */
3327            p = posspropstab[n-9];            p = posspropstab[n-9];
3328            return lisprop && risprop ==            accepted = lisprop && risprop ==
3329              (base_list[3] != p[0] &&              (base_list[3] != p[0] &&
3330               base_list[3] != p[1] &&               base_list[3] != p[1] &&
3331              (base_list[3] != p[2] || !risprop));              (base_list[3] != p[2] || !risprop));
3332              break;
3333    
3334            case 12:  /* Left alphanum vs right particular category */            case 12:  /* Left alphanum vs right particular category */
3335            case 13:  /* Left space vs right particular category */            case 13:  /* Left space vs right particular category */
3336            case 14:  /* Left word vs right particular category */            case 14:  /* Left word vs right particular category */
3337            p = posspropstab[n-12];            p = posspropstab[n-12];
3338            return risprop && lisprop ==            accepted = risprop && lisprop ==
3339              (catposstab[p[0]][list[3]] &&              (catposstab[p[0]][list[3]] &&
3340               catposstab[p[1]][list[3]] &&               catposstab[p[1]][list[3]] &&
3341              (list[3] != p[3] || !lisprop));              (list[3] != p[3] || !lisprop));
3342              break;
3343    
3344            case 15:  /* Right alphanum vs left particular category */            case 15:  /* Right alphanum vs left particular category */
3345            case 16:  /* Right space vs left particular category */            case 16:  /* Right space vs left particular category */
3346            case 17:  /* Right word vs left particular category */            case 17:  /* Right word vs left particular category */
3347            p = posspropstab[n-15];            p = posspropstab[n-15];
3348            return lisprop && risprop ==            accepted = lisprop && risprop ==
3349              (catposstab[p[0]][base_list[3]] &&              (catposstab[p[0]][base_list[3]] &&
3350               catposstab[p[1]][base_list[3]] &&               catposstab[p[1]][base_list[3]] &&
3351              (base_list[3] != p[3] || !risprop));              (base_list[3] != p[3] || !risprop));
3352              break;
3353            }            }
3354          }          }
       return FALSE;  
3355        }        }
3356    
3357      else      else
3358  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
3359    
3360      return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&      accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3361             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3362             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3363    
3364        if (!accepted)
3365          return FALSE;
3366    
3367        if (list[1] == 0) return TRUE;
3368        /* Might be an empty repeat. */
3369        continue;
3370      }      }
3371    
3372    /* Control reaches here only if one of the items is a small character list.    /* Control reaches here only if one of the items is a small character list.
# Line 3186  for(;;) Line 3480  for(;;)
3480        case OP_EOD:    /* Can always possessify before \z */        case OP_EOD:    /* Can always possessify before \z */
3481        break;        break;
3482    
3483    #ifdef SUPPORT_UCP
3484        case OP_PROP:        case OP_PROP:
3485        case OP_NOTPROP:        case OP_NOTPROP:
3486        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3487              list_ptr[0] == OP_NOTPROP))              list_ptr[0] == OP_NOTPROP))
3488          return FALSE;          return FALSE;
3489        break;        break;
3490    #endif
       /* The class comparisons work only when the class is the second item  
       of the pair, because there are at present no possessive forms of the  
       class opcodes. Note also that the "code" variable that is used below  
       points after the second item, and that the pointer for the first item  
       is not available, so even if there were possessive forms of the class  
       opcodes, the correct comparison could not be done. */  
3491    
3492        case OP_NCLASS:        case OP_NCLASS:
3493        if (chr > 255) return FALSE;        if (chr > 255) return FALSE;
3494        /* Fall through */        /* Fall through */
3495    
3496        case OP_CLASS:        case OP_CLASS:
       if (list_ptr != list) return FALSE;   /* Class is first opcode */  
3497        if (chr > 255) break;        if (chr > 255) break;
3498        if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)        class_bitset = (pcre_uint8 *)
3499          return FALSE;          ((list_ptr == list ? code : base_end) - list_ptr[2]);
3500          if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3501        break;        break;
3502    
3503  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3504        case OP_XCLASS:        case OP_XCLASS:
3505        if (list_ptr != list) return FALSE;   /* Class is first opcode */        if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3506        if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))            list_ptr[2] + LINK_SIZE, utf)) return FALSE;
         return FALSE;  
3507        break;        break;
3508  #endif  #endif
3509    
# Line 3257  auto_possessify(pcre_uchar *code, BOOL u Line 3545  auto_possessify(pcre_uchar *code, BOOL u
3545  {  {
3546  register pcre_uchar c;  register pcre_uchar c;
3547  const pcre_uchar *end;  const pcre_uchar *end;
3548    pcre_uchar *repeat_opcode;
3549  pcre_uint32 list[8];  pcre_uint32 list[8];
3550    
3551  for (;;)  for (;;)
# Line 3270  for (;;) Line 3559  for (;;)
3559        get_chr_property_list(code, utf, cd->fcc, list) : NULL;        get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3560      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3561    
3562      if (end != NULL && compare_opcodes(end, utf, cd, list))      if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3563        {        {
3564        switch(c)        switch(c)
3565          {          {
# Line 3309  for (;;) Line 3598  for (;;)
3598        }        }
3599      c = *code;      c = *code;
3600      }      }
3601      else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3602        {
3603    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3604        if (c == OP_XCLASS)
3605          repeat_opcode = code + GET(code, 1);
3606        else
3607    #endif
3608          repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3609    
3610        c = *repeat_opcode;
3611        if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3612          {
3613          /* end must not be NULL. */
3614          end = get_chr_property_list(code, utf, cd->fcc, list);
3615    
3616          list[1] = (c & 1) == 0;
3617    
3618          if (compare_opcodes(end, utf, cd, list, end))
3619            {
3620            switch (c)
3621              {
3622              case OP_CRSTAR:
3623              case OP_CRMINSTAR:
3624              *repeat_opcode = OP_CRPOSSTAR;
3625              break;
3626    
3627              case OP_CRPLUS:
3628              case OP_CRMINPLUS:
3629              *repeat_opcode = OP_CRPOSPLUS;
3630              break;
3631    
3632              case OP_CRQUERY:
3633              case OP_CRMINQUERY:
3634              *repeat_opcode = OP_CRPOSQUERY;
3635              break;
3636    
3637              case OP_CRRANGE:
3638              case OP_CRMINRANGE:
3639              *repeat_opcode = OP_CRPOSRANGE;
3640              break;
3641              }
3642            }
3643          }
3644        c = *code;
3645        }
3646    
3647    switch(c)    switch(c)
3648      {      {
# Line 3335  for (;;) Line 3669  for (;;)
3669        code += 2;        code += 2;
3670      break;      break;
3671    
3672    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3673      case OP_XCLASS:      case OP_XCLASS:
3674      code += GET(code, 1);      code += GET(code, 1);
3675      break;      break;
3676    #endif
3677    
3678      case OP_MARK:      case OP_MARK:
3679      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
# Line 4201  for (;; ptr++) Line 4537  for (;; ptr++)
4537        }        }
4538      }      }
4539    
   /* Fill in length of a previous callout, except when the next thing is  
   a quantifier. */  
   
4540    is_quantifier =    is_quantifier =
4541      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4542      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4543    
4544    if (!is_quantifier && previous_callout != NULL &&    /* Fill in length of a previous callout, except when the next thing is a
4545      quantifier or when processing a property substitution string in UCP mode. */
4546    
4547      if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4548         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
4549      {      {
4550      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
# Line 4239  for (;; ptr++) Line 4575  for (;; ptr++)
4575        }        }
4576      }      }
4577    
4578    /* No auto callout for quantifiers. */    /* No auto callout for quantifiers, or while processing property strings that
4579      are substituted for \w etc in UCP mode. */
4580    
4581    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4582      {      {
4583      previous_callout = code;      previous_callout = code;
4584      code = auto_callout(code, ptr, cd);      code = auto_callout(code, ptr, cd);
# Line 4631  for (;; ptr++) Line 4968  for (;; ptr++)
4968              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4969              previously set by something earlier in the character class.              previously set by something earlier in the character class.
4970              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4971              we could just adjust the appropriate bit. From PCRE 8.34 we no              we could just adjust the appropriate bit. From PCRE 8.34 we no
4972              longer treat \s and \S specially. */              longer treat \s and \S specially. */
4973    
4974              case ESC_s:              case ESC_s:
# Line 5729  for (;; ptr++) Line 6066  for (;; ptr++)
6066        goto FAILED;        goto FAILED;
6067        }        }
6068    
6069      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', possessive_quantifier is
6070      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,      TRUE. For some opcodes, there are special alternative opcodes for this
6071      there are special alternative opcodes for this case. For anything else, we      case. For anything else, we wrap the entire repeated item inside OP_ONCE
6072      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'      brackets. Logically, the '+' notation is just syntactic sugar, taken from
6073      notation is just syntactic sugar, taken from Sun's Java package, but the      Sun's Java package, but the special opcodes can optimize it.
     special opcodes can optimize it.  
6074    
6075      Some (but not all) possessively repeated subpatterns have already been      Some (but not all) possessively repeated subpatterns have already been
6076      completely handled in the code just above. For them, possessive_quantifier      completely handled in the code just above. For them, possessive_quantifier
6077      is always FALSE at this stage.      is always FALSE at this stage. Note that the repeated item starts at
6078        tempcode, not at previous, which might be the first part of a string whose
6079      Note that the repeated item starts at tempcode, not at previous, which      (former) last char we repeated. */
     might be the first part of a string whose (former) last char we repeated.  
   
     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But  
     an 'upto' may follow. We skip over an 'exact' item, and then test the  
     length of what remains before proceeding. */  
6080    
6081      if (possessive_quantifier)      if (possessive_quantifier)
6082        {        {
6083        int len;        int len;
6084    
6085        if (*tempcode == OP_TYPEEXACT)        /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6086          However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6087          {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6088          remains is greater than zero, there's a further opcode that can be
6089          handled. If not, do nothing, leaving the EXACT alone. */
6090    
6091          switch(*tempcode)
6092            {
6093            case OP_TYPEEXACT:
6094          tempcode += PRIV(OP_lengths)[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
6095            ((tempcode[1 + IMM2_SIZE] == OP_PROP            ((tempcode[1 + IMM2_SIZE] == OP_PROP
6096            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6097            break;
6098    
6099        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)          /* CHAR opcodes are used for exacts whose count is 1. */
6100          {  
6101            case OP_CHAR:
6102            case OP_CHARI:
6103            case OP_NOT:
6104            case OP_NOTI:
6105            case OP_EXACT:
6106            case OP_EXACTI:
6107            case OP_NOTEXACT:
6108            case OP_NOTEXACTI:
6109          tempcode += PRIV(OP_lengths)[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
6110  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6111          if (utf && HAS_EXTRALEN(tempcode[-1]))          if (utf && HAS_EXTRALEN(tempcode[-1]))
6112            tempcode += GET_EXTRALEN(tempcode[-1]);            tempcode += GET_EXTRALEN(tempcode[-1]);
6113  #endif  #endif
6114            break;
6115    
6116            /* For the class opcodes, the repeat operator appears at the end;
6117            adjust tempcode to point to it. */
6118    
6119            case OP_CLASS:
6120            case OP_NCLASS:
6121            tempcode += 1 + 32/sizeof(pcre_uchar);
6122            break;
6123    
6124    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6125            case OP_XCLASS:
6126            tempcode += GET(tempcode, 1);
6127            break;
6128    #endif
6129          }          }
6130    
6131          /* If tempcode is equal to code (which points to the end of the repeated
6132          item), it means we have skipped an EXACT item but there is no following
6133          QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6134          all other cases, tempcode will be pointing to the repeat opcode, and will
6135          be less than code, so the value of len will be greater than 0. */
6136    
6137        len = (int)(code - tempcode);        len = (int)(code - tempcode);
6138          if (len > 0)
6139            {
6140            unsigned int repcode = *tempcode;
6141    
6142            /* There is a table for possessifying opcodes, all of which are less
6143            than OP_CALLOUT. A zero entry means there is no possessified version.
6144            */
6145    
6146            if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6147              *tempcode = opcode_possessify[repcode];
6148    
6149            /* For opcode without a special possessified version, wrap the item in
6150            ONCE brackets. Because we are moving code along, we must ensure that any
6151            pending recursive references are updated. */
6152    
6153            else
6154              {
6155              *code = OP_END;
6156              adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6157              memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6158              code += 1 + LINK_SIZE;
6159              len += 1 + LINK_SIZE;
6160              tempcode[0] = OP_ONCE;
6161              *code++ = OP_KET;
6162              PUTINC(code, 0, len);
6163              PUT(tempcode, 1, len);
6164              }
6165            }
6166    
6167    #ifdef NEVER
6168        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
6169          {          {
6170          case OP_STAR:  *tempcode = OP_POSSTAR; break;          case OP_STAR:  *tempcode = OP_POSSTAR; break;
# Line 5793  for (;; ptr++) Line 6192  for (;; ptr++)
6192          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6193          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6194    
6195            case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6196            case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6197            case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6198            case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6199    
6200          /* Because we are moving code along, we must ensure that any          /* Because we are moving code along, we must ensure that any
6201          pending recursive references are updated. */          pending recursive references are updated. */
6202    
# Line 5808  for (;; ptr++) Line 6212  for (;; ptr++)
6212          PUT(tempcode, 1, len);          PUT(tempcode, 1, len);
6213          break;          break;
6214          }          }
6215    #endif
6216        }        }
6217    
6218      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 6048  for (;; ptr++) Line 6453  for (;; ptr++)
6453    
6454          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6455          syntax (?(<name>) or (?('name'), and also allow for the original PCRE          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6456          syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may          syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6457          consist entirely of digits, there is scope for ambiguity. */          consist entirely of digits, there is scope for ambiguity. */
6458    
6459          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (ptr[1] == CHAR_LESS_THAN_SIGN)
# Line 6066  for (;; ptr++) Line 6471  for (;; ptr++)
6471            terminator = CHAR_NULL;            terminator = CHAR_NULL;
6472            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6473            }            }
6474    
6475          /* When a name is one of a number of duplicates, a different opcode is          /* When a name is one of a number of duplicates, a different opcode is
6476          used and it needs more memory. Unfortunately we cannot tell whether a          used and it needs more memory. Unfortunately we cannot tell whether a
6477          name is a duplicate in the first pass, so we have to allow for more          name is a duplicate in the first pass, so we have to allow for more
6478          memory except when we know it is a relative numerical reference. */          memory except when we know it is a relative numerical reference. */
6479    
6480          if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;          if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6481    
6482          /* We now expect to read a name (possibly all digits); anything else          /* We now expect to read a name (possibly all digits); anything else
# Line 6095  for (;; ptr++) Line 6500  for (;; ptr++)
6500          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6501    
6502          /* Check the terminator */          /* Check the terminator */
6503    
6504          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6505              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6506            {            {
# Line 6132  for (;; ptr++) Line 6537  for (;; ptr++)
6537    
6538          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
6539          name. */          name. */
6540    
6541          slot = cd->name_table;          slot = cd->name_table;
6542          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
6543            {            {
# Line 6140  for (;; ptr++) Line 6545  for (;; ptr++)
6545            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6546            }            }
6547    
6548          /* Found the named subpattern. If the name is duplicated, add one to          /* Found the named subpattern. If the name is duplicated, add one to
6549          the opcode to change CREF/RREF into DNCREF/DNRREF and insert          the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6550          appropriate data values. Otherwise, just insert the unique subpattern          appropriate data values. Otherwise, just insert the unique subpattern
6551          number. */          number. */
6552    
6553          if (i < cd->names_found)          if (i < cd->names_found)
6554            {            {
6555            int offset = i++;            int offset = i++;
6556            int count = 1;            int count = 1;
6557            recno = GET2(slot, 0);   /* Number from first found */            recno = GET2(slot, 0);   /* Number from first found */
6558            for (; i < cd->names_found; i++)            for (; i < cd->names_found; i++)
6559              {              {
6560              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6561              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6562              count++;              count++;
6563              }              }
6564            if (count > 1)            if (count > 1)
6565              {              {
6566              PUT2(code, 2+LINK_SIZE, offset);              PUT2(code, 2+LINK_SIZE, offset);
6567              PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);              PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6568              skipbytes += IMM2_SIZE;              skipbytes += IMM2_SIZE;
6569              code[1+LINK_SIZE]++;              code[1+LINK_SIZE]++;
6570              }              }
6571            else  /* Not a duplicated name */            else  /* Not a duplicated name */
6572              {              {
6573              PUT2(code, 2+LINK_SIZE, recno);              PUT2(code, 2+LINK_SIZE, recno);
6574              }              }
6575            }            }
6576    
6577          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
# Line 8863  return (pcre32 *)re; Line 9268  return (pcre32 *)re;
9268  }  }
9269    
9270  /* End of pcre_compile.c */  /* End of pcre_compile.c */
9271    

Legend:
Removed from v.1365  
changed lines
  Added in v.1384

  ViewVC Help
Powered by ViewVC 1.1.5