/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1364 by ph10, Sat Oct 5 15:45:11 2013 UTC revision 1375 by zherczeg, Sat Oct 12 17:56:40 2013 UTC
# Line 462  static const char error_texts[] = Line 462  static const char error_texts[] =
462    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
463    "this version of PCRE is compiled without UTF support\0"    "this version of PCRE is compiled without UTF support\0"
464    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
465    "character value in \\x{...} sequence is too large\0"    "character value in \\x{} or \\o{} is too large\0"
466    /* 35 */    /* 35 */
467    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
468    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
# Line 516  static const char error_texts[] = Line 516  static const char error_texts[] =
516    "character value in \\u.... sequence is too large\0"    "character value in \\u.... sequence is too large\0"
517    "invalid UTF-32 string\0"    "invalid UTF-32 string\0"
518    "setting UTF is disabled by the application\0"    "setting UTF is disabled by the application\0"
519      "non-hex character in \\x{} (closing brace missing?)\0"
520      /* 80 */
521      "non-octal character in \\o{} (closing brace missing?)\0"
522      "missing opening brace after \\o\0"
523    ;    ;
524    
525  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 883  return (*p == CHAR_RIGHT_CURLY_BRACKET);
883  *************************************************/  *************************************************/
884    
885  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
886  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
887  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
888  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
889  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
890  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
891    
892  Arguments:  Arguments:
893    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
894    chptr          points to the data character    chptr          points to a returned data character
895    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
896    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
897    options        the options bits    options        the options bits
# Line 1092  else Line 1095  else
1095      break;      break;
1096    
1097      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1098      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1099      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1100        recommended to avoid the ambiguities in the old syntax.
1101    
1102      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1103      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1104      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1105      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1106      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1107      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1108      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1109    
1110        Inside a character class, \ followed by a digit is always either a literal
1111        8 or 9 or an octal number. */
1112    
1113      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1114      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1128  else Line 1135  else
1135          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1136          break;          break;
1137          }          }
1138        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1139          {          {
1140          escape = -s;          escape = -s;
1141          break;          break;
# Line 1136  else Line 1143  else
1143        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1144        }        }
1145    
1146      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1147      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1148      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1149        changed so as not to insert the binary zero. */
1150    
1151      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1152        {  
1153        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1154    
1155      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1156      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1162  else Line 1167  else
1167  #endif  #endif
1168      break;      break;
1169    
1170      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1171      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.      specifying character codes in octal. The only supported form is \o{ddd}. */
1172      If not, { is treated as a data character. */  
1173        case CHAR_o:
1174        if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1175          {
1176          ptr += 2;
1177          c = 0;
1178          overflow = FALSE;
1179          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1180            {
1181            register pcre_uint32 cc = *ptr++;
1182            if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1183    #ifdef COMPILE_PCRE32
1184            if (c >= 0x20000000l) { overflow = TRUE; break; }
1185    #endif
1186            c = (c << 3) + cc - CHAR_0 ;
1187    #if defined COMPILE_PCRE8
1188            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1189    #elif defined COMPILE_PCRE16
1190            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1191    #elif defined COMPILE_PCRE32
1192            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1193    #endif
1194            }
1195          if (overflow)
1196            {
1197            while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1198            *errorcodeptr = ERR34;
1199            }
1200          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1201            {
1202            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1203            }
1204          else *errorcodeptr = ERR80;
1205          }
1206        break;
1207    
1208        /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1209        digits. Otherwise it is a lowercase x letter. */
1210    
1211      case CHAR_x:      case CHAR_x:
1212      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1213        {        {
       /* In JavaScript, \x must be followed by two hexadecimal numbers.  
       Otherwise it is a lowercase x letter. */  
1214        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1215          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1216          {          {
# Line 1187  else Line 1227  else
1227  #endif  #endif
1228            }            }
1229          }          }
1230        break;        }    /* End JavaScript handling */
       }  
1231    
1232      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1233        {      greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1234        const pcre_uchar *pt = ptr + 2;      digits. If not, { used to be treated as a data character. However, Perl
1235        seems to read hex digits up to the first non-such, and ignore the rest, so
1236        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1237        now gives an error. */
1238    
1239        c = 0;      else
1240        overflow = FALSE;        {
1241        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1242          {          {
1243          register pcre_uint32 cc = *pt++;          ptr += 2;
1244          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          c = 0;
1245            overflow = FALSE;
1246            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1247              {
1248              register pcre_uint32 cc = *ptr++;
1249              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1250    
1251  #ifdef COMPILE_PCRE32  #ifdef COMPILE_PCRE32
1252          if (c >= 0x10000000l) { overflow = TRUE; break; }            if (c >= 0x10000000l) { overflow = TRUE; break; }
1253  #endif  #endif
1254    
1255  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1256          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1257          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1258  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1259          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1260          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1261  #endif  #endif
1262    
1263  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
1264          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1265  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
1266          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1267  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
1268          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1269  #endif  #endif
1270          }            }
1271    
1272        if (overflow)          if (overflow)
1273          {            {
1274          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1275          *errorcodeptr = ERR34;            *errorcodeptr = ERR34;
1276          }            }
1277    
1278        if (*pt == CHAR_RIGHT_CURLY_BRACKET)          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1279          {            {
1280          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1281          ptr = pt;            }
         break;  
         }  
1282    
1283        /* If the sequence of hex digits does not end with '}', then we don't          /* If the sequence of hex digits does not end with '}', give an error.
1284        recognize this construct; fall through to the normal \x handling. */          We used just to recognize this construct and fall through to the normal
1285        }          \x handling, but nowadays Perl gives an error, which seems much more
1286            sensible, so we do too. */
1287    
1288      /* Read just a single-byte hex-defined char */          else *errorcodeptr = ERR79;
1289            }   /* End of \x{} processing */
1290    
1291      c = 0;        /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1292      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)  
1293        {        else
1294        pcre_uint32 cc;                          /* Some compilers don't like */          {
1295        cc = *(++ptr);                           /* ++ in initializers */          c = 0;
1296            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1297              {
1298              pcre_uint32 cc;                          /* Some compilers don't like */
1299              cc = *(++ptr);                           /* ++ in initializers */
1300  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1301        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1302        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1303  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1304        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */            if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1305        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1306  #endif  #endif
1307        }            }
1308            }     /* End of \xdd handling */
1309          }       /* End of Perl-style \x handling */
1310      break;      break;
1311    
1312      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
# Line 1524  for (;;) Line 1577  for (;;)
1577    
1578      case OP_CALLOUT:      case OP_CALLOUT:
1579      case OP_CREF:      case OP_CREF:
1580      case OP_NCREF:      case OP_DNCREF:
1581      case OP_RREF:      case OP_RREF:
1582      case OP_NRREF:      case OP_DNRREF:
1583      case OP_DEF:      case OP_DEF:
1584      code += PRIV(OP_lengths)[*code];      code += PRIV(OP_lengths)[*code];
1585      break;      break;
# Line 1663  for (;;) Line 1716  for (;;)
1716      case OP_COMMIT:      case OP_COMMIT:
1717      case OP_CREF:      case OP_CREF:
1718      case OP_DEF:      case OP_DEF:
1719        case OP_DNCREF:
1720        case OP_DNRREF:
1721      case OP_DOLL:      case OP_DOLL:
1722      case OP_DOLLM:      case OP_DOLLM:
1723      case OP_EOD:      case OP_EOD:
1724      case OP_EODN:      case OP_EODN:
1725      case OP_FAIL:      case OP_FAIL:
     case OP_NCREF:  
     case OP_NRREF:  
1726      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1727      case OP_PRUNE:      case OP_PRUNE:
1728      case OP_REVERSE:      case OP_REVERSE:
# Line 2653  switch(ptype) Line 2706  switch(ptype)
2706    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2707    means that Perl space and POSIX space are now identical. PCRE was changed    means that Perl space and POSIX space are now identical. PCRE was changed
2708    at release 8.34. */    at release 8.34. */
2709    
2710    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2711    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2712    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
# Line 2818  switch(c) Line 2871  switch(c)
2871      return code + 2;      return code + 2;
2872      }      }
2873    
2874    /* Convert only if we have anough space. */    /* Convert only if we have enough space. */
2875    
2876    clist_src = PRIV(ucd_caseless_sets) + code[1];    clist_src = PRIV(ucd_caseless_sets) + code[1];
2877    clist_dest = list + 2;    clist_dest = list + 2;
2878    code += 2;    code += 2;
2879    
2880    do {    do {
      /* Early return if there is not enough space. */  
2881       if (clist_dest >= list + 8)       if (clist_dest >= list + 8)
2882         {         {
2883           /* Early return if there is not enough space. This should never
2884           happen, since all clists are shorter than 5 characters now. */
2885         list[2] = code[0];         list[2] = code[0];
2886         list[3] = code[1];         list[3] = code[1];
2887         return code;         return code;
2888         }         }
2889       *clist_dest++ = *clist_src;       *clist_dest++ = *clist_src;
2890       }       }
2891     while(*clist_src++ != NOTACHAR);    while(*clist_src++ != NOTACHAR);
2892    
2893    /* Enough space to store all characters. */    /* All characters are stored. The terminating NOTACHAR
2894      is copied from the clist itself. */
2895    
2896    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2897    return code;    return code;
# Line 2902  pcre_uint32 list[8]; Line 2957  pcre_uint32 list[8];
2957  const pcre_uint32* chr_ptr;  const pcre_uint32* chr_ptr;
2958  const pcre_uint32* ochr_ptr;  const pcre_uint32* ochr_ptr;
2959  const pcre_uint32* list_ptr;  const pcre_uint32* list_ptr;
2960    const pcre_uchar *next_code;
2961  pcre_uint32 chr;  pcre_uint32 chr;
2962    
2963    /* Note: base_list[1] records whether the current opcode has a greedy
2964    quantifier (represented by a non-zero value). This differs from other
2965    character type lists, which use this slot to record that the character
2966    iterator matches an empty string (also represented by a non-zero value). */
2967    
2968  for(;;)  for(;;)
2969    {    {
2970    c = *code;    c = *code;
# Line 2925  for(;;) Line 2986  for(;;)
2986    switch(c)    switch(c)
2987      {      {
2988      case OP_END:      case OP_END:
2989      /* TRUE only in greedy case. The non-greedy case could be replaced by an      case OP_KETRPOS:
2990      OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses      /* TRUE only in greedy case. The non-greedy case could be replaced by
2991      more memory, which we cannot get at this stage.) */      an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
2992        uses more memory, which we cannot get at this stage.) */
2993    
2994      return base_list[1] != 0;      return base_list[1] != 0;
2995    
2996      case OP_KET:      case OP_KET:
2997      /* If the bracket is capturing, and referenced by an OP_RECURSE, the      /* If the bracket is capturing, and referenced by an OP_RECURSE, or
2998      non-greedy case cannot be converted to a possessive form. We do not test      it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
2999      the bracket type at the moment, but we might do it in the future to improve      cannot be converted to a possessive form. */
     this condition. (But note that recursive calls are always atomic.) */  
3000    
3001      if (base_list[1] == 0) return FALSE;      if (base_list[1] == 0) return FALSE;
3002    
3003        switch(*(code - GET(code, 1)))
3004          {
3005          case OP_ASSERT:
3006          case OP_ASSERT_NOT:
3007          case OP_ASSERTBACK:
3008          case OP_ASSERTBACK_NOT:
3009          case OP_ONCE:
3010          case OP_ONCE_NC:
3011          /* Atomic sub-patterns and assertions can always auto-possessify their
3012          last iterator. */
3013          return TRUE;
3014          }
3015    
3016        code += PRIV(OP_lengths)[c];
3017        continue;
3018    
3019        case OP_ONCE:
3020        case OP_ONCE_NC:
3021        case OP_BRA:
3022        case OP_CBRA:
3023        next_code = code;
3024        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3025    
3026        /* We do not support repeated brackets, because they can lead to
3027        infinite recursion. */
3028    
3029        if (*next_code != OP_KET) return FALSE;
3030    
3031        next_code = code + GET(code, 1);
3032        code += PRIV(OP_lengths)[c];
3033    
3034        while (*next_code == OP_ALT)
3035          {
3036          if (!compare_opcodes(code, utf, cd, base_list)) return FALSE;
3037          code = next_code + 1 + LINK_SIZE;
3038          next_code += GET(next_code, 1);
3039          }
3040        continue;
3041    
3042        case OP_BRAZERO:
3043        case OP_BRAMINZERO:
3044    
3045        next_code = code + 1;
3046        if (*next_code != OP_BRA && *next_code != OP_CBRA)
3047          return FALSE;
3048    
3049        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3050    
3051        /* We do not support repeated brackets, because they can lead to
3052        infinite recursion. */
3053        if (*next_code != OP_KET) return FALSE;
3054    
3055        /* The bracket content will be checked by the
3056        OP_BRA/OP_CBRA case above. */
3057        next_code += 1 + LINK_SIZE;
3058        if (!compare_opcodes(next_code, utf, cd, base_list)) return FALSE;
3059    
3060      code += PRIV(OP_lengths)[c];      code += PRIV(OP_lengths)[c];
3061      continue;      continue;
3062      }      }
# Line 3011  for(;;) Line 3130  for(;;)
3130            /* This code is logically tricky. Think hard before fiddling with it.            /* This code is logically tricky. Think hard before fiddling with it.
3131            The posspropstab table has four entries per row. Each row relates to            The posspropstab table has four entries per row. Each row relates to
3132            one of PCRE's special properties such as ALNUM or SPACE or WORD.            one of PCRE's special properties such as ALNUM or SPACE or WORD.
3133            Only WORD actually needs all four entries, but using repeats for the            Only WORD actually needs all four entries, but using repeats for the
3134            others means they can all use the same code below.            others means they can all use the same code below.
3135    
3136            The first two entries in each row are Unicode general categories, and            The first two entries in each row are Unicode general categories, and
3137            apply always, because all the characters they include are part of the            apply always, because all the characters they include are part of the
3138            PCRE character set. The third and fourth entries are a general and a            PCRE character set. The third and fourth entries are a general and a
# Line 3023  for(;;) Line 3142  for(;;)
3142            category contains more characters than the specials that are defined            category contains more characters than the specials that are defined
3143            for the property being tested against. Therefore, it cannot be used            for the property being tested against. Therefore, it cannot be used
3144            in a NOTPROP case.            in a NOTPROP case.
3145    
3146            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3147            Underscore is covered by ucp_P or ucp_Po. */            Underscore is covered by ucp_P or ucp_Po. */
3148    
# Line 3207  for(;;) Line 3326  for(;;)
3326        case OP_CLASS:        case OP_CLASS:
3327        if (list_ptr != list) return FALSE;   /* Class is first opcode */        if (list_ptr != list) return FALSE;   /* Class is first opcode */
3328        if (chr > 255) break;        if (chr > 255) break;
3329        if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)        if ((((pcre_uint8 *)(code - list_ptr[2]))[chr >> 3] & (1 << (chr & 7))) != 0)
3330          return FALSE;          return FALSE;
3331        break;        break;
3332    
3333  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3334        case OP_XCLASS:        case OP_XCLASS:
3335        if (list_ptr != list) return FALSE;   /* Class is first opcode */        if (list_ptr != list) return FALSE;   /* Class is first opcode */
3336        if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))        if (PRIV(xclass)(chr, code - list_ptr[2] + LINK_SIZE, utf))
3337          return FALSE;          return FALSE;
3338        break;        break;
3339  #endif  #endif
# Line 4201  for (;; ptr++) Line 4320  for (;; ptr++)
4320        }        }
4321      }      }
4322    
   /* Fill in length of a previous callout, except when the next thing is  
   a quantifier. */  
   
4323    is_quantifier =    is_quantifier =
4324      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4325      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4326    
4327    if (!is_quantifier && previous_callout != NULL &&    /* Fill in length of a previous callout, except when the next thing is a
4328      quantifier or when processing a property substitution string in UCP mode. */
4329    
4330      if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4331         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
4332      {      {
4333      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
# Line 4239  for (;; ptr++) Line 4358  for (;; ptr++)
4358        }        }
4359      }      }
4360    
4361    /* No auto callout for quantifiers. */    /* No auto callout for quantifiers, or while processing property strings that
4362      are substituted for \w etc in UCP mode. */
4363    
4364    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4365      {      {
4366      previous_callout = code;      previous_callout = code;
4367      code = auto_callout(code, ptr, cd);      code = auto_callout(code, ptr, cd);
# Line 4631  for (;; ptr++) Line 4751  for (;; ptr++)
4751              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4752              previously set by something earlier in the character class.              previously set by something earlier in the character class.
4753              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4754              we could just adjust the appropriate bit. From PCRE 8.34 we no              we could just adjust the appropriate bit. From PCRE 8.34 we no
4755              longer treat \s and \S specially. */              longer treat \s and \S specially. */
4756    
4757              case ESC_s:              case ESC_s:
# Line 6030  for (;; ptr++) Line 6150  for (;; ptr++)
6150                 tempptr[2] == CHAR_LESS_THAN_SIGN))                 tempptr[2] == CHAR_LESS_THAN_SIGN))
6151            break;            break;
6152    
6153          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6154          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6155    
6156          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
6157          skipbytes = 1+IMM2_SIZE;          skipbytes = 1+IMM2_SIZE;
# Line 6047  for (;; ptr++) Line 6167  for (;; ptr++)
6167            }            }
6168    
6169          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6170          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6171            syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6172            consist entirely of digits, there is scope for ambiguity. */
6173    
6174          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6175            {            {
# Line 6065  for (;; ptr++) Line 6187  for (;; ptr++)
6187            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6188            }            }
6189    
6190          /* We now expect to read a name; any thing else is an error */          /* When a name is one of a number of duplicates, a different opcode is
6191            used and it needs more memory. Unfortunately we cannot tell whether a
6192            name is a duplicate in the first pass, so we have to allow for more
6193            memory except when we know it is a relative numerical reference. */
6194    
6195            if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6196    
6197            /* We now expect to read a name (possibly all digits); any thing else
6198            is an error. In the case of all digits, also get it as a number. */
6199    
6200          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6201            {            {
# Line 6074  for (;; ptr++) Line 6204  for (;; ptr++)
6204            goto FAILED;            goto FAILED;
6205            }            }
6206    
         /* Read the name, but also get it as a number if it's all digits */  
   
6207          recno = 0;          recno = 0;
6208          name = ++ptr;          name = ++ptr;
6209          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
# Line 6086  for (;; ptr++) Line 6214  for (;; ptr++)
6214            }            }
6215          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6216    
6217            /* Check the terminator */
6218    
6219          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6220              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6221            {            {
# Line 6121  for (;; ptr++) Line 6251  for (;; ptr++)
6251            }            }
6252    
6253          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
6254          name. If we find a name, add one to the opcode to change OP_CREF or          name. */
         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,  
         except they record that the reference was originally to a name. The  
         information is used to check duplicate names. */  
6255    
6256          slot = cd->name_table;          slot = cd->name_table;
6257          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 6133  for (;; ptr++) Line 6260  for (;; ptr++)
6260            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6261            }            }
6262    
6263          /* Found the named subpattern */          /* Found the named subpattern. If the name is duplicated, add one to
6264            the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6265            appropriate data values. Otherwise, just insert the unique subpattern
6266            number. */
6267    
6268          if (i < cd->names_found)          if (i < cd->names_found)
6269            {            {
6270            recno = GET2(slot, 0);            int offset = i++;
6271            PUT2(code, 2+LINK_SIZE, recno);            int count = 1;
6272            code[1+LINK_SIZE]++;            recno = GET2(slot, 0);   /* Number from first found */
6273              for (; i < cd->names_found; i++)
6274                {
6275                slot += cd->name_entry_size;
6276                if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6277                count++;
6278                }
6279              if (count > 1)
6280                {
6281                PUT2(code, 2+LINK_SIZE, offset);
6282                PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6283                skipbytes += IMM2_SIZE;
6284                code[1+LINK_SIZE]++;
6285                }
6286              else  /* Not a duplicated name */
6287                {
6288                PUT2(code, 2+LINK_SIZE, recno);
6289                }
6290            }            }
6291    
6292          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
# Line 7829  do { Line 7976  do {
7976       switch (*scode)       switch (*scode)
7977         {         {
7978         case OP_CREF:         case OP_CREF:
7979         case OP_NCREF:         case OP_DNCREF:
7980         case OP_RREF:         case OP_RREF:
7981         case OP_NRREF:         case OP_DNRREF:
7982         case OP_DEF:         case OP_DEF:
7983         return FALSE;         return FALSE;
7984    

Legend:
Removed from v.1364  
changed lines
  Added in v.1375

  ViewVC Help
Powered by ViewVC 1.1.5