/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 782 by zherczeg, Sat Dec 3 23:58:37 2011 UTC revision 798 by zherczeg, Sun Dec 11 18:07:25 2011 UTC
# Line 102  overrun before it actually does run off Line 102  overrun before it actually does run off
102  #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */  #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
103  #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */  #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
104    
105    /* Repeated character flags. */
106    
107    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
108    
109  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
110  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
111  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
# Line 2353  for (code = first_significant_code(code Line 2357  for (code = first_significant_code(code
2357      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2358      here. */      here. */
2359    
2360  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2361      case OP_XCLASS:      case OP_XCLASS:
2362      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2363      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 2363  for (code = first_significant_code(code Line 2367  for (code = first_significant_code(code
2367      case OP_NCLASS:      case OP_NCLASS:
2368      ccode = code + PRIV(OP_lengths)[OP_CLASS];      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2369    
2370  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2371      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2372  #endif  #endif
2373    
# Line 2896  static BOOL Line 2900  static BOOL
2900  check_auto_possessive(const pcre_uchar *previous, BOOL utf,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2901    const pcre_uchar *ptr, int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2902  {  {
2903  int c, next;  pcre_int32 c, next;
2904  int op_code = *previous++;  int op_code = *previous++;
2905    
2906  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2932  if (*ptr == CHAR_BACKSLASH) Line 2936  if (*ptr == CHAR_BACKSLASH)
2936    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
2937    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
2938    }    }
2939    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
2940    {    {
2941  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2942    if (utf) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
2943  #endif  #endif
2944    next = *ptr++;    next = *ptr++;
2945    }    }
   
2946  else return FALSE;  else return FALSE;
2947    
2948  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2978  the next item is a character. */ Line 2980  the next item is a character. */
2980  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
2981    {    {
2982    case OP_CHAR:    case OP_CHAR:
2983  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2984    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
2985  #else  #else
2986    c = *previous;    c = *previous;
# Line 2990  if (next >= 0) switch(op_code) Line 2992  if (next >= 0) switch(op_code)
2992    high-valued characters. */    high-valued characters. */
2993    
2994    case OP_CHARI:    case OP_CHARI:
2995  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2996    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
2997  #else  #else
2998    c = *previous;    c = *previous;
2999  #endif  #endif
3000    if (c == next) return FALSE;    if (c == next) return FALSE;
3001  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3002    if (utf)    if (utf)
3003      {      {
3004      unsigned int othercase;      unsigned int othercase;
# Line 3009  if (next >= 0) switch(op_code) Line 3011  if (next >= 0) switch(op_code)
3011      return (unsigned int)c != othercase;      return (unsigned int)c != othercase;
3012      }      }
3013    else    else
3014  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3015    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
3016    
3017    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
# Line 3021  if (next >= 0) switch(op_code) Line 3023  if (next >= 0) switch(op_code)
3023    
3024    case OP_NOTI:    case OP_NOTI:
3025    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
3026  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3027    if (utf)    if (utf)
3028      {      {
3029      unsigned int othercase;      unsigned int othercase;
# Line 3034  if (next >= 0) switch(op_code) Line 3036  if (next >= 0) switch(op_code)
3036      return (unsigned int)c == othercase;      return (unsigned int)c == othercase;
3037      }      }
3038    else    else
3039  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3040    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
3041    
3042    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
# Line 3126  switch(op_code) Line 3128  switch(op_code)
3128    {    {
3129    case OP_CHAR:    case OP_CHAR:
3130    case OP_CHARI:    case OP_CHARI:
3131  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3132    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3133  #else  #else
3134    c = *previous;    c = *previous;
# Line 3356  pcre_uint8 classbits[32]; Line 3358  pcre_uint8 classbits[32];
3358  must not do this for other options (e.g. PCRE_EXTENDED) because they may change  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3359  dynamically as we process the pattern. */  dynamically as we process the pattern. */
3360    
3361  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3362  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3363  BOOL utf = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
3364  pcre_uchar utf_chars[6];  pcre_uchar utf_chars[6];
# Line 3411  for (;; ptr++) Line 3413  for (;; ptr++)
3413    BOOL is_quantifier;    BOOL is_quantifier;
3414    BOOL is_recurse;    BOOL is_recurse;
3415    BOOL reset_bracount;    BOOL reset_bracount;
3416    int class_charcount;    int class_has_8bitchar;
3417      int class_single_char;
3418    int class_lastchar;    int class_lastchar;
3419    int newoptions;    int newoptions;
3420    int recno;    int recno;
# Line 3708  for (;; ptr++) Line 3711  for (;; ptr++)
3711    
3712      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3713    
3714      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3715      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3716      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1, if the class only contains
3717        a single character. */
3718    
3719      class_charcount = 0;      class_has_8bitchar = 0;
3720        class_single_char = 0;
3721      class_lastchar = -1;      class_lastchar = -1;
3722    
3723      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
# Line 3736  for (;; ptr++) Line 3741  for (;; ptr++)
3741        {        {
3742        const pcre_uchar *oldptr;        const pcre_uchar *oldptr;
3743    
3744  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3745        if (utf && c > 127)        if (utf && HAS_EXTRALEN(c))
3746          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3747          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3748          }          }
# Line 3868  for (;; ptr++) Line 3873  for (;; ptr++)
3873            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3874    
3875          ptr = tempptr + 1;          ptr = tempptr + 1;
3876          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3877            class_has_8bitchar = 1;
3878            /* Every class contains at least two characters. */
3879            class_single_char = 2;
3880          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3881          }          }
3882    
3883        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3884        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3885        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3886        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3887        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3888        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3889          as literal characters (by default), or are faulted if
3890        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3891    
3892        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3900  for (;; ptr++) Line 3909  for (;; ptr++)
3909          if (c < 0)          if (c < 0)
3910            {            {
3911            register const pcre_uint8 *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3912            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
3913              class_has_8bitchar++;
3914              /* Every class contains at least two characters. */
3915              class_single_char += 2;
3916    
3917            switch (-c)            switch (-c)
3918              {              {
# Line 3913  for (;; ptr++) Line 3925  for (;; ptr++)
3925              case ESC_SU:              case ESC_SU:
3926              nestptr = ptr;              nestptr = ptr;
3927              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3928              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
3929              continue;              continue;
3930  #endif  #endif
3931              case ESC_d:              case ESC_d:
# Line 4079  for (;; ptr++) Line 4091  for (;; ptr++)
4091                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4092                *class_uchardata++ = ptype;                *class_uchardata++ = ptype;
4093                *class_uchardata++ = pdata;                *class_uchardata++ = pdata;
4094                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4095                continue;                continue;
4096                }                }
4097  #endif  #endif
# Line 4093  for (;; ptr++) Line 4105  for (;; ptr++)
4105                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4106                goto FAILED;                goto FAILED;
4107                }                }
4108              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4109              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4110                c = *ptr;                /* Get the final character and fall through */
4111              break;              break;
4112              }              }
4113            }            }
4114    
4115          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4116          greater than 256 mode. */          greater than 256. */
4117    
4118          }   /* End of backslash handling */          }   /* End of backslash handling */
4119    
# Line 4148  for (;; ptr++) Line 4161  for (;; ptr++)
4161            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4162            }            }
4163    
4164  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4165          if (utf)          if (utf)
4166            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4167            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
# Line 4193  for (;; ptr++) Line 4206  for (;; ptr++)
4206    
4207          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4208    
4209            /* Since we found a character range, single character optimizations
4210            cannot be done anymore. */
4211            class_single_char = 2;
4212    
4213          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4214          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4215          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4216          available. */          available. */
4217    
4218  #ifdef SUPPORT_UTF  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4219            if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4220    #elif defined  SUPPORT_UTF
4221          if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4222  #endif  #elif !(defined COMPILE_PCRE8)
 #ifndef COMPILE_PCRE8  
4223          if (d > 255)          if (d > 255)
4224  #endif  #endif
4225  #if defined SUPPORT_UTF || defined COMPILE_PCRE16  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4226            {            {
4227            xclass = TRUE;            xclass = TRUE;
4228    
# Line 4213  for (;; ptr++) Line 4231  for (;; ptr++)
4231            they fit with the basic range. */            they fit with the basic range. */
4232    
4233  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4234    #ifndef COMPILE_PCRE8
4235              if (utf && (options & PCRE_CASELESS) != 0)
4236    #else
4237            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4238    #endif
4239              {              {
4240              unsigned int occ, ocd;              unsigned int occ, ocd;
4241              unsigned int cc = c;              unsigned int cc = c;
# Line 4256  for (;; ptr++) Line 4278  for (;; ptr++)
4278    
4279            *class_uchardata++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4280  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4281    #ifndef COMPILE_PCRE8
4282              if (utf)
4283                {
4284                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4285                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4286                }
4287              else
4288                {
4289                *class_uchardata++ = c;
4290                *class_uchardata++ = d;
4291                }
4292    #else
4293            class_uchardata += PRIV(ord2utf)(c, class_uchardata);            class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4294            class_uchardata += PRIV(ord2utf)(d, class_uchardata);            class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4295  #else  #endif
4296    #else /* SUPPORT_UTF */
4297            *class_uchardata++ = c;            *class_uchardata++ = c;
4298            *class_uchardata++ = d;            *class_uchardata++ = d;
4299  #endif  #endif /* SUPPORT_UTF */
4300    
4301            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4302            caseless matching for UTF characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
# Line 4269  for (;; ptr++) Line 4304  for (;; ptr++)
4304            can still use  */            can still use  */
4305    
4306  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4307            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4308  #else            if (utf)
4309  #ifdef SUPPORT_UTF  #endif
4310                continue;    /* With next character in the class */
4311    #endif  /* SUPPORT_UCP */
4312    
4313    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4314              if (utf)
4315                {
4316                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4317                /* Adjust upper limit and fall through to set up the map */
4318                d = 127;
4319                }
4320              else
4321                {
4322                if (c > 255) continue;
4323                /* Adjust upper limit and fall through to set up the map */
4324                d = 255;
4325                }
4326    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4327            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4328            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
4329            d = 127;            d = 127;
# Line 4279  for (;; ptr++) Line 4331  for (;; ptr++)
4331            if (c > 255) continue;            if (c > 255) continue;
4332            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
4333            d = 255;            d = 255;
4334  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
 #endif  /* SUPPORT_UCP */  
4335            }            }
4336  #endif  /* SUPPORT_UTF8 || COMPILE_PCRE16 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4337    
4338          /* We use the bit map for 8 bit mode, or when the characters fall          /* We use the bit map for 8 bit mode, or when the characters fall
4339          partially or entirely to [0-255] ([0-127] for UCP) ranges. */          partially or entirely to [0-255] ([0-127] for UCP) ranges. */
4340    
4341          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4342    
4343          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4344    
# Line 4311  for (;; ptr++) Line 4361  for (;; ptr++)
4361    
4362        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4363    
4364        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4365          if (class_single_char < 2) class_single_char++;
4366          class_lastchar = c;
4367    
4368  #ifdef SUPPORT_UTF        /* Handle a character that cannot go in the bit map */
4369    #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4370          if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4371    #elif defined SUPPORT_UTF
4372        if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))        if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4373  #endif  #elif !(defined COMPILE_PCRE8)
 #ifndef COMPILE_PCRE8  
4374        if (c > 255)        if (c > 255)
4375  #endif  #endif
4376  #if defined SUPPORT_UTF || defined COMPILE_PCRE16  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4377          {          {
4378          xclass = TRUE;          xclass = TRUE;
4379          *class_uchardata++ = XCL_SINGLE;          *class_uchardata++ = XCL_SINGLE;
4380  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4381          class_uchardata += PRIV(ord2utf)(c, class_uchardata);  #ifndef COMPILE_PCRE8
4382  #else          /* In non 8 bit mode, we can get here even
4383          *class_uchardata++ = c;          if we are not in UTF mode. */
4384            if (!utf)
4385              *class_uchardata++ = c;
4386            else
4387  #endif  #endif
4388              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4389    #else /* SUPPORT_UTF */
4390            *class_uchardata++ = c;
4391    #endif /* SUPPORT_UTF */
4392    
4393  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4394    #ifdef COMPILE_PCRE8
4395          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4396    #else
4397            /* In non 8 bit mode, we can get here even
4398            if we are not in UTF mode. */
4399            if (utf && (options & PCRE_CASELESS) != 0)
4400    #endif
4401            {            {
4402            unsigned int othercase;            unsigned int othercase;
4403            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((othercase = UCD_OTHERCASE(c)) != c)
# Line 4343  for (;; ptr++) Line 4410  for (;; ptr++)
4410    
4411          }          }
4412        else        else
4413  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
   
4414        /* Handle a single-byte character */        /* Handle a single-byte character */
4415          {          {
4416            class_has_8bitchar = 1;
4417          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4418          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4419            {            {
4420            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c];   /* flip case */
4421            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4422            }            }
         class_charcount++;  
         class_lastchar = c;  
4423          }          }
4424    
4425        }        }
4426    
4427      /* Loop until ']' reached. This "while" is the end of the "do" far above.      /* Loop until ']' reached. This "while" is the end of the "do" far above.
# Line 4393  for (;; ptr++) Line 4459  for (;; ptr++)
4459      of reqchar, save the previous value for reinstating. */      of reqchar, save the previous value for reinstating. */
4460    
4461  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4462      if (class_charcount == 1 && !xclass &&      if (class_single_char == 1 && (!utf || !negate_class
4463        (!utf || !negate_class || class_lastchar < 128))        || class_lastchar < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4464  #else  #else
4465      if (class_charcount == 1)      if (class_single_char == 1)
4466  #endif  #endif
4467        {        {
4468        zeroreqchar = reqchar;        zeroreqchar = reqchar;
4469    
4470        /* The OP_NOT[I] opcodes work on one-byte characters only. */        /* The OP_NOT[I] opcodes work on single characters only. */
4471    
4472        if (negate_class)        if (negate_class)
4473          {          {
# Line 4415  for (;; ptr++) Line 4481  for (;; ptr++)
4481        /* For a single, positive character, get the value into mcbuffer, and        /* For a single, positive character, get the value into mcbuffer, and
4482        then we can handle this with the normal one-character code. */        then we can handle this with the normal one-character code. */
4483    
4484  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4485        if (utf && class_lastchar > 127)        if (utf && class_lastchar > MAX_VALUE_FOR_SINGLE_CHAR)
4486          mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);          mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);
4487        else        else
4488  #endif  #endif
# Line 4460  for (;; ptr++) Line 4526  for (;; ptr++)
4526        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4527        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4528    
4529        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4530          {          {
4531          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4532          memmove(code + (32 / sizeof(pcre_uchar)), code,          memmove(code + (32 / sizeof(pcre_uchar)), code,
# Line 4603  for (;; ptr++) Line 4669  for (;; ptr++)
4669    
4670        /* Deal with UTF characters that take up more than one character. It's        /* Deal with UTF characters that take up more than one character. It's
4671        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4672        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus UTF_LENGTH to flag that
4673        length rather than a small character. */        it's a length rather than a small character. */
4674    
4675  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4676        if (utf && (code[-1] & 0x80) != 0)        if (utf && NOT_FIRSTCHAR(code[-1]))
4677          {          {
4678          pcre_uchar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4679          BACKCHAR(lastchar);          BACKCHAR(lastchar);
4680          c = code - lastchar;            /* Length of UTF-8 character */          c = code - lastchar;            /* Length of UTF-8 character */
4681          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4682          c |= 0x80;                      /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4683          }          }
4684        else        else
4685  #endif  #endif /* SUPPORT_UTF */
4686    
4687        /* Handle the case of a single character - either with no UTF support, or        /* Handle the case of a single character - either with no UTF support, or
4688        with UTF disabled, or for a single character UTF character. */        with UTF disabled, or for a single character UTF character. */
   
4689          {          {
4690          c = code[-1];          c = code[-1];
4691          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
# Line 4758  for (;; ptr++) Line 4823  for (;; ptr++)
4823          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
4824          Unicode property match, there are two extra bytes that define the          Unicode property match, there are two extra bytes that define the
4825          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
4826          c, with the 0x80 bit as a flag. */          c, with the UTF_LENGTH bit as a flag. */
4827    
4828          if (repeat_max < 0)          if (repeat_max < 0)
4829            {            {
4830  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4831            if (utf && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4832              {              {
4833              memcpy(code, utf_chars, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4834              code += c & 7;              code += c & 7;
4835              }              }
4836            else            else
# Line 4787  for (;; ptr++) Line 4852  for (;; ptr++)
4852    
4853          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
4854            {            {
4855  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4856            if (utf && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4857              {              {
4858              memcpy(code, utf_chars, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4859              code += c & 7;              code += c & 7;
4860              }              }
4861            else            else
# Line 4817  for (;; ptr++) Line 4882  for (;; ptr++)
4882    
4883        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
4884    
4885  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4886        if (utf && c >= 128)        if (utf && (c & UTF_LENGTH) != 0)
4887          {          {
4888          memcpy(code, utf_chars, c & 7);          memcpy(code, utf_chars, IN_UCHARS(c & 7));
4889          code += c & 7;          code += c & 7;
4890          }          }
4891        else        else
# Line 4844  for (;; ptr++) Line 4909  for (;; ptr++)
4909    
4910      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
4911               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
4912  #if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4913               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
4914  #endif  #endif
4915               *previous == OP_REF ||               *previous == OP_REF ||
# Line 5811  for (;; ptr++) Line 5876  for (;; ptr++)
5876                *errorcodeptr = ERR49;                *errorcodeptr = ERR49;
5877                goto FAILED;                goto FAILED;
5878                }                }
5879              if (namelen + 3 > cd->name_entry_size)              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
5880                {                {
5881                cd->name_entry_size = namelen + 3;                cd->name_entry_size = namelen + IMM2_SIZE + 1;
5882                if (namelen > MAX_NAME_SIZE)                if (namelen > MAX_NAME_SIZE)
5883                  {                  {
5884                  *errorcodeptr = ERR48;                  *errorcodeptr = ERR48;
# Line 5842  for (;; ptr++) Line 5907  for (;; ptr++)
5907    
5908              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
5909                {                {
5910                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
5911                if (crc == 0)                if (crc == 0)
5912                  {                  {
5913                  if (slot[2+namelen] == 0)                  if (slot[IMM2_SIZE+namelen] == 0)
5914                    {                    {
5915                    if (GET2(slot, 0) != cd->bracount + 1 &&                    if (GET2(slot, 0) != cd->bracount + 1 &&
5916                        (options & PCRE_DUPNAMES) == 0)                        (options & PCRE_DUPNAMES) == 0)
# Line 5897  for (;; ptr++) Line 5962  for (;; ptr++)
5962                }                }
5963    
5964              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
5965              memcpy(slot + 2, name, IN_UCHARS(namelen));              memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
5966              slot[2 + namelen] = 0;              slot[IMM2_SIZE + namelen] = 0;
5967              }              }
5968            }            }
5969    
# Line 5982  for (;; ptr++) Line 6047  for (;; ptr++)
6047            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
6048              {              {
6049              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6050                  slot[2+namelen] == 0)                  slot[IMM2_SIZE+namelen] == 0)
6051                break;                break;
6052              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6053              }              }
# Line 6636  for (;; ptr++) Line 6701  for (;; ptr++)
6701      a value > 127. We set its representation in the length/buffer, and then      a value > 127. We set its representation in the length/buffer, and then
6702      handle it as a data character. */      handle it as a data character. */
6703    
6704  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
6705      if (utf && c > 127)      if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
6706        mclength = PRIV(ord2utf)(c, mcbuffer);        mclength = PRIV(ord2utf)(c, mcbuffer);
6707      else      else
6708  #endif  #endif
# Line 6661  for (;; ptr++) Line 6726  for (;; ptr++)
6726    
6727  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6728      if (utf && HAS_EXTRALEN(c))      if (utf && HAS_EXTRALEN(c))
6729        {        ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
       INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));  
       }  
6730  #endif  #endif
6731    
6732      /* At this point we have the character's bytes in mcbuffer, and the length      /* At this point we have the character's bytes in mcbuffer, and the length
# Line 7435  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 7498  while (ptr[skipatstart] == CHAR_LEFT_PAR
7498    int newnl = 0;    int newnl = 0;
7499    int newbsr = 0;    int newbsr = 0;
7500    
7501    if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)  #ifdef COMPILE_PCRE8
7502      if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 5) == 0)
7503      { skipatstart += 7; options |= PCRE_UTF8; continue; }      { skipatstart += 7; options |= PCRE_UTF8; continue; }
7504    #endif
7505    #ifdef COMPILE_PCRE16
7506      if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0)
7507        { skipatstart += 8; options |= PCRE_UTF16; continue; }
7508    #endif
7509    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
7510      { skipatstart += 6; options |= PCRE_UCP; continue; }      { skipatstart += 6; options |= PCRE_UCP; continue; }
7511    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
# Line 7468  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 7537  while (ptr[skipatstart] == CHAR_LEFT_PAR
7537  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
7538  utf = (options & PCRE_UTF8) != 0;  utf = (options & PCRE_UTF8) != 0;
7539    
7540  /* Can't support UTF8 unless PCRE has been compiled to include the code. The  /* Can't support UTF unless PCRE has been compiled to include the code. The
7541  return of an error code from PRIV(valid_utf)() is a new feature, introduced in  return of an error code from PRIV(valid_utf)() is a new feature, introduced in
7542  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7543  not used here. */  not used here. */
7544    
7545  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
7546  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7547       (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)       (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
7548    {    {
# Line 7610  externally provided function. Integer ov Line 7679  externally provided function. Integer ov
7679  because nowadays we limit the maximum value of cd->names_found and  because nowadays we limit the maximum value of cd->names_found and
7680  cd->name_entry_size. */  cd->name_entry_size. */
7681    
7682  size = sizeof(real_pcre) + (length + cd->names_found * (cd->name_entry_size + 3)) * sizeof(pcre_uchar);  size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
7683  re = (real_pcre *)(pcre_malloc)(size);  re = (real_pcre *)(pcre_malloc)(size);
7684    
7685  if (re == NULL)  if (re == NULL)
# Line 7670  code = (pcre_uchar *)codestart; Line 7739  code = (pcre_uchar *)codestart;
7739    &firstchar, &reqchar, NULL, cd, NULL);    &firstchar, &reqchar, NULL, cd, NULL);
7740  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
7741  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
7742  re->flags = cd->external_flags;  re->flags = cd->external_flags | PCRE_MODE;
7743    
7744  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
7745    
# Line 7789  if ((re->options & PCRE_ANCHORED) == 0) Line 7858  if ((re->options & PCRE_ANCHORED) == 0)
7858        re->first_char = firstchar & 0xffff;        re->first_char = firstchar & 0xffff;
7859  #endif  #endif
7860  #endif  #endif
7861        if ((firstchar & REQ_CASELESS) != 0 && MAX_255(re->first_char)        if ((firstchar & REQ_CASELESS) != 0)
7862          && cd->fcc[re->first_char] != re->first_char)          {
7863          re->flags |= PCRE_FCH_CASELESS;  #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
7864            /* We ignore non-ASCII first chars in 8 bit mode. */
7865            if (utf)
7866              {
7867              if (re->first_char < 128)
7868                {
7869                if (cd->fcc[re->first_char] != re->first_char)
7870                  re->flags |= PCRE_FCH_CASELESS;
7871                }
7872              else if (UCD_OTHERCASE(re->first_char) != re->first_char)
7873                re->flags |= PCRE_FCH_CASELESS;
7874              }
7875            else
7876    #endif
7877            if (MAX_255(re->first_char)
7878                && cd->fcc[re->first_char] != re->first_char)
7879              re->flags |= PCRE_FCH_CASELESS;
7880            }
7881    
7882        re->flags |= PCRE_FIRSTSET;        re->flags |= PCRE_FIRSTSET;
7883        }        }
# Line 7814  if (reqchar >= 0 && Line 7900  if (reqchar >= 0 &&
7900    re->req_char = reqchar & 0xffff;    re->req_char = reqchar & 0xffff;
7901  #endif  #endif
7902  #endif  #endif
7903    if ((reqchar & REQ_CASELESS) != 0 && MAX_255(re->req_char)    if ((reqchar & REQ_CASELESS) != 0)
7904      && cd->fcc[re->req_char] != re->req_char)      {
7905      re->flags |= PCRE_RCH_CASELESS;  #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
7906        /* We ignore non-ASCII required chars in 8 bit mode. */
7907        if (utf)
7908          {
7909          if (re->req_char < 128)
7910            {
7911            if (cd->fcc[re->req_char] != re->req_char)
7912              re->flags |= PCRE_RCH_CASELESS;
7913            }
7914          else if (UCD_OTHERCASE(re->req_char) != re->req_char)
7915            re->flags |= PCRE_RCH_CASELESS;
7916          }
7917        else
7918    #endif
7919        if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
7920          re->flags |= PCRE_RCH_CASELESS;
7921        }
7922    
7923    re->flags |= PCRE_REQCHSET;    re->flags |= PCRE_REQCHSET;
7924    }    }

Legend:
Removed from v.782  
changed lines
  Added in v.798

  ViewVC Help
Powered by ViewVC 1.1.5