/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 1045 by ph10, Sun Sep 23 16:50:00 2012 UTC | revision 1047 by zherczeg, Fri Sep 28 15:06:38 2012 UTC
# Line 1859  for (;;) Line 1859  for (;;)
1859    
1860      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1861      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1862      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1863          cc += 2;
1864      cc += 1 + IMM2_SIZE + 1;      cc += 1 + IMM2_SIZE + 1;
1865      break;      break;
1866    
# Line 2097  for (;;) Line 2098  for (;;)
2098        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2099        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2100        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2101        if (code[1 + IMM2_SIZE] == OP_PROP        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2102          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;          code += 2;
2103        break;        break;
2104    
2105        case OP_MARK:        case OP_MARK:
# Line 2217  for (;;) Line 2218  for (;;)
2218        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2219        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2220        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2221        if (code[1 + IMM2_SIZE] == OP_PROP        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2222          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;          code += 2;
2223        break;        break;
2224    
2225        case OP_MARK:        case OP_MARK:
# Line 2543  for (code = first_significant_code(code Line 2544  for (code = first_significant_code(code
2544      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2545      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2546      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2547      if (code[1 + IMM2_SIZE] == OP_PROP      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2548        || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;        code += 2;
2549      break;      break;
2550    
2551      /* End of branch */      /* End of branch */
# Line 2951  Returns:       TRUE if auto-possessifyin Line 2952  Returns:       TRUE if auto-possessifyin
2952  static BOOL  static BOOL
2953  check_char_prop(int c, int ptype, int pdata, BOOL negated)  check_char_prop(int c, int ptype, int pdata, BOOL negated)
2954  {  {
2955    #ifdef SUPPORT_UCP
2956    const pcre_uint32 *p;
2957    #endif
2958    
2959  const ucd_record *prop = GET_UCD(c);  const ucd_record *prop = GET_UCD(c);
2960    
2961  switch(ptype)  switch(ptype)
2962    {    {
2963    case PT_LAMP:    case PT_LAMP:
# Line 2989  switch(ptype) Line 2995  switch(ptype)
2995    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2996            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2997            c == CHAR_UNDERSCORE) == negated;            c == CHAR_UNDERSCORE) == negated;
2998    
2999    #ifdef SUPPORT_UCP
3000      case PT_CLIST:
3001      p = PRIV(ucd_caseless_sets) + prop->caseset;
3002      for (;;)
3003        {
3004        if ((unsigned int)c < *p) return !negated;
3005        if ((unsigned int)c == *p++) return negated;
3006        }
3007      break;  /* Control never reaches here */
3008    #endif
3009    }    }
3010    
3011  return FALSE;  return FALSE;
3012  }  }
3013  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 3092  if (*ptr == CHAR_ASTERISK || *ptr == CHA Line 3110  if (*ptr == CHAR_ASTERISK || *ptr == CHA
3110    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3111      return FALSE;      return FALSE;
3112    
3113  /* Now compare the next item with the previous opcode. First, handle cases when  /* If the previous item is a character, get its value. */
 the next item is a character. */  
3114    
3115  if (next >= 0) switch(op_code)  if (op_code == OP_CHAR || op_code == OP_CHARI ||
3116        op_code == OP_NOT || op_code == OP_NOTI)
3117    {    {
   case OP_CHAR:  
3118  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
3119    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3120  #else  #else
3121    c = *previous;    c = *previous;
3122  #endif  #endif
3123    return c != next;    }
3124    
3125    /* For CHARI (caseless character) we must check the other case. If we have  /* Now compare the next item with the previous opcode. First, handle cases when
3126    Unicode property support, we can use it to test the other case of  the next item is a character. For a caseless UTF match, the next character may
3127    high-valued characters. */  have more than one other case; convert this to a special property. */
3128    
3129    case OP_CHARI:  if (next >= 0)
3130  #ifdef SUPPORT_UTF    {
   GETCHARTEST(c, previous);  
 #else  
   c = *previous;  
 #endif  
   if (c == next) return FALSE;  
 #ifdef SUPPORT_UTF  
   if (utf)  
     {  
     unsigned int othercase;  
     if (next < 128) othercase = cd->fcc[next]; else  
3131  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3132      othercase = UCD_OTHERCASE((unsigned int)next);    if (utf && (options & PCRE_CASELESS) != 0)
3133  #else      {
3134      othercase = NOTACHAR;      int ocs = UCD_CASESET(next);
3135  #endif      if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, FALSE);
     return (unsigned int)c != othercase;  
3136      }      }
   else  
 #endif  /* SUPPORT_UTF */  
   return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */  
   
   case OP_NOT:  
 #ifdef SUPPORT_UTF  
   GETCHARTEST(c, previous);  
 #else  
   c = *previous;  
3137  #endif  #endif
   return c == next;  
3138    
3139    case OP_NOTI:    switch(op_code)
3140        {
3141        case OP_CHAR:
3142        return c != next;
3143    
3144        /* For CHARI (caseless character) we must check the other case. If we have
3145        Unicode property support, we can use it to test the other case of
3146        high-valued characters. We know that next can have only one other case,
3147        because multi-other-case characters are dealt with above. */
3148    
3149        case OP_CHARI:
3150        if (c == next) return FALSE;
3151  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
3152    GETCHARTEST(c, previous);      if (utf)
3153          {
3154          unsigned int othercase;
3155          if (next < 128) othercase = cd->fcc[next]; else
3156    #ifdef SUPPORT_UCP
3157          othercase = UCD_OTHERCASE((unsigned int)next);
3158  #else  #else
3159    c = *previous;        othercase = NOTACHAR;
3160  #endif  #endif
3161    if (c == next) return TRUE;        return (unsigned int)c != othercase;
3162          }
3163        else
3164    #endif  /* SUPPORT_UTF */
3165        return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Not UTF */
3166    
3167        case OP_NOT:
3168        return c == next;
3169    
3170        case OP_NOTI:
3171        if (c == next) return TRUE;
3172  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
3173    if (utf)      if (utf)
3174      {        {
3175      unsigned int othercase;        unsigned int othercase;
3176      if (next < 128) othercase = cd->fcc[next]; else        if (next < 128) othercase = cd->fcc[next]; else
3177  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3178      othercase = UCD_OTHERCASE((unsigned int)next);        othercase = UCD_OTHERCASE((unsigned int)next);
3179  #else  #else
3180      othercase = NOTACHAR;        othercase = NOTACHAR;
3181  #endif  #endif
3182      return (unsigned int)c == othercase;        return (unsigned int)c == othercase;
3183      }        }
3184    else      else
3185  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
3186    return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */      return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Not UTF */
3187    
3188    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.      /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3189    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */      When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3190    
3191    case OP_DIGIT:      case OP_DIGIT:
3192    return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;      return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3193    
3194    case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
3195    return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;      return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3196    
3197    case OP_WHITESPACE:      case OP_WHITESPACE:
3198    return next > 255 || (cd->ctypes[next] & ctype_space) == 0;      return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3199    
3200    case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
3201    return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;      return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3202    
3203    case OP_WORDCHAR:      case OP_WORDCHAR:
3204    return next > 255 || (cd->ctypes[next] & ctype_word) == 0;      return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3205    
3206    case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
3207    return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;      return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3208    
3209    case OP_HSPACE:      case OP_HSPACE:
3210    case OP_NOT_HSPACE:      case OP_NOT_HSPACE:
3211    switch(next)      switch(next)
3212      {        {
3213      HSPACE_CASES:        HSPACE_CASES:
3214      return op_code == OP_NOT_HSPACE;        return op_code == OP_NOT_HSPACE;
   
     default:  
     return op_code != OP_NOT_HSPACE;  
     }  
3215    
3216    case OP_ANYNL:        default:
3217    case OP_VSPACE:        return op_code != OP_NOT_HSPACE;
3218    case OP_NOT_VSPACE:        }
3219    switch(next)  
3220      {      case OP_ANYNL:
3221      VSPACE_CASES:      case OP_VSPACE:
3222      return op_code == OP_NOT_VSPACE;      case OP_NOT_VSPACE:
3223        switch(next)
3224      default:        {
3225      return op_code != OP_NOT_VSPACE;        VSPACE_CASES:
3226      }        return op_code == OP_NOT_VSPACE;
3227    
3228          default:
3229          return op_code != OP_NOT_VSPACE;
3230          }
3231    
3232  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3233    case OP_PROP:      case OP_PROP:
3234    return check_char_prop(next, previous[0], previous[1], FALSE);      return check_char_prop(next, previous[0], previous[1], FALSE);
3235    
3236    case OP_NOTPROP:      case OP_NOTPROP:
3237    return check_char_prop(next, previous[0], previous[1], TRUE);      return check_char_prop(next, previous[0], previous[1], TRUE);
3238  #endif  #endif
3239    
3240    default:      default:
3241    return FALSE;      return FALSE;
3242        }
3243    }    }
3244    
   
3245  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3246  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3247  generated only when PCRE_UCP is *not* set, that is, when only ASCII  generated only when PCRE_UCP is *not* set, that is, when only ASCII
# Line 3230  switch(op_code) Line 3252  switch(op_code)
3252    {    {
3253    case OP_CHAR:    case OP_CHAR:
3254    case OP_CHARI:    case OP_CHARI:
 #ifdef SUPPORT_UTF  
   GETCHARTEST(c, previous);  
 #else  
   c = *previous;  
 #endif  
3255    switch(-next)    switch(-next)
3256      {      {
3257      case ESC_d:      case ESC_d:
# Line 3261  switch(op_code) Line 3278  switch(op_code)
3278        {        {
3279        HSPACE_CASES:        HSPACE_CASES:
3280        return -next != ESC_h;        return -next != ESC_h;
3281    
3282        default:        default:
3283        return -next == ESC_h;        return -next == ESC_h;
3284        }        }
# Line 3272  switch(op_code) Line 3289  switch(op_code)
3289        {        {
3290        VSPACE_CASES:        VSPACE_CASES:
3291        return -next != ESC_v;        return -next != ESC_v;
3292    
3293        default:        default:
3294        return -next == ESC_v;        return -next == ESC_v;
3295        }        }
# Line 3683  pcre_uchar utf_chars[6]; Line 3700  pcre_uchar utf_chars[6];
3700  BOOL utf = FALSE;  BOOL utf = FALSE;
3701  #endif  #endif
3702    
3703  /* Helper variables for OP_XCLASS opcode (for characters > 255). */  /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
3704    class_uchardata always so that it can be passed to add_to_class() always,
3705    though it will not be used in non-UTF 8-bit cases. This avoids having to supply
3706    alternative calls for the different cases. */
3707    
3708    pcre_uchar *class_uchardata;
3709  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3710  BOOL xclass;  BOOL xclass;
 pcre_uchar *class_uchardata;  
3711  pcre_uchar *class_uchardata_base;  pcre_uchar *class_uchardata_base;
3712  #endif  #endif
3713    
# Line 4133  for (;; ptr++) Line 4153  for (;; ptr++)
4153          alpha. This relies on the fact that the class table starts with          alpha. This relies on the fact that the class table starts with
4154          alpha, lower, upper as the first 3 entries. */          alpha, lower, upper as the first 3 entries. */
4155    
4156          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4157            posix_class = 0;            posix_class = 0;
4158    
4159          /* When PCRE_UCP is set, some of the POSIX classes are converted to          /* When PCRE_UCP is set, some of the POSIX classes are converted to
# Line 4476  for (;; ptr++) Line 4496  for (;; ptr++)
4496    
4497          if (negate_class)          if (negate_class)
4498            {            {
4499    #ifdef SUPPORT_UCP
4500              int d;
4501    #endif
4502            if (firstchar == REQ_UNSET) firstchar = REQ_NONE;            if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4503            zerofirstchar = firstchar;            zerofirstchar = firstchar;
4504            *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
4505              /* For caseless UTF-8 mode when UCP support is available, check
4506              whether this character has more than one other case. If so, generate
4507              a special OP_NOTPROP item instead of OP_NOTI. */
4508    
4509    #ifdef SUPPORT_UCP
4510              if (utf && (options & PCRE_CASELESS) != 0 &&
4511                  (d = UCD_CASESET(c)) != 0)
4512                {
4513                *code++ = OP_NOTPROP;
4514                *code++ = PT_CLIST;
4515                *code++ = d;
4516                }
4517              else
4518    #endif
4519              /* Char has only one other case, or UCP not available */
4520    
4521                {
4522                *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4523  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4524            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)              if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4525              code += PRIV(ord2utf)(c, code);                code += PRIV(ord2utf)(c, code);
4526            else              else
4527  #endif  #endif
4528              *code++ = c;                *code++ = c;
4529            goto NOT_CHAR;              }
4530    
4531              /* We are finished with this character class */
4532    
4533              goto END_CLASS;
4534            }            }
4535    
4536          /* For a single, positive character, get the value into mcbuffer, and          /* For a single, positive character, get the value into mcbuffer, and
# Line 4601  for (;; ptr++) Line 4646  for (;; ptr++)
4646        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4647        }        }
4648      code += 32 / sizeof(pcre_uchar);      code += 32 / sizeof(pcre_uchar);
4649      NOT_CHAR:  
4650        END_CLASS:
4651      break;      break;
4652    
4653    
# Line 6836  for (;; ptr++) Line 6882  for (;; ptr++)
6882    
6883      ONE_CHAR:      ONE_CHAR:
6884      previous = code;      previous = code;
6885    
6886        /* For caseless UTF-8 mode when UCP support is available, check whether
6887        this character has more than one other case. If so, generate a special
6888        OP_PROP item instead of OP_CHARI. */
6889    
6890    #ifdef SUPPORT_UCP
6891        if (utf && (options & PCRE_CASELESS) != 0)
6892          {
6893          GETCHAR(c, mcbuffer);
6894          if ((c = UCD_CASESET(c)) != 0)
6895            {
6896            *code++ = OP_PROP;
6897            *code++ = PT_CLIST;
6898            *code++ = c;
6899            if (firstchar == REQ_UNSET) firstchar = zerofirstchar = REQ_NONE;
6900            break;
6901            }
6902          }
6903    #endif
6904    
6905        /* Caseful matches, or not one of the multicase characters. */
6906    
6907      *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;      *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
6908      for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];      for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6909    

Legend:
Removed from v.1045  
changed lines
  Added in v.1047

  ViewVC Help
Powered by ViewVC 1.1.5