/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 903 by ph10, Sat Jan 21 16:37:17 2012 UTC | revision 932 by ph10, Fri Feb 24 18:54:43 2012 UTC
# Line 2225  for (;;) Line 2225  for (;;)
2225        {        {
2226        case OP_CHAR:        case OP_CHAR:
2227        case OP_CHARI:        case OP_CHARI:
2228          case OP_NOT:
2229          case OP_NOTI:
2230        case OP_EXACT:        case OP_EXACT:
2231        case OP_EXACTI:        case OP_EXACTI:
2232          case OP_NOTEXACT:
2233          case OP_NOTEXACTI:
2234        case OP_UPTO:        case OP_UPTO:
2235        case OP_UPTOI:        case OP_UPTOI:
2236          case OP_NOTUPTO:
2237          case OP_NOTUPTOI:
2238        case OP_MINUPTO:        case OP_MINUPTO:
2239        case OP_MINUPTOI:        case OP_MINUPTOI:
2240          case OP_NOTMINUPTO:
2241          case OP_NOTMINUPTOI:
2242        case OP_POSUPTO:        case OP_POSUPTO:
2243        case OP_POSUPTOI:        case OP_POSUPTOI:
2244          case OP_NOTPOSUPTO:
2245          case OP_NOTPOSUPTOI:
2246        case OP_STAR:        case OP_STAR:
2247        case OP_STARI:        case OP_STARI:
2248          case OP_NOTSTAR:
2249          case OP_NOTSTARI:
2250        case OP_MINSTAR:        case OP_MINSTAR:
2251        case OP_MINSTARI:        case OP_MINSTARI:
2252          case OP_NOTMINSTAR:
2253          case OP_NOTMINSTARI:
2254        case OP_POSSTAR:        case OP_POSSTAR:
2255        case OP_POSSTARI:        case OP_POSSTARI:
2256          case OP_NOTPOSSTAR:
2257          case OP_NOTPOSSTARI:
2258        case OP_PLUS:        case OP_PLUS:
2259        case OP_PLUSI:        case OP_PLUSI:
2260          case OP_NOTPLUS:
2261          case OP_NOTPLUSI:
2262        case OP_MINPLUS:        case OP_MINPLUS:
2263        case OP_MINPLUSI:        case OP_MINPLUSI:
2264          case OP_NOTMINPLUS:
2265          case OP_NOTMINPLUSI:
2266        case OP_POSPLUS:        case OP_POSPLUS:
2267        case OP_POSPLUSI:        case OP_POSPLUSI:
2268          case OP_NOTPOSPLUS:
2269          case OP_NOTPOSPLUSI:
2270        case OP_QUERY:        case OP_QUERY:
2271        case OP_QUERYI:        case OP_QUERYI:
2272          case OP_NOTQUERY:
2273          case OP_NOTQUERYI:
2274        case OP_MINQUERY:        case OP_MINQUERY:
2275        case OP_MINQUERYI:        case OP_MINQUERYI:
2276          case OP_NOTMINQUERY:
2277          case OP_NOTMINQUERYI:
2278        case OP_POSQUERY:        case OP_POSQUERY:
2279        case OP_POSQUERYI:        case OP_POSQUERYI:
2280          case OP_NOTPOSQUERY:
2281          case OP_NOTPOSQUERYI:
2282        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2283        break;        break;
2284        }        }
# Line 3067  if (next >= 0) switch(op_code) Line 3095  if (next >= 0) switch(op_code)
3095      }      }
3096    else    else
3097  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
3098    return (c != TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */    return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
   
   /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These  
   opcodes are not used for multi-byte characters, because they are coded using  
   an XCLASS instead. */  
3099    
3100    case OP_NOT:    case OP_NOT:
3101    return (c = *previous) == next;  #ifdef SUPPORT_UTF
3102      GETCHARTEST(c, previous);
3103    #else
3104      c = *previous;
3105    #endif
3106      return c == next;
3107    
3108    case OP_NOTI:    case OP_NOTI:
3109    if ((c = *previous) == next) return TRUE;  #ifdef SUPPORT_UTF
3110      GETCHARTEST(c, previous);
3111    #else
3112      c = *previous;
3113    #endif
3114      if (c == next) return TRUE;
3115  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
3116    if (utf)    if (utf)
3117      {      {
3118      unsigned int othercase;      unsigned int othercase;
3119      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
3120  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3121      othercase = UCD_OTHERCASE(next);      othercase = UCD_OTHERCASE((unsigned int)next);
3122  #else  #else
3123      othercase = NOTACHAR;      othercase = NOTACHAR;
3124  #endif  #endif
# Line 3092  if (next >= 0) switch(op_code) Line 3126  if (next >= 0) switch(op_code)
3126      }      }
3127    else    else
3128  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
3129    return (c == TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */    return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3130    
3131    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3132    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
# Line 4482  for (;; ptr++) Line 4516  for (;; ptr++)
4516        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4517    
4518        /* Only the value of 1 matters for class_single_char. */        /* Only the value of 1 matters for class_single_char. */
4519    
4520        if (class_single_char < 2) class_single_char++;        if (class_single_char < 2) class_single_char++;
4521    
4522        /* If class_charcount is 1, we saw precisely one character. As long as        /* If class_charcount is 1, we saw precisely one character. As long as
4523        there were no negated characters >= 128 and there was no use of \p or \P,        there was no use of \p or \P, in other words, no use of any XCLASS
4524        in other words, no use of any XCLASS features, we can optimize.        features, we can optimize.
   
       In UTF-8 mode, we can optimize the negative case only if there were no  
       characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR  
       operate on single-bytes characters only. This is an historical hangover.  
       Maybe one day we can tidy these opcodes to handle multi-byte characters.  
4525    
4526        The optimization throws away the bit map. We turn the item into a        The optimization throws away the bit map. We turn the item into a
4527        1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.        1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4528        Note that OP_NOT[I] does not support multibyte characters. In the positive        In the positive case, it can cause firstchar to be set. Otherwise, there
4529        case, it can cause firstchar to be set. Otherwise, there can be no first        can be no first char if this item is first, whatever repeat count may
4530        char if this item is first, whatever repeat count may follow. In the case        follow. In the case of reqchar, save the previous value for reinstating. */
       of reqchar, save the previous value for reinstating. */  
4531    
 #ifdef SUPPORT_UTF  
       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET  
         && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))  
 #else  
4532        if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)        if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 #endif  
4533          {          {
4534          ptr++;          ptr++;
4535          zeroreqchar = reqchar;          zeroreqchar = reqchar;
4536    
         /* The OP_NOT[I] opcodes work on single characters only. */  
   
4537          if (negate_class)          if (negate_class)
4538            {            {
4539            if (firstchar == REQ_UNSET) firstchar = REQ_NONE;            if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4540            zerofirstchar = firstchar;            zerofirstchar = firstchar;
4541            *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;            *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4542            *code++ = c;  #ifdef SUPPORT_UTF
4543              if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4544                code += PRIV(ord2utf)(c, code);
4545              else
4546    #endif
4547                *code++ = c;
4548            goto NOT_CHAR;            goto NOT_CHAR;
4549            }            }
4550    
# Line 4571  for (;; ptr++) Line 4598  for (;; ptr++)
4598  #endif  #endif
4599            {            {
4600            unsigned int othercase;            unsigned int othercase;
4601            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((int)(othercase = UCD_OTHERCASE(c)) != c)
4602              {              {
4603              *class_uchardata++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4604              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
# Line 4775  for (;; ptr++) Line 4802  for (;; ptr++)
4802    
4803      /* Now handle repetition for the different types of item. */      /* Now handle repetition for the different types of item. */
4804    
4805      /* If previous was a character match, abolish the item and generate a      /* If previous was a character or negated character match, abolish the item
4806      repeat item instead. If a char item has a minumum of more than one, ensure      and generate a repeat item instead. If a char item has a minimum of more
4807      that it is set in reqchar - it might not be if a sequence such as x{3} is      than one, ensure that it is set in reqchar - it might not be if a sequence
4808      the first thing in a branch because the x will have gone into firstchar      such as x{3} is the first thing in a branch because the x will have gone
4809      instead.  */      into firstchar instead.  */
4810    
4811      if (*previous == OP_CHAR || *previous == OP_CHARI)      if (*previous == OP_CHAR || *previous == OP_CHARI
4812        {          || *previous == OP_NOT || *previous == OP_NOTI)
4813        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;        {
4814          switch (*previous)
4815            {
4816            default: /* Make compiler happy. */
4817            case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
4818            case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4819            case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
4820            case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
4821            }
4822    
4823        /* Deal with UTF characters that take up more than one character. It's        /* Deal with UTF characters that take up more than one character. It's
4824        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
# Line 4806  for (;; ptr++) Line 4841  for (;; ptr++)
4841        with UTF disabled, or for a single character UTF character. */        with UTF disabled, or for a single character UTF character. */
4842          {          {
4843          c = code[-1];          c = code[-1];
4844          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;          if (*previous <= OP_CHARI && repeat_min > 1)
4845              reqchar = c | req_caseopt | cd->req_varyopt;
4846          }          }
4847    
4848        /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
# Line 4825  for (;; ptr++) Line 4861  for (;; ptr++)
4861        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4862        }        }
4863    
     /* If previous was a single negated character ([^a] or similar), we use  
     one of the special opcodes, replacing it. The code is shared with single-  
     character repeats by setting opt_type to add a suitable offset into  
     repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI  
     are currently used only for single-byte chars. */  
   
     else if (*previous == OP_NOT || *previous == OP_NOTI)  
       {  
       op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;  
       c = previous[1];  
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
       goto OUTPUT_SINGLE_REPEAT;  
       }  
   
4864      /* If previous was a character type match (\d or similar), abolish it and      /* If previous was a character type match (\d or similar), abolish it and
4865      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
4866      repeats by setting op_type to add a suitable offset into repeat_type. Note      repeats by setting op_type to add a suitable offset into repeat_type. Note
# Line 6836  for (;; ptr++) Line 6852  for (;; ptr++)
6852        /* For the rest (including \X when Unicode properties are supported), we        /* For the rest (including \X when Unicode properties are supported), we
6853        can obtain the OP value by negating the escape value in the default        can obtain the OP value by negating the escape value in the default
6854        situation when PCRE_UCP is not set. When it *is* set, we substitute        situation when PCRE_UCP is not set. When it *is* set, we substitute
6855        Unicode property tests. */        Unicode property tests. Note that \b and \B do a one-character
6856          lookbehind. */
6857    
6858        else        else
6859          {          {
6860            if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
6861              cd->max_lookbehind = 1;
6862  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
6863          if (-c >= ESC_DU && -c <= ESC_wu)          if (-c >= ESC_DU && -c <= ESC_wu)
6864            {            {
# Line 7147  for (;;) Line 7166  for (;;)
7166          *ptrptr = ptr;          *ptrptr = ptr;
7167          return FALSE;          return FALSE;
7168          }          }
7169        else { PUT(reverse_count, 0, fixed_length); }        else
7170            {
7171            if (fixed_length > cd->max_lookbehind)
7172              cd->max_lookbehind = fixed_length;
7173            PUT(reverse_count, 0, fixed_length);
7174            }
7175        }        }
7176      }      }
7177    
# Line 7817  cd->start_pattern = (const pcre_uchar *) Line 7841  cd->start_pattern = (const pcre_uchar *)
7841  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7842  cd->req_varyopt = 0;  cd->req_varyopt = 0;
7843  cd->assert_depth = 0;  cd->assert_depth = 0;
7844    cd->max_lookbehind = 0;
7845  cd->external_options = options;  cd->external_options = options;
7846  cd->external_flags = 0;  cd->external_flags = 0;
7847  cd->open_caps = NULL;  cd->open_caps = NULL;
# Line 7867  re->magic_number = MAGIC_NUMBER; Line 7892  re->magic_number = MAGIC_NUMBER;
7892  re->size = (int)size;  re->size = (int)size;
7893  re->options = cd->external_options;  re->options = cd->external_options;
7894  re->flags = cd->external_flags;  re->flags = cd->external_flags;
 re->dummy1 = 0;  
7895  re->first_char = 0;  re->first_char = 0;
7896  re->req_char = 0;  re->req_char = 0;
7897  re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);  re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
# Line 7887  field; this time it's used for rememberi Line 7911  field; this time it's used for rememberi
7911  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7912  cd->assert_depth = 0;  cd->assert_depth = 0;
7913  cd->bracount = 0;  cd->bracount = 0;
7914    cd->max_lookbehind = 0;
7915  cd->names_found = 0;  cd->names_found = 0;
7916  cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7917  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
# Line 7908  code = (pcre_uchar *)codestart; Line 7933  code = (pcre_uchar *)codestart;
7933    &firstchar, &reqchar, NULL, cd, NULL);    &firstchar, &reqchar, NULL, cd, NULL);
7934  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
7935  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
7936    re->max_lookbehind = cd->max_lookbehind;
7937  re->flags = cd->external_flags | PCRE_MODE;  re->flags = cd->external_flags | PCRE_MODE;
7938    
7939  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
# Line 7995  if (cd->check_lookbehind) Line 8021  if (cd->check_lookbehind)
8021                      (fixed_length == -4)? ERR70 : ERR25;                      (fixed_length == -4)? ERR70 : ERR25;
8022          break;          break;
8023          }          }
8024          if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
8025        PUT(cc, 1, fixed_length);        PUT(cc, 1, fixed_length);
8026        }        }
8027      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 8134  if ((re->flags & PCRE_REQCHSET) != 0) Line 8161  if ((re->flags & PCRE_REQCHSET) != 0)
8161    }    }
8162    
8163  #ifdef COMPILE_PCRE8  #ifdef COMPILE_PCRE8
8164  pcre_printint(re, stdout, TRUE);  pcre_printint((pcre *)re, stdout, TRUE);
8165  #else  #else
8166  pcre16_printint(re, stdout, TRUE);  pcre16_printint((pcre *)re, stdout, TRUE);
8167  #endif  #endif
8168    
8169  /* This check is done here in the debugging case so that the code that  /* This check is done here in the debugging case so that the code that

Legend:
Removed from v.903  
changed lines
  Added in v.932

  ViewVC Help
Powered by ViewVC 1.1.5