/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 852 by zherczeg, Thu Jan 5 19:18:12 2012 UTC | revision 994 by ph10, Tue Jul 10 14:29:26 2012 UTC
# Line 488  static const char error_texts[] = Line 488  static const char error_texts[] =
488    "\\N is not supported in a class\0"    "\\N is not supported in a class\0"
489    "too many forward references\0"    "too many forward references\0"
490    "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"    "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
491    "invalid UTF-16 string\0"    "invalid UTF-16 string\0"
492      /* 75 */
493      "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
494      "character value in \\u.... sequence is too large\0"
495    ;    ;
496    
497  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 829  else Line 832  else
832            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
833  #endif  #endif
834            }            }
835    
836    #ifdef COMPILE_PCRE8
837            if (c > (utf ? 0x10ffff : 0xff))
838    #else
839    #ifdef COMPILE_PCRE16
840            if (c > (utf ? 0x10ffff : 0xffff))
841    #endif
842    #endif
843              {
844              *errorcodeptr = ERR76;
845              }
846            else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
847          }          }
848        }        }
849      else      else
# Line 998  else Line 1013  else
1013      c -= CHAR_0;      c -= CHAR_0;
1014      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1015          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
1016  #ifdef COMPILE_PCRE8  #ifdef COMPILE_PCRE8
1017      if (!utf && c > 0xff) *errorcodeptr = ERR51;      if (!utf && c > 0xff) *errorcodeptr = ERR51;
1018  #endif  #endif
1019      break;      break;
1020    
1021      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 2225  for (;;) Line 2240  for (;;)
2240        {        {
2241        case OP_CHAR:        case OP_CHAR:
2242        case OP_CHARI:        case OP_CHARI:
2243          case OP_NOT:
2244          case OP_NOTI:
2245        case OP_EXACT:        case OP_EXACT:
2246        case OP_EXACTI:        case OP_EXACTI:
2247          case OP_NOTEXACT:
2248          case OP_NOTEXACTI:
2249        case OP_UPTO:        case OP_UPTO:
2250        case OP_UPTOI:        case OP_UPTOI:
2251          case OP_NOTUPTO:
2252          case OP_NOTUPTOI:
2253        case OP_MINUPTO:        case OP_MINUPTO:
2254        case OP_MINUPTOI:        case OP_MINUPTOI:
2255          case OP_NOTMINUPTO:
2256          case OP_NOTMINUPTOI:
2257        case OP_POSUPTO:        case OP_POSUPTO:
2258        case OP_POSUPTOI:        case OP_POSUPTOI:
2259          case OP_NOTPOSUPTO:
2260          case OP_NOTPOSUPTOI:
2261        case OP_STAR:        case OP_STAR:
2262        case OP_STARI:        case OP_STARI:
2263          case OP_NOTSTAR:
2264          case OP_NOTSTARI:
2265        case OP_MINSTAR:        case OP_MINSTAR:
2266        case OP_MINSTARI:        case OP_MINSTARI:
2267          case OP_NOTMINSTAR:
2268          case OP_NOTMINSTARI:
2269        case OP_POSSTAR:        case OP_POSSTAR:
2270        case OP_POSSTARI:        case OP_POSSTARI:
2271          case OP_NOTPOSSTAR:
2272          case OP_NOTPOSSTARI:
2273        case OP_PLUS:        case OP_PLUS:
2274        case OP_PLUSI:        case OP_PLUSI:
2275          case OP_NOTPLUS:
2276          case OP_NOTPLUSI:
2277        case OP_MINPLUS:        case OP_MINPLUS:
2278        case OP_MINPLUSI:        case OP_MINPLUSI:
2279          case OP_NOTMINPLUS:
2280          case OP_NOTMINPLUSI:
2281        case OP_POSPLUS:        case OP_POSPLUS:
2282        case OP_POSPLUSI:        case OP_POSPLUSI:
2283          case OP_NOTPOSPLUS:
2284          case OP_NOTPOSPLUSI:
2285        case OP_QUERY:        case OP_QUERY:
2286        case OP_QUERYI:        case OP_QUERYI:
2287          case OP_NOTQUERY:
2288          case OP_NOTQUERYI:
2289        case OP_MINQUERY:        case OP_MINQUERY:
2290        case OP_MINQUERYI:        case OP_MINQUERYI:
2291          case OP_NOTMINQUERY:
2292          case OP_NOTMINQUERYI:
2293        case OP_POSQUERY:        case OP_POSQUERY:
2294        case OP_POSQUERYI:        case OP_POSQUERYI:
2295          case OP_NOTPOSQUERY:
2296          case OP_NOTPOSQUERYI:
2297        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2298        break;        break;
2299        }        }
# Line 3067  if (next >= 0) switch(op_code) Line 3110  if (next >= 0) switch(op_code)
3110      }      }
3111    else    else
3112  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
3113    return (c != TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */    return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
   
   /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These  
   opcodes are not used for multi-byte characters, because they are coded using  
   an XCLASS instead. */  
3114    
3115    case OP_NOT:    case OP_NOT:
3116    return (c = *previous) == next;  #ifdef SUPPORT_UTF
3117      GETCHARTEST(c, previous);
3118    #else
3119      c = *previous;
3120    #endif
3121      return c == next;
3122    
3123    case OP_NOTI:    case OP_NOTI:
3124    if ((c = *previous) == next) return TRUE;  #ifdef SUPPORT_UTF
3125      GETCHARTEST(c, previous);
3126    #else
3127      c = *previous;
3128    #endif
3129      if (c == next) return TRUE;
3130  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
3131    if (utf)    if (utf)
3132      {      {
3133      unsigned int othercase;      unsigned int othercase;
3134      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
3135  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3136      othercase = UCD_OTHERCASE(next);      othercase = UCD_OTHERCASE((unsigned int)next);
3137  #else  #else
3138      othercase = NOTACHAR;      othercase = NOTACHAR;
3139  #endif  #endif
# Line 3092  if (next >= 0) switch(op_code) Line 3141  if (next >= 0) switch(op_code)
3141      }      }
3142    else    else
3143  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
3144    return (c == TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */    return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3145    
3146    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3147    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3148    
3149    case OP_DIGIT:    case OP_DIGIT:
3150    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3151    
3152    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3153    return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3154    
3155    case OP_WHITESPACE:    case OP_WHITESPACE:
3156    return next > 127 || (cd->ctypes[next] & ctype_space) == 0;    return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3157    
3158    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3159    return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3160    
3161    case OP_WORDCHAR:    case OP_WORDCHAR:
3162    return next > 127 || (cd->ctypes[next] & ctype_word) == 0;    return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3163    
3164    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3165    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3166    
3167    case OP_HSPACE:    case OP_HSPACE:
3168    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
# Line 3191  switch(op_code) Line 3240  switch(op_code)
3240    switch(-next)    switch(-next)
3241      {      {
3242      case ESC_d:      case ESC_d:
3243      return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;      return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3244    
3245      case ESC_D:      case ESC_D:
3246      return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3247    
3248      case ESC_s:      case ESC_s:
3249      return c > 127 || (cd->ctypes[c] & ctype_space) == 0;      return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3250    
3251      case ESC_S:      case ESC_S:
3252      return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3253    
3254      case ESC_w:      case ESC_w:
3255      return c > 127 || (cd->ctypes[c] & ctype_word) == 0;      return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3256    
3257      case ESC_W:      case ESC_W:
3258      return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3259    
3260      case ESC_h:      case ESC_h:
3261      case ESC_H:      case ESC_H:
# Line 3315  switch(op_code) Line 3364  switch(op_code)
3364    return next == -ESC_d;    return next == -ESC_d;
3365    
3366    case OP_WHITESPACE:    case OP_WHITESPACE:
3367    return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;    return next == -ESC_S || next == -ESC_d || next == -ESC_w;
3368    
3369    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3370    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
3371    
3372    case OP_HSPACE:    case OP_HSPACE:
3373    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
# Line 4482  for (;; ptr++) Line 4531  for (;; ptr++)
4531        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4532    
4533        /* Only the value of 1 matters for class_single_char. */        /* Only the value of 1 matters for class_single_char. */
4534    
4535        if (class_single_char < 2) class_single_char++;        if (class_single_char < 2) class_single_char++;
4536    
4537        /* If class_charcount is 1, we saw precisely one character. As long as        /* If class_charcount is 1, we saw precisely one character. As long as
4538        there were no negated characters >= 128 and there was no use of \p or \P,        there was no use of \p or \P, in other words, no use of any XCLASS
4539        in other words, no use of any XCLASS features, we can optimize.        features, we can optimize.
   
       In UTF-8 mode, we can optimize the negative case only if there were no  
       characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR  
       operate on single-bytes characters only. This is an historical hangover.  
       Maybe one day we can tidy these opcodes to handle multi-byte characters.  
4540    
4541        The optimization throws away the bit map. We turn the item into a        The optimization throws away the bit map. We turn the item into a
4542        1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.        1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4543        Note that OP_NOT[I] does not support multibyte characters. In the positive        In the positive case, it can cause firstchar to be set. Otherwise, there
4544        case, it can cause firstchar to be set. Otherwise, there can be no first        can be no first char if this item is first, whatever repeat count may
4545        char if this item is first, whatever repeat count may follow. In the case        follow. In the case of reqchar, save the previous value for reinstating. */
       of reqchar, save the previous value for reinstating. */  
4546    
 #ifdef SUPPORT_UTF  
       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET  
         && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))  
 #else  
4547        if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)        if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 #endif  
4548          {          {
4549          ptr++;          ptr++;
4550          zeroreqchar = reqchar;          zeroreqchar = reqchar;
4551    
         /* The OP_NOT[I] opcodes work on single characters only. */  
   
4552          if (negate_class)          if (negate_class)
4553            {            {
4554            if (firstchar == REQ_UNSET) firstchar = REQ_NONE;            if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4555            zerofirstchar = firstchar;            zerofirstchar = firstchar;
4556            *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;            *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4557            *code++ = c;  #ifdef SUPPORT_UTF
4558              if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4559                code += PRIV(ord2utf)(c, code);
4560              else
4561    #endif
4562                *code++ = c;
4563            goto NOT_CHAR;            goto NOT_CHAR;
4564            }            }
4565    
# Line 4571  for (;; ptr++) Line 4613  for (;; ptr++)
4613  #endif  #endif
4614            {            {
4615            unsigned int othercase;            unsigned int othercase;
4616            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((int)(othercase = UCD_OTHERCASE(c)) != c)
4617              {              {
4618              *class_uchardata++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4619              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
# Line 4775  for (;; ptr++) Line 4817  for (;; ptr++)
4817    
4818      /* Now handle repetition for the different types of item. */      /* Now handle repetition for the different types of item. */
4819    
4820      /* If previous was a character match, abolish the item and generate a      /* If previous was a character or negated character match, abolish the item
4821      repeat item instead. If a char item has a minumum of more than one, ensure      and generate a repeat item instead. If a char item has a minimum of more
4822      that it is set in reqchar - it might not be if a sequence such as x{3} is      than one, ensure that it is set in reqchar - it might not be if a sequence
4823      the first thing in a branch because the x will have gone into firstchar      such as x{3} is the first thing in a branch because the x will have gone
4824      instead.  */      into firstchar instead.  */
4825    
4826      if (*previous == OP_CHAR || *previous == OP_CHARI)      if (*previous == OP_CHAR || *previous == OP_CHARI
4827        {          || *previous == OP_NOT || *previous == OP_NOTI)
4828        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;        {
4829          switch (*previous)
4830            {
4831            default: /* Make compiler happy. */
4832            case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
4833            case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4834            case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
4835            case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
4836            }
4837    
4838        /* Deal with UTF characters that take up more than one character. It's        /* Deal with UTF characters that take up more than one character. It's
4839        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
# Line 4806  for (;; ptr++) Line 4856  for (;; ptr++)
4856        with UTF disabled, or for a single character UTF character. */        with UTF disabled, or for a single character UTF character. */
4857          {          {
4858          c = code[-1];          c = code[-1];
4859          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;          if (*previous <= OP_CHARI && repeat_min > 1)
4860              reqchar = c | req_caseopt | cd->req_varyopt;
4861          }          }
4862    
4863        /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
# Line 4825  for (;; ptr++) Line 4876  for (;; ptr++)
4876        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4877        }        }
4878    
     /* If previous was a single negated character ([^a] or similar), we use  
     one of the special opcodes, replacing it. The code is shared with single-  
     character repeats by setting opt_type to add a suitable offset into  
     repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI  
     are currently used only for single-byte chars. */  
   
     else if (*previous == OP_NOT || *previous == OP_NOTI)  
       {  
       op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;  
       c = previous[1];  
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
       goto OUTPUT_SINGLE_REPEAT;  
       }  
   
4879      /* If previous was a character type match (\d or similar), abolish it and      /* If previous was a character type match (\d or similar), abolish it and
4880      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
4881      repeats by setting op_type to add a suitable offset into repeat_type. Note      repeats by setting op_type to add a suitable offset into repeat_type. Note
# Line 5585  for (;; ptr++) Line 5616  for (;; ptr++)
5616          arg = ++ptr;          arg = ++ptr;
5617          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5618          arglen = (int)(ptr - arg);          arglen = (int)(ptr - arg);
5619            if (arglen > (int)MAX_MARK)
5620              {
5621              *errorcodeptr = ERR75;
5622              goto FAILED;
5623              }
5624          }          }
5625    
5626        if (*ptr != CHAR_RIGHT_PARENTHESIS)        if (*ptr != CHAR_RIGHT_PARENTHESIS)
# Line 5600  for (;; ptr++) Line 5636  for (;; ptr++)
5636          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
5637              STRNCMP_UC_C8(name, vn, namelen) == 0)              STRNCMP_UC_C8(name, vn, namelen) == 0)
5638            {            {
5639              int setverb;
5640    
5641            /* Check for open captures before ACCEPT and convert it to            /* Check for open captures before ACCEPT and convert it to
5642            ASSERT_ACCEPT if in an assertion. */            ASSERT_ACCEPT if in an assertion. */
5643    
# Line 5617  for (;; ptr++) Line 5655  for (;; ptr++)
5655                *code++ = OP_CLOSE;                *code++ = OP_CLOSE;
5656                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
5657                }                }
5658              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;              setverb = *code++ =
5659                  (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5660    
5661              /* Do not set firstchar after *ACCEPT */              /* Do not set firstchar after *ACCEPT */
5662              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
# Line 5632  for (;; ptr++) Line 5671  for (;; ptr++)
5671                *errorcodeptr = ERR66;                *errorcodeptr = ERR66;
5672                goto FAILED;                goto FAILED;
5673                }                }
5674              *code = verbs[i].op;              setverb = *code++ = verbs[i].op;
             if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;  
5675              }              }
5676    
5677            else            else
# Line 5643  for (;; ptr++) Line 5681  for (;; ptr++)
5681                *errorcodeptr = ERR59;                *errorcodeptr = ERR59;
5682                goto FAILED;                goto FAILED;
5683                }                }
5684              *code = verbs[i].op_arg;              setverb = *code++ = verbs[i].op_arg;
             if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;  
5685              *code++ = arglen;              *code++ = arglen;
5686              memcpy(code, arg, IN_UCHARS(arglen));              memcpy(code, arg, IN_UCHARS(arglen));
5687              code += arglen;              code += arglen;
5688              *code++ = 0;              *code++ = 0;
5689              }              }
5690    
5691              switch (setverb)
5692                {
5693                case OP_THEN:
5694                case OP_THEN_ARG:
5695                cd->external_flags |= PCRE_HASTHEN;
5696                break;
5697    
5698                case OP_PRUNE:
5699                case OP_PRUNE_ARG:
5700                case OP_SKIP:
5701                case OP_SKIP_ARG:
5702                cd->had_pruneorskip = TRUE;
5703                break;
5704                }
5705    
5706            break;  /* Found verb, exit loop */            break;  /* Found verb, exit loop */
5707            }            }
5708    
# Line 6836  for (;; ptr++) Line 6888  for (;; ptr++)
6888        /* For the rest (including \X when Unicode properties are supported), we        /* For the rest (including \X when Unicode properties are supported), we
6889        can obtain the OP value by negating the escape value in the default        can obtain the OP value by negating the escape value in the default
6890        situation when PCRE_UCP is not set. When it *is* set, we substitute        situation when PCRE_UCP is not set. When it *is* set, we substitute
6891        Unicode property tests. */        Unicode property tests. Note that \b and \B do a one-character
6892          lookbehind. */
6893    
6894        else        else
6895          {          {
6896            if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
6897              cd->max_lookbehind = 1;
6898  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
6899          if (-c >= ESC_DU && -c <= ESC_wu)          if (-c >= ESC_DU && -c <= ESC_wu)
6900            {            {
# Line 7147  for (;;) Line 7202  for (;;)
7202          *ptrptr = ptr;          *ptrptr = ptr;
7203          return FALSE;          return FALSE;
7204          }          }
7205        else { PUT(reverse_count, 0, fixed_length); }        else
7206            {
7207            if (fixed_length > cd->max_lookbehind)
7208              cd->max_lookbehind = fixed_length;
7209            PUT(reverse_count, 0, fixed_length);
7210            }
7211        }        }
7212      }      }
7213    
# Line 7279  and the highest back reference was great Line 7339  and the highest back reference was great
7339  However, by keeping a bitmap of the first 31 back references, we can catch some  However, by keeping a bitmap of the first 31 back references, we can catch some
7340  of the more common cases more precisely.  of the more common cases more precisely.
7341    
7342    ... A second exception is when the .* appears inside an atomic group, because
7343    this prevents the number of characters it matches from being adjusted.
7344    
7345  Arguments:  Arguments:
7346    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
7347    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
7348                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
7349                    the less precise approach                    the less precise approach
7350    backref_map    the back reference bitmap    cd             points to the compile data block
7351      atomcount      atomic group level
7352    
7353  Returns:     TRUE or FALSE  Returns:     TRUE or FALSE
7354  */  */
7355    
7356  static BOOL  static BOOL
7357  is_anchored(register const pcre_uchar *code, unsigned int bracket_map,  is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
7358    unsigned int backref_map)    compile_data *cd, int atomcount)
7359  {  {
7360  do {  do {
7361     const pcre_uchar *scode = first_significant_code(     const pcre_uchar *scode = first_significant_code(
# Line 7303  do { Line 7367  do {
7367     if (op == OP_BRA  || op == OP_BRAPOS ||     if (op == OP_BRA  || op == OP_BRAPOS ||
7368         op == OP_SBRA || op == OP_SBRAPOS)         op == OP_SBRA || op == OP_SBRAPOS)
7369       {       {
7370       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
7371       }       }
7372    
7373     /* Capturing brackets */     /* Capturing brackets */
# Line 7313  do { Line 7377  do {
7377       {       {
7378       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
7379       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7380       if (!is_anchored(scode, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
7381         }
7382    
7383       /* Positive forward assertions and conditions */
7384    
7385       else if (op == OP_ASSERT || op == OP_COND)
7386         {
7387         if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
7388       }       }
7389    
7390     /* Other brackets */     /* Atomic groups */
7391    
7392     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||     else if (op == OP_ONCE || op == OP_ONCE_NC)
             op == OP_COND)  
7393       {       {
7394       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
7395           return FALSE;
7396       }       }
7397    
7398     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7399     it isn't in brackets that are or may be referenced. */     it isn't in brackets that are or may be referenced or inside an atomic
7400       group. */
7401    
7402     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7403               op == OP_TYPEPOSSTAR))               op == OP_TYPEPOSSTAR))
7404       {       {
7405       if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)       if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
7406             atomcount > 0 || cd->had_pruneorskip)
7407         return FALSE;         return FALSE;
7408       }       }
7409    
7410     /* Check for explicit anchoring */     /* Check for explicit anchoring */
7411    
7412     else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;     else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7413    
7414     code += GET(code, 1);     code += GET(code, 1);
7415     }     }
7416  while (*code == OP_ALT);   /* Loop for each alternative */  while (*code == OP_ALT);   /* Loop for each alternative */
# Line 7354  return TRUE; Line 7428  return TRUE;
7428  matching and for non-DOTALL patterns that start with .* (which must start at  matching and for non-DOTALL patterns that start with .* (which must start at
7429  the beginning or after \n). As in the case of is_anchored() (see above), we  the beginning or after \n). As in the case of is_anchored() (see above), we
7430  have to take account of back references to capturing brackets that contain .*  have to take account of back references to capturing brackets that contain .*
7431  because in that case we can't make the assumption.  because in that case we can't make the assumption. Also, the appearance of .*
7432    inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
7433    count, because once again the assumption no longer holds.
7434    
7435  Arguments:  Arguments:
7436    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
7437    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
7438                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
7439                    the less precise approach                    the less precise approach
7440    backref_map    the back reference bitmap    cd             points to the compile data
7441      atomcount      atomic group level
7442    
7443  Returns:         TRUE or FALSE  Returns:         TRUE or FALSE
7444  */  */
7445    
7446  static BOOL  static BOOL
7447  is_startline(const pcre_uchar *code, unsigned int bracket_map,  is_startline(const pcre_uchar *code, unsigned int bracket_map,
7448    unsigned int backref_map)    compile_data *cd, int atomcount)
7449  {  {
7450  do {  do {
7451     const pcre_uchar *scode = first_significant_code(     const pcre_uchar *scode = first_significant_code(
# Line 7394  do { Line 7471  do {
7471         return FALSE;         return FALSE;
7472    
7473         default:     /* Assertion */         default:     /* Assertion */
7474         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;         if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7475         do scode += GET(scode, 1); while (*scode == OP_ALT);         do scode += GET(scode, 1); while (*scode == OP_ALT);
7476         scode += 1 + LINK_SIZE;         scode += 1 + LINK_SIZE;
7477         break;         break;
# Line 7408  do { Line 7485  do {
7485     if (op == OP_BRA  || op == OP_BRAPOS ||     if (op == OP_BRA  || op == OP_BRAPOS ||
7486         op == OP_SBRA || op == OP_SBRAPOS)         op == OP_SBRA || op == OP_SBRAPOS)
7487       {       {
7488       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7489       }       }
7490    
7491     /* Capturing brackets */     /* Capturing brackets */
# Line 7418  do { Line 7495  do {
7495       {       {
7496       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
7497       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7498       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
7499       }       }
7500    
7501     /* Other brackets */     /* Positive forward assertions */
7502    
7503     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)     else if (op == OP_ASSERT)
7504       {       {
7505       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7506       }       }
7507    
7508       /* Atomic brackets */
7509    
7510     /* .* means "start at start or after \n" if it isn't in brackets that     else if (op == OP_ONCE || op == OP_ONCE_NC)
7511     may be referenced. */       {
7512         if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
7513         }
7514    
7515       /* .* means "start at start or after \n" if it isn't in atomic brackets or
7516       brackets that may be referenced, as long as the pattern does not contain
7517       *PRUNE or *SKIP, because these break the feature. Consider, for example,
7518       /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
7519       start of a line. */
7520    
7521     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
7522       {       {
7523       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
7524             atomcount > 0 || cd->had_pruneorskip)
7525           return FALSE;
7526       }       }
7527    
7528     /* Check for explicit circumflex */     /* Check for explicit circumflex; anything else gives a FALSE result. Note
7529       that atomic brackets OP_ONCE and OP_ONCE_NC are scanned above with an
7530       incremented atomcount, so that .* inside them is rejected, because the
7531       number of characters matched by .* cannot be adjusted inside them. */
7532    
7533     else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;     else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
7534    
# Line 7709  not used here. */ Line 7801  not used here. */
7801  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7802       (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)       (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
7803    {    {
7804  #ifdef COMPILE_PCRE8  #ifdef COMPILE_PCRE8
7805    errorcode = ERR44;    errorcode = ERR44;
7806  #else  #else
7807    errorcode = ERR74;    errorcode = ERR74;
7808  #endif  #endif
7809    goto PCRE_EARLY_ERROR_RETURN2;    goto PCRE_EARLY_ERROR_RETURN2;
7810    }    }
7811  #else  #else
# Line 7817  cd->start_pattern = (const pcre_uchar *) Line 7909  cd->start_pattern = (const pcre_uchar *)
7909  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7910  cd->req_varyopt = 0;  cd->req_varyopt = 0;
7911  cd->assert_depth = 0;  cd->assert_depth = 0;
7912    cd->max_lookbehind = 0;
7913  cd->external_options = options;  cd->external_options = options;
7914  cd->external_flags = 0;  cd->external_flags = 0;
7915  cd->open_caps = NULL;  cd->open_caps = NULL;
# Line 7867  re->magic_number = MAGIC_NUMBER; Line 7960  re->magic_number = MAGIC_NUMBER;
7960  re->size = (int)size;  re->size = (int)size;
7961  re->options = cd->external_options;  re->options = cd->external_options;
7962  re->flags = cd->external_flags;  re->flags = cd->external_flags;
 re->dummy1 = 0;  
7963  re->first_char = 0;  re->first_char = 0;
7964  re->req_char = 0;  re->req_char = 0;
7965  re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);  re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
# Line 7887  field; this time it's used for rememberi Line 7979  field; this time it's used for rememberi
7979  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7980  cd->assert_depth = 0;  cd->assert_depth = 0;
7981  cd->bracount = 0;  cd->bracount = 0;
7982    cd->max_lookbehind = 0;
7983  cd->names_found = 0;  cd->names_found = 0;
7984  cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7985  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
# Line 7894  cd->start_code = codestart; Line 7987  cd->start_code = codestart;
7987  cd->hwm = (pcre_uchar *)(cd->start_workspace);  cd->hwm = (pcre_uchar *)(cd->start_workspace);
7988  cd->req_varyopt = 0;  cd->req_varyopt = 0;
7989  cd->had_accept = FALSE;  cd->had_accept = FALSE;
7990    cd->had_pruneorskip = FALSE;
7991  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
7992  cd->open_caps = NULL;  cd->open_caps = NULL;
7993    
# Line 7908  code = (pcre_uchar *)codestart; Line 8002  code = (pcre_uchar *)codestart;
8002    &firstchar, &reqchar, NULL, cd, NULL);    &firstchar, &reqchar, NULL, cd, NULL);
8003  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
8004  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
8005    re->max_lookbehind = cd->max_lookbehind;
8006  re->flags = cd->external_flags | PCRE_MODE;  re->flags = cd->external_flags | PCRE_MODE;
8007    
8008  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
# Line 7995  if (cd->check_lookbehind) Line 8090  if (cd->check_lookbehind)
8090                      (fixed_length == -4)? ERR70 : ERR25;                      (fixed_length == -4)? ERR70 : ERR25;
8091          break;          break;
8092          }          }
8093          if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
8094        PUT(cc, 1, fixed_length);        PUT(cc, 1, fixed_length);
8095        }        }
8096      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 8015  if (errorcode != 0) Line 8111  if (errorcode != 0)
8111    }    }
8112    
8113  /* If the anchored option was not passed, set the flag if we can determine that  /* If the anchored option was not passed, set the flag if we can determine that
8114  the pattern is anchored by virtue of ^ characters or \A or anything else (such  the pattern is anchored by virtue of ^ characters or \A or anything else, such
8115  as starting with .* when DOTALL is set).  as starting with non-atomic .* when DOTALL is set and there are no occurrences
8116    of *PRUNE or *SKIP.
8117    
8118  Otherwise, if we know what the first byte has to be, save it, because that  Otherwise, if we know what the first byte has to be, save it, because that
8119  speeds up unanchored matches no end. If not, see if we can set the  speeds up unanchored matches no end. If not, see if we can set the
8120  PCRE_STARTLINE flag. This is helpful for multiline matches when all branches  PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
8121  start with ^. and also when all branches start with .* for non-DOTALL matches.  start with ^, and also when all branches start with non-atomic .* for
8122  */  non-DOTALL matches when *PRUNE and *SKIP are not present. */
8123    
8124  if ((re->options & PCRE_ANCHORED) == 0)  if ((re->options & PCRE_ANCHORED) == 0)
8125    {    {
8126    if (is_anchored(codestart, 0, cd->backref_map))    if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
     re->options |= PCRE_ANCHORED;  
8127    else    else
8128      {      {
8129      if (firstchar < 0)      if (firstchar < 0)
# Line 8064  if ((re->options & PCRE_ANCHORED) == 0) Line 8160  if ((re->options & PCRE_ANCHORED) == 0)
8160    
8161        re->flags |= PCRE_FIRSTSET;        re->flags |= PCRE_FIRSTSET;
8162        }        }
8163      else if (is_startline(codestart, 0, cd->backref_map))  
8164        re->flags |= PCRE_STARTLINE;      else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
8165      }      }
8166    }    }
8167    
# Line 8134  if ((re->flags & PCRE_REQCHSET) != 0) Line 8230  if ((re->flags & PCRE_REQCHSET) != 0)
8230    }    }
8231    
8232  #ifdef COMPILE_PCRE8  #ifdef COMPILE_PCRE8
8233  pcre_printint(re, stdout, TRUE);  pcre_printint((pcre *)re, stdout, TRUE);
8234  #else  #else
8235  pcre16_printint(re, stdout, TRUE);  pcre16_printint((pcre *)re, stdout, TRUE);
8236  #endif  #endif
8237    
8238  /* This check is done here in the debugging case so that the code that  /* This check is done here in the debugging case so that the code that

Legend:
Removed from v.852  
changed lines
  Added in v.994

  ViewVC Help
Powered by ViewVC 1.1.5