/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

Left column: revision 964 by ph10, Fri May 4 13:03:39 2012 UTC | Right column: revision 1033 by ph10, Mon Sep 10 11:02:48 2012 UTC
# Line 490  static const char error_texts[] = Line 490  static const char error_texts[] =
490    "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"    "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
491    "invalid UTF-16 string\0"    "invalid UTF-16 string\0"
492    /* 75 */    /* 75 */
493    "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"    "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
494      "character value in \\u.... sequence is too large\0"
495    ;    ;
496    
497  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 788  else if ((i = escapes[c - CHAR_0]) != 0) Line 789  else if ((i = escapes[c - CHAR_0]) != 0)
789    
790  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
791  /* Not alphanumeric */  /* Not alphanumeric */
792  else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}  else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
793  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
794  #endif  #endif
795    
# Line 831  else Line 832  else
832            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
833  #endif  #endif
834            }            }
835    
836    #ifdef COMPILE_PCRE8
837            if (c > (utf ? 0x10ffff : 0xff))
838    #else
839    #ifdef COMPILE_PCRE16
840            if (c > (utf ? 0x10ffff : 0xffff))
841    #endif
842    #endif
843              {
844              *errorcodeptr = ERR76;
845              }
846            else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
847          }          }
848        }        }
849      else      else
# Line 3155  if (next >= 0) switch(op_code) Line 3168  if (next >= 0) switch(op_code)
3168    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
3169    switch(next)    switch(next)
3170      {      {
3171      case 0x09:      case CHAR_HT:
3172      case 0x20:      case CHAR_SPACE:
3173    #ifndef EBCDIC
3174      case 0xa0:      case 0xa0:
3175      case 0x1680:      case 0x1680:
3176      case 0x180e:      case 0x180e:
# Line 3174  if (next >= 0) switch(op_code) Line 3188  if (next >= 0) switch(op_code)
3188      case 0x202f:      case 0x202f:
3189      case 0x205f:      case 0x205f:
3190      case 0x3000:      case 0x3000:
3191    #endif  /* Not EBCDIC */
3192      return op_code == OP_NOT_HSPACE;      return op_code == OP_NOT_HSPACE;
3193      default:      default:
3194      return op_code != OP_NOT_HSPACE;      return op_code != OP_NOT_HSPACE;
# Line 3184  if (next >= 0) switch(op_code) Line 3199  if (next >= 0) switch(op_code)
3199    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3200    switch(next)    switch(next)
3201      {      {
3202      case 0x0a:      case CHAR_LF:
3203      case 0x0b:      case CHAR_VT:
3204      case 0x0c:      case CHAR_FF:
3205      case 0x0d:      case CHAR_CR:
3206      case 0x85:      case CHAR_NEL:
3207    #ifndef EBCDIC
3208      case 0x2028:      case 0x2028:
3209      case 0x2029:      case 0x2029:
3210    #endif
3211      return op_code == OP_NOT_VSPACE;      return op_code == OP_NOT_VSPACE;
3212      default:      default:
3213      return op_code != OP_NOT_VSPACE;      return op_code != OP_NOT_VSPACE;
# Line 3248  switch(op_code) Line 3265  switch(op_code)
3265      case ESC_H:      case ESC_H:
3266      switch(c)      switch(c)
3267        {        {
3268        case 0x09:        case CHAR_HT:
3269        case 0x20:        case CHAR_SPACE:
3270    #ifndef EBCDIC
3271        case 0xa0:        case 0xa0:
3272        case 0x1680:        case 0x1680:
3273        case 0x180e:        case 0x180e:
# Line 3267  switch(op_code) Line 3285  switch(op_code)
3285        case 0x202f:        case 0x202f:
3286        case 0x205f:        case 0x205f:
3287        case 0x3000:        case 0x3000:
3288    #endif  /* Not EBCDIC */
3289        return -next != ESC_h;        return -next != ESC_h;
3290        default:        default:
3291        return -next == ESC_h;        return -next == ESC_h;
# Line 3276  switch(op_code) Line 3295  switch(op_code)
3295      case ESC_V:      case ESC_V:
3296      switch(c)      switch(c)
3297        {        {
3298        case 0x0a:        case CHAR_LF:
3299        case 0x0b:        case CHAR_VT:
3300        case 0x0c:        case CHAR_FF:
3301        case 0x0d:        case CHAR_CR:
3302        case 0x85:        case CHAR_NEL:
3303    #ifndef EBCDIC
3304        case 0x2028:        case 0x2028:
3305        case 0x2029:        case 0x2029:
3306    #endif  /* Not EBCDIC */
3307        return -next != ESC_v;        return -next != ESC_v;
3308        default:        default:
3309        return -next == ESC_v;        return -next == ESC_v;
# Line 4044  for (;; ptr++) Line 4065  for (;; ptr++)
4065    
4066              /* Perl 5.004 onwards omits VT from \s, but we must preserve it              /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4067              if it was previously set by something earlier in the character              if it was previously set by something earlier in the character
4068              class. */              class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and
4069                EBCDIC, so we lazily just adjust the appropriate bit. */
4070    
4071              case ESC_s:              case ESC_s:
4072              classbits[0] |= cbits[cbit_space];              classbits[0] |= cbits[cbit_space];
# Line 4059  for (;; ptr++) Line 4081  for (;; ptr++)
4081              continue;              continue;
4082    
4083              case ESC_h:              case ESC_h:
4084              SETBIT(classbits, 0x09); /* VT */              SETBIT(classbits, CHAR_HT);
4085              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, CHAR_SPACE);
4086    #ifndef EBCDIC
4087              SETBIT(classbits, 0xa0); /* NSBP */              SETBIT(classbits, 0xa0); /* NSBP */
4088  #ifndef COMPILE_PCRE8  #ifndef COMPILE_PCRE8
4089              xclass = TRUE;              xclass = TRUE;
# Line 4096  for (;; ptr++) Line 4119  for (;; ptr++)
4119                class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);                class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4120                }                }
4121  #endif  #endif
4122    #endif  /* Not EBCDIC */
4123              continue;              continue;
4124    
4125              case ESC_H:              case ESC_H:
# Line 4104  for (;; ptr++) Line 4128  for (;; ptr++)
4128                int x = 0xff;                int x = 0xff;
4129                switch (c)                switch (c)
4130                  {                  {
4131                  case 0x09/8: x ^= 1 << (0x09%8); break;                  case CHAR_HT/8:    x ^= 1 << (CHAR_HT%8); break;
4132                  case 0x20/8: x ^= 1 << (0x20%8); break;                  case CHAR_SPACE/8: x ^= 1 << (CHAR_SPACE%8); break;
4133                  case 0xa0/8: x ^= 1 << (0xa0%8); break;  #ifndef EBCDIC
4134                    case 0xa0/8: x ^= 1 << (0xa0%8); break;  /* NSBSP */
4135    #endif
4136                  default: break;                  default: break;
4137                  }                  }
4138                classbits[c] |= x;                classbits[c] |= x;
4139                }                }
4140    #ifndef EBCDIC
4141  #ifndef COMPILE_PCRE8  #ifndef COMPILE_PCRE8
4142              xclass = TRUE;              xclass = TRUE;
4143              *class_uchardata++ = XCL_RANGE;              *class_uchardata++ = XCL_RANGE;
# Line 4137  for (;; ptr++) Line 4164  for (;; ptr++)
4164              if (utf)              if (utf)
4165                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4166              else              else
4167  #endif  #endif   /* SUPPORT_UTF */
4168                *class_uchardata++ = 0xffff;                *class_uchardata++ = 0xffff;
4169  #elif defined SUPPORT_UTF  #elif defined SUPPORT_UTF
4170              if (utf)              if (utf)
# Line 4166  for (;; ptr++) Line 4193  for (;; ptr++)
4193                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4194                }                }
4195  #endif  #endif
4196    #endif  /* Not EBCDIC */
4197              continue;              continue;
4198    
4199              case ESC_v:              case ESC_v:
4200              SETBIT(classbits, 0x0a); /* LF */              SETBIT(classbits, CHAR_LF);
4201              SETBIT(classbits, 0x0b); /* VT */              SETBIT(classbits, CHAR_VT);
4202              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, CHAR_FF);
4203              SETBIT(classbits, 0x0d); /* CR */              SETBIT(classbits, CHAR_CR);
4204              SETBIT(classbits, 0x85); /* NEL */              SETBIT(classbits, CHAR_NEL);
4205    #ifndef EBCDIC
4206  #ifndef COMPILE_PCRE8  #ifndef COMPILE_PCRE8
4207              xclass = TRUE;              xclass = TRUE;
4208              *class_uchardata++ = XCL_RANGE;              *class_uchardata++ = XCL_RANGE;
# Line 4188  for (;; ptr++) Line 4217  for (;; ptr++)
4217                class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);                class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4218                }                }
4219  #endif  #endif
4220    #endif  /* Not EBCDIC */
4221              continue;              continue;
4222    
4223              case ESC_V:              case ESC_V:
# Line 4196  for (;; ptr++) Line 4226  for (;; ptr++)
4226                int x = 0xff;                int x = 0xff;
4227                switch (c)                switch (c)
4228                  {                  {
4229                  case 0x0a/8: x ^= 1 << (0x0a%8);                  case CHAR_LF/8: x ^= 1 << (CHAR_LF%8);
4230                               x ^= 1 << (0x0b%8);                                  x ^= 1 << (CHAR_VT%8);
4231                               x ^= 1 << (0x0c%8);                                  x ^= 1 << (CHAR_FF%8);
4232                               x ^= 1 << (0x0d%8);                                  x ^= 1 << (CHAR_CR%8);
4233                               break;                                  break;
4234                  case 0x85/8: x ^= 1 << (0x85%8); break;                  case CHAR_NEL/8: x ^= 1 << (CHAR_NEL%8); break;
4235                  default: break;                  default: break;
4236                  }                  }
4237                classbits[c] |= x;                classbits[c] |= x;
4238                }                }
4239    
4240    #ifndef EBCDIC
4241  #ifndef COMPILE_PCRE8  #ifndef COMPILE_PCRE8
4242              xclass = TRUE;              xclass = TRUE;
4243              *class_uchardata++ = XCL_RANGE;              *class_uchardata++ = XCL_RANGE;
# Line 4232  for (;; ptr++) Line 4263  for (;; ptr++)
4263                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4264                }                }
4265  #endif  #endif
4266    #endif  /* Not EBCDIC */
4267              continue;              continue;
4268    
4269  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
# Line 4518  for (;; ptr++) Line 4550  for (;; ptr++)
4550        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4551    
4552        /* Only the value of 1 matters for class_single_char. */        /* Only the value of 1 matters for class_single_char. */
4553    
4554        if (class_single_char < 2) class_single_char++;        if (class_single_char < 2) class_single_char++;
4555    
4556        /* If class_charcount is 1, we saw precisely one character. As long as        /* If class_charcount is 1, we saw precisely one character. As long as
# Line 4813  for (;; ptr++) Line 4845  for (;; ptr++)
4845      if (*previous == OP_CHAR || *previous == OP_CHARI      if (*previous == OP_CHAR || *previous == OP_CHARI
4846          || *previous == OP_NOT || *previous == OP_NOTI)          || *previous == OP_NOT || *previous == OP_NOTI)
4847        {        {
4848        switch (*previous)        switch (*previous)
4849          {          {
4850          default: /* Make compiler happy. */          default: /* Make compiler happy. */
4851          case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;          case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
# Line 5593  for (;; ptr++) Line 5625  for (;; ptr++)
5625        ptr++;        ptr++;
5626        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5627        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5628    
5629        /* It appears that Perl allows any characters whatsoever, other than        /* It appears that Perl allows any characters whatsoever, other than
5630        a closing parenthesis, to appear in arguments, so we no longer insist on        a closing parenthesis, to appear in arguments, so we no longer insist on
5631        letters, digits, and underscores. */        letters, digits, and underscores. */
# Line 5607  for (;; ptr++) Line 5639  for (;; ptr++)
5639            {            {
5640            *errorcodeptr = ERR75;            *errorcodeptr = ERR75;
5641            goto FAILED;            goto FAILED;
5642            }            }
5643          }          }
5644    
5645        if (*ptr != CHAR_RIGHT_PARENTHESIS)        if (*ptr != CHAR_RIGHT_PARENTHESIS)
# Line 5623  for (;; ptr++) Line 5655  for (;; ptr++)
5655          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
5656              STRNCMP_UC_C8(name, vn, namelen) == 0)              STRNCMP_UC_C8(name, vn, namelen) == 0)
5657            {            {
5658              int setverb;
5659    
5660            /* Check for open captures before ACCEPT and convert it to            /* Check for open captures before ACCEPT and convert it to
5661            ASSERT_ACCEPT if in an assertion. */            ASSERT_ACCEPT if in an assertion. */
5662    
# Line 5640  for (;; ptr++) Line 5674  for (;; ptr++)
5674                *code++ = OP_CLOSE;                *code++ = OP_CLOSE;
5675                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
5676                }                }
5677              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;              setverb = *code++ =
5678                  (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5679    
5680              /* Do not set firstchar after *ACCEPT */              /* Do not set firstchar after *ACCEPT */
5681              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
# Line 5655  for (;; ptr++) Line 5690  for (;; ptr++)
5690                *errorcodeptr = ERR66;                *errorcodeptr = ERR66;
5691                goto FAILED;                goto FAILED;
5692                }                }
5693              *code = verbs[i].op;              setverb = *code++ = verbs[i].op;
             if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;  
5694              }              }
5695    
5696            else            else
# Line 5666  for (;; ptr++) Line 5700  for (;; ptr++)
5700                *errorcodeptr = ERR59;                *errorcodeptr = ERR59;
5701                goto FAILED;                goto FAILED;
5702                }                }
5703              *code = verbs[i].op_arg;              setverb = *code++ = verbs[i].op_arg;
             if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;  
5704              *code++ = arglen;              *code++ = arglen;
5705              memcpy(code, arg, IN_UCHARS(arglen));              memcpy(code, arg, IN_UCHARS(arglen));
5706              code += arglen;              code += arglen;
5707              *code++ = 0;              *code++ = 0;
5708              }              }
5709    
5710              switch (setverb)
5711                {
5712                case OP_THEN:
5713                case OP_THEN_ARG:
5714                cd->external_flags |= PCRE_HASTHEN;
5715                break;
5716    
5717                case OP_PRUNE:
5718                case OP_PRUNE_ARG:
5719                case OP_SKIP:
5720                case OP_SKIP_ARG:
5721                cd->had_pruneorskip = TRUE;
5722                break;
5723                }
5724    
5725            break;  /* Found verb, exit loop */            break;  /* Found verb, exit loop */
5726            }            }
5727    
# Line 6859  for (;; ptr++) Line 6907  for (;; ptr++)
6907        /* For the rest (including \X when Unicode properties are supported), we        /* For the rest (including \X when Unicode properties are supported), we
6908        can obtain the OP value by negating the escape value in the default        can obtain the OP value by negating the escape value in the default
6909        situation when PCRE_UCP is not set. When it *is* set, we substitute        situation when PCRE_UCP is not set. When it *is* set, we substitute
6910        Unicode property tests. Note that \b and \B do a one-character        Unicode property tests. Note that \b and \B do a one-character
6911        lookbehind. */        lookbehind. */
6912    
6913        else        else
6914          {          {
6915          if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)          if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
6916            cd->max_lookbehind = 1;            cd->max_lookbehind = 1;
6917  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
6918          if (-c >= ESC_DU && -c <= ESC_wu)          if (-c >= ESC_DU && -c <= ESC_wu)
6919            {            {
# Line 7173  for (;;) Line 7221  for (;;)
7221          *ptrptr = ptr;          *ptrptr = ptr;
7222          return FALSE;          return FALSE;
7223          }          }
7224        else        else
7225          {          {
7226          if (fixed_length > cd->max_lookbehind)          if (fixed_length > cd->max_lookbehind)
7227            cd->max_lookbehind = fixed_length;            cd->max_lookbehind = fixed_length;
7228          PUT(reverse_count, 0, fixed_length);          PUT(reverse_count, 0, fixed_length);
7229          }          }
7230        }        }
7231      }      }
# Line 7310  and the highest back reference was great Line 7358  and the highest back reference was great
7358  However, by keeping a bitmap of the first 31 back references, we can catch some  However, by keeping a bitmap of the first 31 back references, we can catch some
7359  of the more common cases more precisely.  of the more common cases more precisely.
7360    
7361    ... A second exception is when the .* appears inside an atomic group, because
7362    this prevents the number of characters it matches from being adjusted.
7363    
7364  Arguments:  Arguments:
7365    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
7366    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
7367                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
7368                    the less precise approach                    the less precise approach
7369    backref_map    the back reference bitmap    cd             points to the compile data block
7370      atomcount      atomic group level
7371    
7372  Returns:     TRUE or FALSE  Returns:     TRUE or FALSE
7373  */  */
7374    
7375  static BOOL  static BOOL
7376  is_anchored(register const pcre_uchar *code, unsigned int bracket_map,  is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
7377    unsigned int backref_map)    compile_data *cd, int atomcount)
7378  {  {
7379  do {  do {
7380     const pcre_uchar *scode = first_significant_code(     const pcre_uchar *scode = first_significant_code(
# Line 7334  do { Line 7386  do {
7386     if (op == OP_BRA  || op == OP_BRAPOS ||     if (op == OP_BRA  || op == OP_BRAPOS ||
7387         op == OP_SBRA || op == OP_SBRAPOS)         op == OP_SBRA || op == OP_SBRAPOS)
7388       {       {
7389       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
7390       }       }
7391    
7392     /* Capturing brackets */     /* Capturing brackets */
# Line 7344  do { Line 7396  do {
7396       {       {
7397       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
7398       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7399       if (!is_anchored(scode, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
7400       }       }
7401    
7402     /* Other brackets */     /* Positive forward assertions and conditions */
7403    
7404     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||     else if (op == OP_ASSERT || op == OP_COND)
             op == OP_COND)  
7405       {       {
7406       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
7407         }
7408    
7409       /* Atomic groups */
7410    
7411       else if (op == OP_ONCE || op == OP_ONCE_NC)
7412         {
7413         if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
7414           return FALSE;
7415       }       }
7416    
7417     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7418     it isn't in brackets that are or may be referenced. */     it isn't in brackets that are or may be referenced or inside an atomic
7419       group. */
7420    
7421     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7422               op == OP_TYPEPOSSTAR))               op == OP_TYPEPOSSTAR))
7423       {       {
7424       if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)       if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
7425             atomcount > 0 || cd->had_pruneorskip)
7426         return FALSE;         return FALSE;
7427       }       }
7428    
7429     /* Check for explicit anchoring */     /* Check for explicit anchoring */
7430    
7431     else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;     else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7432    
7433     code += GET(code, 1);     code += GET(code, 1);
7434     }     }
7435  while (*code == OP_ALT);   /* Loop for each alternative */  while (*code == OP_ALT);   /* Loop for each alternative */
# Line 7385  return TRUE; Line 7447  return TRUE;
7447  matching and for non-DOTALL patterns that start with .* (which must start at  matching and for non-DOTALL patterns that start with .* (which must start at
7448  the beginning or after \n). As in the case of is_anchored() (see above), we  the beginning or after \n). As in the case of is_anchored() (see above), we
7449  have to take account of back references to capturing brackets that contain .*  have to take account of back references to capturing brackets that contain .*
7450  because in that case we can't make the assumption.  because in that case we can't make the assumption. Also, the appearance of .*
7451    inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
7452    count, because once again the assumption no longer holds.
7453    
7454  Arguments:  Arguments:
7455    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
7456    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
7457                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
7458                    the less precise approach                    the less precise approach
7459    backref_map    the back reference bitmap    cd             points to the compile data
7460      atomcount      atomic group level
7461    
7462  Returns:         TRUE or FALSE  Returns:         TRUE or FALSE
7463  */  */
7464    
7465  static BOOL  static BOOL
7466  is_startline(const pcre_uchar *code, unsigned int bracket_map,  is_startline(const pcre_uchar *code, unsigned int bracket_map,
7467    unsigned int backref_map)    compile_data *cd, int atomcount)
7468  {  {
7469  do {  do {
7470     const pcre_uchar *scode = first_significant_code(     const pcre_uchar *scode = first_significant_code(
# Line 7425  do { Line 7490  do {
7490         return FALSE;         return FALSE;
7491    
7492         default:     /* Assertion */         default:     /* Assertion */
7493         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;         if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7494         do scode += GET(scode, 1); while (*scode == OP_ALT);         do scode += GET(scode, 1); while (*scode == OP_ALT);
7495         scode += 1 + LINK_SIZE;         scode += 1 + LINK_SIZE;
7496         break;         break;
# Line 7439  do { Line 7504  do {
7504     if (op == OP_BRA  || op == OP_BRAPOS ||     if (op == OP_BRA  || op == OP_BRAPOS ||
7505         op == OP_SBRA || op == OP_SBRAPOS)         op == OP_SBRA || op == OP_SBRAPOS)
7506       {       {
7507       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7508       }       }
7509    
7510     /* Capturing brackets */     /* Capturing brackets */
# Line 7449  do { Line 7514  do {
7514       {       {
7515       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
7516       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7517       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
7518       }       }
7519    
7520     /* Other brackets */     /* Positive forward assertions */
7521    
7522       else if (op == OP_ASSERT)
7523         {
7524         if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7525         }
7526    
7527       /* Atomic brackets */
7528    
7529     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)     else if (op == OP_ONCE || op == OP_ONCE_NC)
7530       {       {
7531       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
7532       }       }
7533    
7534     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in atomic brackets or
7535     may be referenced. */     brackets that may be referenced, as long as the pattern does not contain
7536       *PRUNE or *SKIP, because these break the feature. Consider, for example,
7537       /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
7538       start of a line. */
7539    
7540     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
7541       {       {
7542       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
7543             atomcount > 0 || cd->had_pruneorskip)
7544           return FALSE;
7545       }       }
7546    
7547     /* Check for explicit circumflex */     /* Check for explicit circumflex; anything else gives a FALSE result. Note
7548       in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
7549       because the number of characters matched by .* cannot be adjusted inside
7550       them. */
7551    
7552     else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;     else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
7553    
# Line 7926  cd->start_code = codestart; Line 8006  cd->start_code = codestart;
8006  cd->hwm = (pcre_uchar *)(cd->start_workspace);  cd->hwm = (pcre_uchar *)(cd->start_workspace);
8007  cd->req_varyopt = 0;  cd->req_varyopt = 0;
8008  cd->had_accept = FALSE;  cd->had_accept = FALSE;
8009    cd->had_pruneorskip = FALSE;
8010  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
8011  cd->open_caps = NULL;  cd->open_caps = NULL;
8012    
# Line 8049  if (errorcode != 0) Line 8130  if (errorcode != 0)
8130    }    }
8131    
8132  /* If the anchored option was not passed, set the flag if we can determine that  /* If the anchored option was not passed, set the flag if we can determine that
8133  the pattern is anchored by virtue of ^ characters or \A or anything else (such  the pattern is anchored by virtue of ^ characters or \A or anything else, such
8134  as starting with .* when DOTALL is set).  as starting with non-atomic .* when DOTALL is set and there are no occurrences
8135    of *PRUNE or *SKIP.
8136    
8137  Otherwise, if we know what the first byte has to be, save it, because that  Otherwise, if we know what the first byte has to be, save it, because that
8138  speeds up unanchored matches no end. If not, see if we can set the  speeds up unanchored matches no end. If not, see if we can set the
8139  PCRE_STARTLINE flag. This is helpful for multiline matches when all branches  PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
8140  start with ^. and also when all branches start with .* for non-DOTALL matches.  start with ^. and also when all branches start with non-atomic .* for
8141  */  non-DOTALL matches when *PRUNE and SKIP are not present. */
8142    
8143  if ((re->options & PCRE_ANCHORED) == 0)  if ((re->options & PCRE_ANCHORED) == 0)
8144    {    {
8145    if (is_anchored(codestart, 0, cd->backref_map))    if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
     re->options |= PCRE_ANCHORED;  
8146    else    else
8147      {      {
8148      if (firstchar < 0)      if (firstchar < 0)
# Line 8098  if ((re->options & PCRE_ANCHORED) == 0) Line 8179  if ((re->options & PCRE_ANCHORED) == 0)
8179    
8180        re->flags |= PCRE_FIRSTSET;        re->flags |= PCRE_FIRSTSET;
8181        }        }
8182      else if (is_startline(codestart, 0, cd->backref_map))  
8183        re->flags |= PCRE_STARTLINE;      else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
8184      }      }
8185    }    }
8186    

Legend:
Removed from v.964  
changed lines
  Added in v.1033

  ViewVC Help
Powered by ViewVC 1.1.5