/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 604 by ph10, Thu Jun 2 19:04:54 2011 UTC revision 621 by ph10, Mon Jul 18 10:14:09 2011 UTC
# Line 1694  _pcre_find_bracket(const uschar *code, B Line 1694  _pcre_find_bracket(const uschar *code, B
1694  for (;;)  for (;;)
1695    {    {
1696    register int c = *code;    register int c = *code;
1697    
1698    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1699    
1700    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 4207  for (;; ptr++) Line 4208  for (;; ptr++)
4208        ptr++;        ptr++;
4209        }        }
4210      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4211    
4212        /* If previous was a recursion call, wrap it in atomic brackets so that
4213        previous becomes the atomic group. All recursions were so wrapped in the
4214        past, but it no longer happens for non-repeated recursions. In fact, the
4215        repeated ones could be re-implemented independently so as not to need this,
4216        but for the moment we rely on the code for repeating groups. */
4217    
4218        if (*previous == OP_RECURSE)
4219          {
4220          memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
4221          *previous = OP_ONCE;
4222          PUT(previous, 1, 2 + 2*LINK_SIZE);
4223          previous[2 + 2*LINK_SIZE] = OP_KET;
4224          PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4225          code += 2 + 2 * LINK_SIZE;
4226          length_prevgroup = 3 + 3*LINK_SIZE;
4227    
4228          /* When actually compiling, we need to check whether this was a forward
4229          reference, and if so, adjust the offset. */
4230    
4231          if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4232            {
4233            int offset = GET(cd->hwm, -LINK_SIZE);
4234            if (offset == previous + 1 - cd->start_code)
4235              PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4236            }
4237          }
4238    
4239        /* Now handle repetition for the different types of item. */
4240    
4241      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4242      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
# Line 4510  for (;; ptr++) Line 4540  for (;; ptr++)
4540        int len = (int)(code - previous);        int len = (int)(code - previous);
4541        uschar *bralink = NULL;        uschar *bralink = NULL;
4542        uschar *brazeroptr = NULL;        uschar *brazeroptr = NULL;
4543    
4544        /* Repeating a DEFINE group is pointless */        /* Repeating a DEFINE group is pointless */
4545    
4546        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
# Line 4726  for (;; ptr++) Line 4756  for (;; ptr++)
4756          }          }
4757    
4758        /* If the maximum is unlimited, set a repeater in the final copy. For        /* If the maximum is unlimited, set a repeater in the final copy. For
4759        ONCE brackets, that's all we need to do.        ONCE brackets, that's all we need to do. However, possessively repeated
4760          ONCE brackets can be converted into non-capturing brackets, as the
4761          behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
4762          deal with possessive ONCEs specially.
4763    
4764        Otherwise, if the quantifier was possessive, we convert the BRA code to        Otherwise, if the quantifier was possessive, we convert the BRA code to
4765        the POS form, and the KET code to KETRPOS. (It turns out to be convenient        the POS form, and the KET code to KETRPOS. (It turns out to be convenient
4766        at runtime to detect this kind of subpattern at both the start and at the        at runtime to detect this kind of subpattern at both the start and at the
4767        end.) If the group is preceded by OP_BRAZERO, convert this to        end.) The use of special opcodes makes it possible to reduce greatly the
4768        OP_BRAPOSZERO. Then cancel the possessive flag so that the default action        stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO,
4769        below, of wrapping everything inside atomic brackets, does not happen.        convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that
4770          the default action below, of wrapping everything inside atomic brackets,
4771          does not happen.
4772    
4773        Then, when we are doing the actual compile phase, check to see whether        Then, when we are doing the actual compile phase, check to see whether
4774        this group is one that could match an empty string. If so, convert the        this group is one that could match an empty string. If so, convert the
# Line 4745  for (;; ptr++) Line 4780  for (;; ptr++)
4780          {          {
4781          uschar *ketcode = code - 1 - LINK_SIZE;          uschar *ketcode = code - 1 - LINK_SIZE;
4782          uschar *bracode = ketcode - GET(ketcode, 1);          uschar *bracode = ketcode - GET(ketcode, 1);
4783    
4784          if (*bracode == OP_ONCE)          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
4785            if (*bracode == OP_ONCE)
4786            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
4787          else          else
4788            {            {
# Line 4793  for (;; ptr++) Line 4829  for (;; ptr++)
4829        }        }
4830    
4831      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', or if certain optimization
4832      tests above succeeded, possessive_quantifier is TRUE. For some of the      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
4833      simpler opcodes, there is an special alternative opcode for this. For      there are special alternative opcodes for this case. For anything else, we
4834      anything else, we wrap the entire repeated item inside OP_ONCE brackets.      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
4835      The '+' notation is just syntactic sugar, taken from Sun's Java package,      notation is just syntactic sugar, taken from Sun's Java package, but the
4836      but the special opcodes can optimize it a bit. The repeated item starts at      special opcodes can optimize it.
4837      tempcode, not at previous, which might be the first part of a string whose  
4838      (former) last char we repeated.      Possessively repeated subpatterns have already been handled in the code
4839        just above, so possessive_quantifier is always FALSE for them at this
4840        stage.
4841    
4842        Note that the repeated item starts at tempcode, not at previous, which
4843        might be the first part of a string whose (former) last char we repeated.
4844    
4845      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4846      an 'upto' may follow. We skip over an 'exact' item, and then test the      an 'upto' may follow. We skip over an 'exact' item, and then test the
# Line 4924  for (;; ptr++) Line 4965  for (;; ptr++)
4965          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
4966              strncmp((char *)name, vn, namelen) == 0)              strncmp((char *)name, vn, namelen) == 0)
4967            {            {
4968            /* Check for open captures before ACCEPT */            /* Check for open captures before ACCEPT and convert it to
4969              ASSERT_ACCEPT if in an assertion. */
4970    
4971            if (verbs[i].op == OP_ACCEPT)            if (verbs[i].op == OP_ACCEPT)
4972              {              {
4973              open_capitem *oc;              open_capitem *oc;
4974                if (arglen != 0)
4975                  {
4976                  *errorcodeptr = ERR59;
4977                  goto FAILED;
4978                  }
4979              cd->had_accept = TRUE;              cd->had_accept = TRUE;
4980              for (oc = cd->open_caps; oc != NULL; oc = oc->next)              for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4981                {                {
4982                *code++ = OP_CLOSE;                *code++ = OP_CLOSE;
4983                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
4984                }                }
4985                *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
4986              }              }
4987    
4988            /* Handle the cases with/without an argument */            /* Handle other cases with/without an argument */
4989    
4990            if (arglen == 0)            else if (arglen == 0)
4991              {              {
4992              if (verbs[i].op < 0)   /* Argument is mandatory */              if (verbs[i].op < 0)   /* Argument is mandatory */
4993                {                {
# Line 5228  for (;; ptr++) Line 5276  for (;; ptr++)
5276          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5277          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5278          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
5279            cd->assert_depth += 1;
5280          ptr++;          ptr++;
5281          break;          break;
5282    
# Line 5242  for (;; ptr++) Line 5291  for (;; ptr++)
5291            continue;            continue;
5292            }            }
5293          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
5294            cd->assert_depth += 1;
5295          break;          break;
5296    
5297    
# Line 5251  for (;; ptr++) Line 5301  for (;; ptr++)
5301            {            {
5302            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5303            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
5304              cd->assert_depth += 1;
5305            ptr += 2;            ptr += 2;
5306            break;            break;
5307    
5308            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5309            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
5310              cd->assert_depth += 1;
5311            ptr += 2;            ptr += 2;
5312            break;            break;
5313    
# Line 5646  for (;; ptr++) Line 5698  for (;; ptr++)
5698    
5699                /* Fudge the value of "called" so that when it is inserted as an                /* Fudge the value of "called" so that when it is inserted as an
5700                offset below, what is actually inserted is the reference number                offset below, what is actually inserted is the reference number
5701                of the group. */                of the group. Then remember the forward reference. */
5702    
5703                called = cd->start_code + recno;                called = cd->start_code + recno;
5704                PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
5705                }                }
5706    
5707              /* If not a forward reference, and the subpattern is still open,              /* If not a forward reference, and the subpattern is still open,
# Line 5664  for (;; ptr++) Line 5716  for (;; ptr++)
5716                }                }
5717              }              }
5718    
5719            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item. */
5720            "once" brackets. Set up a "previous group" length so that a  
           subsequent quantifier will work. */  
   
           *code = OP_ONCE;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
5721            *code = OP_RECURSE;            *code = OP_RECURSE;
5722            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
5723            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
   
           *code = OP_KET;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
           length_prevgroup = 3 + 3*LINK_SIZE;  
5724            }            }
5725    
5726          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 5823  for (;; ptr++) Line 5863  for (;; ptr++)
5863             &length_prevgroup           /* Pre-compile phase */             &length_prevgroup           /* Pre-compile phase */
5864           ))           ))
5865        goto FAILED;        goto FAILED;
5866    
5867        if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
5868          cd->assert_depth -= 1;
5869    
5870      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
5871      group, while tempcode has been updated to point past the end of the group      group, while tempcode has been updated to point past the end of the group
# Line 5894  for (;; ptr++) Line 5937  for (;; ptr++)
5937          goto FAILED;          goto FAILED;
5938          }          }
5939        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5940        *code++ = OP_BRA;        code++;   /* This already contains bravalue */
5941        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
5942        *code++ = OP_KET;        *code++ = OP_KET;
5943        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
# Line 6969  utf8 = (options & PCRE_UTF8) != 0; Line 7012  utf8 = (options & PCRE_UTF8) != 0;
7012    
7013  /* Can't support UTF8 unless PCRE has been compiled to include the code. The  /* Can't support UTF8 unless PCRE has been compiled to include the code. The
7014  return of an error code from _pcre_valid_utf8() is a new feature, introduced in  return of an error code from _pcre_valid_utf8() is a new feature, introduced in
7015  release 8.13. The only use we make of it here is to adjust the offset value to  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7016  the end of the string for a short string error, for compatibility with previous  not used here. */
 versions. */  
7017    
7018  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
7019  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7020       (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1, &errorcode)) >= 0)       (errorcode = _pcre_valid_utf8((USPTR)pattern, -1, erroroffset)) != 0)
7021    {    {
7022    errorcode = ERR44;    errorcode = ERR44;
7023    goto PCRE_EARLY_ERROR_RETURN2;    goto PCRE_EARLY_ERROR_RETURN2;
# Line 7146  field; this time it's used for rememberi Line 7188  field; this time it's used for rememberi
7188  */  */
7189    
7190  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7191    cd->assert_depth = 0;
7192  cd->bracount = 0;  cd->bracount = 0;
7193  cd->names_found = 0;  cd->names_found = 0;
7194  cd->name_table = (uschar *)re + re->name_table_offset;  cd->name_table = (uschar *)re + re->name_table_offset;

Legend:
Removed from v.604  
changed lines
  Added in v.621

  ViewVC Help
Powered by ViewVC 1.1.5