/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 604 by ph10, Thu Jun 2 19:04:54 2011 UTC revision 629 by ph10, Fri Jul 22 09:18:11 2011 UTC
# Line 409  static const char error_texts[] = Line 409  static const char error_texts[] =
409    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
410    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with PCRE_UCP support\0"
411    "\\c must be followed by an ASCII character\0"    "\\c must be followed by an ASCII character\0"
412      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
413    ;    ;
414    
415  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 1694  _pcre_find_bracket(const uschar *code, B Line 1695  _pcre_find_bracket(const uschar *code, B
1695  for (;;)  for (;;)
1696    {    {
1697    register int c = *code;    register int c = *code;
1698    
1699    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1700    
1701    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1974  for (code = first_significant_code(code Line 1976  for (code = first_significant_code(code
1976      }      }
1977    
1978    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
1979    implies a subroutine call, we can scan it. */    implies a backward reference subroutine call, we can scan it. If it's a
1980      forward reference subroutine call, we can't. To detect forward reference
1981      we have to scan up the list that is kept in the workspace. This function is
1982      called only when doing the real compile, not during the pre-compile that
1983      measures the size of the compiled pattern. */
1984    
1985    if (c == OP_RECURSE)    if (c == OP_RECURSE)
1986      {      {
1987      BOOL empty_branch = FALSE;      const uschar *scode;
1988      const uschar *scode = cd->start_code + GET(code, 1);      BOOL empty_branch;
1989    
1990        /* Test for forward reference */
1991    
1992        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
1993          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
1994    
1995        /* Not a forward reference, test for completed backward reference */
1996    
1997        empty_branch = FALSE;
1998        scode = cd->start_code + GET(code, 1);
1999      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2000    
2001        /* Completed backwards reference */
2002    
2003      do      do
2004        {        {
2005        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf8, cd))
# Line 1991  for (code = first_significant_code(code Line 2010  for (code = first_significant_code(code
2010        scode += GET(scode, 1);        scode += GET(scode, 1);
2011        }        }
2012      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2013    
2014      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2015      continue;      continue;
2016      }      }
# Line 2216  return TRUE; Line 2236  return TRUE;
2236  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2237  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2238  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2239    This function is called only during the real compile, not during the
2240    pre-compile.
2241    
2242  Arguments:  Arguments:
2243    code        points to start of the recursion    code        points to start of the recursion
# Line 4207  for (;; ptr++) Line 4229  for (;; ptr++)
4229        ptr++;        ptr++;
4230        }        }
4231      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4232    
4233        /* If previous was a recursion call, wrap it in atomic brackets so that
4234        previous becomes the atomic group. All recursions were so wrapped in the
4235        past, but it no longer happens for non-repeated recursions. In fact, the
4236        repeated ones could be re-implemented independently so as not to need this,
4237        but for the moment we rely on the code for repeating groups. */
4238    
4239        if (*previous == OP_RECURSE)
4240          {
4241          memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
4242          *previous = OP_ONCE;
4243          PUT(previous, 1, 2 + 2*LINK_SIZE);
4244          previous[2 + 2*LINK_SIZE] = OP_KET;
4245          PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4246          code += 2 + 2 * LINK_SIZE;
4247          length_prevgroup = 3 + 3*LINK_SIZE;
4248    
4249          /* When actually compiling, we need to check whether this was a forward
4250          reference, and if so, adjust the offset. */
4251    
4252          if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4253            {
4254            int offset = GET(cd->hwm, -LINK_SIZE);
4255            if (offset == previous + 1 - cd->start_code)
4256              PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4257            }
4258          }
4259    
4260        /* Now handle repetition for the different types of item. */
4261    
4262      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4263      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
# Line 4510  for (;; ptr++) Line 4561  for (;; ptr++)
4561        int len = (int)(code - previous);        int len = (int)(code - previous);
4562        uschar *bralink = NULL;        uschar *bralink = NULL;
4563        uschar *brazeroptr = NULL;        uschar *brazeroptr = NULL;
4564    
4565        /* Repeating a DEFINE group is pointless */        /* Repeating a DEFINE group is pointless */
4566    
4567        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
# Line 4726  for (;; ptr++) Line 4777  for (;; ptr++)
4777          }          }
4778    
4779        /* If the maximum is unlimited, set a repeater in the final copy. For        /* If the maximum is unlimited, set a repeater in the final copy. For
4780        ONCE brackets, that's all we need to do.        ONCE brackets, that's all we need to do. However, possessively repeated
4781          ONCE brackets can be converted into non-capturing brackets, as the
4782          behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
4783          deal with possessive ONCEs specially.
4784    
4785        Otherwise, if the quantifier was possessive, we convert the BRA code to        Otherwise, if the quantifier was possessive, we convert the BRA code to
4786        the POS form, and the KET code to KETRPOS. (It turns out to be convenient        the POS form, and the KET code to KETRPOS. (It turns out to be convenient
4787        at runtime to detect this kind of subpattern at both the start and at the        at runtime to detect this kind of subpattern at both the start and at the
4788        end.) If the group is preceded by OP_BRAZERO, convert this to        end.) The use of special opcodes makes it possible to reduce greatly the
4789        OP_BRAPOSZERO. Then cancel the possessive flag so that the default action        stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO,
4790        below, of wrapping everything inside atomic brackets, does not happen.        convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that
4791          the default action below, of wrapping everything inside atomic brackets,
4792          does not happen.
4793    
4794        Then, when we are doing the actual compile phase, check to see whether        Then, when we are doing the actual compile phase, check to see whether
4795        this group is one that could match an empty string. If so, convert the        this group is one that could match an empty string. If so, convert the
# Line 4745  for (;; ptr++) Line 4801  for (;; ptr++)
4801          {          {
4802          uschar *ketcode = code - 1 - LINK_SIZE;          uschar *ketcode = code - 1 - LINK_SIZE;
4803          uschar *bracode = ketcode - GET(ketcode, 1);          uschar *bracode = ketcode - GET(ketcode, 1);
4804    
4805          if (*bracode == OP_ONCE)          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
4806            if (*bracode == OP_ONCE)
4807            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
4808          else          else
4809            {            {
# Line 4793  for (;; ptr++) Line 4850  for (;; ptr++)
4850        }        }
4851    
4852      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', or if certain optimization
4853      tests above succeeded, possessive_quantifier is TRUE. For some of the      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
4854      simpler opcodes, there is an special alternative opcode for this. For      there are special alternative opcodes for this case. For anything else, we
4855      anything else, we wrap the entire repeated item inside OP_ONCE brackets.      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
4856      The '+' notation is just syntactic sugar, taken from Sun's Java package,      notation is just syntactic sugar, taken from Sun's Java package, but the
4857      but the special opcodes can optimize it a bit. The repeated item starts at      special opcodes can optimize it.
4858      tempcode, not at previous, which might be the first part of a string whose  
4859      (former) last char we repeated.      Possessively repeated subpatterns have already been handled in the code
4860        just above, so possessive_quantifier is always FALSE for them at this
4861        stage.
4862    
4863        Note that the repeated item starts at tempcode, not at previous, which
4864        might be the first part of a string whose (former) last char we repeated.
4865    
4866      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4867      an 'upto' may follow. We skip over an 'exact' item, and then test the      an 'upto' may follow. We skip over an 'exact' item, and then test the
# Line 4924  for (;; ptr++) Line 4986  for (;; ptr++)
4986          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
4987              strncmp((char *)name, vn, namelen) == 0)              strncmp((char *)name, vn, namelen) == 0)
4988            {            {
4989            /* Check for open captures before ACCEPT */            /* Check for open captures before ACCEPT and convert it to
4990              ASSERT_ACCEPT if in an assertion. */
4991    
4992            if (verbs[i].op == OP_ACCEPT)            if (verbs[i].op == OP_ACCEPT)
4993              {              {
4994              open_capitem *oc;              open_capitem *oc;
4995                if (arglen != 0)
4996                  {
4997                  *errorcodeptr = ERR59;
4998                  goto FAILED;
4999                  }
5000              cd->had_accept = TRUE;              cd->had_accept = TRUE;
5001              for (oc = cd->open_caps; oc != NULL; oc = oc->next)              for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5002                {                {
5003                *code++ = OP_CLOSE;                *code++ = OP_CLOSE;
5004                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
5005                }                }
5006                *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5007              }              }
5008    
5009            /* Handle the cases with/without an argument */            /* Handle other cases with/without an argument */
5010    
5011            if (arglen == 0)            else if (arglen == 0)
5012              {              {
5013              if (verbs[i].op < 0)   /* Argument is mandatory */              if (verbs[i].op < 0)   /* Argument is mandatory */
5014                {                {
# Line 5228  for (;; ptr++) Line 5297  for (;; ptr++)
5297          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5298          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5299          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
5300            cd->assert_depth += 1;
5301          ptr++;          ptr++;
5302          break;          break;
5303    
# Line 5242  for (;; ptr++) Line 5312  for (;; ptr++)
5312            continue;            continue;
5313            }            }
5314          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
5315            cd->assert_depth += 1;
5316          break;          break;
5317    
5318    
# Line 5251  for (;; ptr++) Line 5322  for (;; ptr++)
5322            {            {
5323            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5324            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
5325              cd->assert_depth += 1;
5326            ptr += 2;            ptr += 2;
5327            break;            break;
5328    
5329            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5330            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
5331              cd->assert_depth += 1;
5332            ptr += 2;            ptr += 2;
5333            break;            break;
5334    
# Line 5646  for (;; ptr++) Line 5719  for (;; ptr++)
5719    
5720                /* Fudge the value of "called" so that when it is inserted as an                /* Fudge the value of "called" so that when it is inserted as an
5721                offset below, what is actually inserted is the reference number                offset below, what is actually inserted is the reference number
5722                of the group. */                of the group. Then remember the forward reference. */
5723    
5724                called = cd->start_code + recno;                called = cd->start_code + recno;
5725                PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
5726                }                }
5727    
5728              /* If not a forward reference, and the subpattern is still open,              /* If not a forward reference, and the subpattern is still open,
# Line 5664  for (;; ptr++) Line 5737  for (;; ptr++)
5737                }                }
5738              }              }
5739    
5740            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item. */
5741            "once" brackets. Set up a "previous group" length so that a  
           subsequent quantifier will work. */  
   
           *code = OP_ONCE;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
5742            *code = OP_RECURSE;            *code = OP_RECURSE;
5743            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
5744            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
   
           *code = OP_KET;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
           length_prevgroup = 3 + 3*LINK_SIZE;  
5745            }            }
5746    
5747          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 5823  for (;; ptr++) Line 5884  for (;; ptr++)
5884             &length_prevgroup           /* Pre-compile phase */             &length_prevgroup           /* Pre-compile phase */
5885           ))           ))
5886        goto FAILED;        goto FAILED;
5887    
5888        if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
5889          cd->assert_depth -= 1;
5890    
5891      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
5892      group, while tempcode has been updated to point past the end of the group      group, while tempcode has been updated to point past the end of the group
# Line 5894  for (;; ptr++) Line 5958  for (;; ptr++)
5958          goto FAILED;          goto FAILED;
5959          }          }
5960        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5961        *code++ = OP_BRA;        code++;   /* This already contains bravalue */
5962        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
5963        *code++ = OP_KET;        *code++ = OP_KET;
5964        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
# Line 6062  for (;; ptr++) Line 6126  for (;; ptr++)
6126          }          }
6127    
6128        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6129        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax).  */
6130    
6131        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||        if (-c == ESC_k)
           ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))  
6132          {          {
6133            if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
6134              ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
6135              {
6136              *errorcodeptr = ERR69;
6137              break;
6138              }
6139          is_recurse = FALSE;          is_recurse = FALSE;
6140          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6141            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6142            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6143          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
6144          }          }
6145    
6146        /* Back references are handled specially; must disable firstbyte if        /* Back references are handled specially; must disable firstbyte if
6147        not set to cope with cases like (?=(\w+))\1: which would otherwise set        not set to cope with cases like (?=(\w+))\1: which would otherwise set
# Line 6969  utf8 = (options & PCRE_UTF8) != 0; Line 7038  utf8 = (options & PCRE_UTF8) != 0;
7038    
7039  /* Can't support UTF8 unless PCRE has been compiled to include the code. The  /* Can't support UTF8 unless PCRE has been compiled to include the code. The
7040  return of an error code from _pcre_valid_utf8() is a new feature, introduced in  return of an error code from _pcre_valid_utf8() is a new feature, introduced in
7041  release 8.13. The only use we make of it here is to adjust the offset value to  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7042  the end of the string for a short string error, for compatibility with previous  not used here. */
 versions. */  
7043    
7044  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
7045  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7046       (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1, &errorcode)) >= 0)       (errorcode = _pcre_valid_utf8((USPTR)pattern, -1, erroroffset)) != 0)
7047    {    {
7048    errorcode = ERR44;    errorcode = ERR44;
7049    goto PCRE_EARLY_ERROR_RETURN2;    goto PCRE_EARLY_ERROR_RETURN2;
# Line 7146  field; this time it's used for rememberi Line 7214  field; this time it's used for rememberi
7214  */  */
7215    
7216  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7217    cd->assert_depth = 0;
7218  cd->bracount = 0;  cd->bracount = 0;
7219  cd->names_found = 0;  cd->names_found = 0;
7220  cd->name_table = (uschar *)re + re->name_table_offset;  cd->name_table = (uschar *)re + re->name_table_offset;

Legend:
Removed from v.604  
changed lines
  Added in v.629

  ViewVC Help
Powered by ViewVC 1.1.5