/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 613 by ph10, Sat Jul 2 16:59:52 2011 UTC revision 629 by ph10, Fri Jul 22 09:18:11 2011 UTC
# Line 409  static const char error_texts[] = Line 409  static const char error_texts[] =
409    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
410    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with PCRE_UCP support\0"
411    "\\c must be followed by an ASCII character\0"    "\\c must be followed by an ASCII character\0"
412      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
413    ;    ;
414    
415  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 1694  _pcre_find_bracket(const uschar *code, B Line 1695  _pcre_find_bracket(const uschar *code, B
1695  for (;;)  for (;;)
1696    {    {
1697    register int c = *code;    register int c = *code;
1698    
1699    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1700    
1701    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1974  for (code = first_significant_code(code Line 1976  for (code = first_significant_code(code
1976      }      }
1977    
1978    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
1979    implies a subroutine call, we can scan it. */    implies a backward reference subroutine call, we can scan it. If it's a
1980      forward reference subroutine call, we can't. To detect forward reference
1981      we have to scan up the list that is kept in the workspace. This function is
1982      called only when doing the real compile, not during the pre-compile that
1983      measures the size of the compiled pattern. */
1984    
1985    if (c == OP_RECURSE)    if (c == OP_RECURSE)
1986      {      {
1987      BOOL empty_branch = FALSE;      const uschar *scode;
1988      const uschar *scode = cd->start_code + GET(code, 1);      BOOL empty_branch;
1989    
1990        /* Test for forward reference */
1991    
1992        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
1993          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
1994    
1995        /* Not a forward reference, test for completed backward reference */
1996    
1997        empty_branch = FALSE;
1998        scode = cd->start_code + GET(code, 1);
1999      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2000    
2001        /* Completed backwards reference */
2002    
2003      do      do
2004        {        {
2005        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf8, cd))
# Line 1991  for (code = first_significant_code(code Line 2010  for (code = first_significant_code(code
2010        scode += GET(scode, 1);        scode += GET(scode, 1);
2011        }        }
2012      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2013    
2014      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2015      continue;      continue;
2016      }      }
# Line 2216  return TRUE; Line 2236  return TRUE;
2236  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2237  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2238  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2239    This function is called only during the real compile, not during the
2240    pre-compile.
2241    
2242  Arguments:  Arguments:
2243    code        points to start of the recursion    code        points to start of the recursion
# Line 4207  for (;; ptr++) Line 4229  for (;; ptr++)
4229        ptr++;        ptr++;
4230        }        }
4231      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4232    
4233        /* If previous was a recursion call, wrap it in atomic brackets so that
4234        previous becomes the atomic group. All recursions were so wrapped in the
4235        past, but it no longer happens for non-repeated recursions. In fact, the
4236        repeated ones could be re-implemented independently so as not to need this,
4237        but for the moment we rely on the code for repeating groups. */
4238    
4239        if (*previous == OP_RECURSE)
4240          {
4241          memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
4242          *previous = OP_ONCE;
4243          PUT(previous, 1, 2 + 2*LINK_SIZE);
4244          previous[2 + 2*LINK_SIZE] = OP_KET;
4245          PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4246          code += 2 + 2 * LINK_SIZE;
4247          length_prevgroup = 3 + 3*LINK_SIZE;
4248    
4249          /* When actually compiling, we need to check whether this was a forward
4250          reference, and if so, adjust the offset. */
4251    
4252          if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4253            {
4254            int offset = GET(cd->hwm, -LINK_SIZE);
4255            if (offset == previous + 1 - cd->start_code)
4256              PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4257            }
4258          }
4259    
4260        /* Now handle repetition for the different types of item. */
4261    
4262      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4263      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
# Line 4726  for (;; ptr++) Line 4777  for (;; ptr++)
4777          }          }
4778    
4779        /* If the maximum is unlimited, set a repeater in the final copy. For        /* If the maximum is unlimited, set a repeater in the final copy. For
4780        ONCE brackets, that's all we need to do.        ONCE brackets, that's all we need to do. However, possessively repeated
4781          ONCE brackets can be converted into non-capturing brackets, as the
4782          behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
4783          deal with possessive ONCEs specially.
4784    
4785        Otherwise, if the quantifier was possessive, we convert the BRA code to        Otherwise, if the quantifier was possessive, we convert the BRA code to
4786        the POS form, and the KET code to KETRPOS. (It turns out to be convenient        the POS form, and the KET code to KETRPOS. (It turns out to be convenient
# Line 4747  for (;; ptr++) Line 4801  for (;; ptr++)
4801          {          {
4802          uschar *ketcode = code - 1 - LINK_SIZE;          uschar *ketcode = code - 1 - LINK_SIZE;
4803          uschar *bracode = ketcode - GET(ketcode, 1);          uschar *bracode = ketcode - GET(ketcode, 1);
4804    
4805          if (*bracode == OP_ONCE)          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
4806            if (*bracode == OP_ONCE)
4807            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
4808          else          else
4809            {            {
# Line 5664  for (;; ptr++) Line 5719  for (;; ptr++)
5719    
5720                /* Fudge the value of "called" so that when it is inserted as an                /* Fudge the value of "called" so that when it is inserted as an
5721                offset below, what is actually inserted is the reference number                offset below, what is actually inserted is the reference number
5722                of the group. */                of the group. Then remember the forward reference. */
5723    
5724                called = cd->start_code + recno;                called = cd->start_code + recno;
5725                PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
5726                }                }
5727    
5728              /* If not a forward reference, and the subpattern is still open,              /* If not a forward reference, and the subpattern is still open,
# Line 5682  for (;; ptr++) Line 5737  for (;; ptr++)
5737                }                }
5738              }              }
5739    
5740            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item. */
5741            "once" brackets. Set up a "previous group" length so that a  
           subsequent quantifier will work. */  
   
           *code = OP_ONCE;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
5742            *code = OP_RECURSE;            *code = OP_RECURSE;
5743            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
5744            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
   
           *code = OP_KET;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
           length_prevgroup = 3 + 3*LINK_SIZE;  
5745            }            }
5746    
5747          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 6083  for (;; ptr++) Line 6126  for (;; ptr++)
6126          }          }
6127    
6128        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6129        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax).  */
6130    
6131        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||        if (-c == ESC_k)
           ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))  
6132          {          {
6133            if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
6134              ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
6135              {
6136              *errorcodeptr = ERR69;
6137              break;
6138              }
6139          is_recurse = FALSE;          is_recurse = FALSE;
6140          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6141            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6142            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6143          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
6144          }          }
6145    
6146        /* Back references are handled specially; must disable firstbyte if        /* Back references are handled specially; must disable firstbyte if
6147        not set to cope with cases like (?=(\w+))\1: which would otherwise set        not set to cope with cases like (?=(\w+))\1: which would otherwise set

Legend:
Removed from v.613  
changed lines
  Added in v.629

  ViewVC Help
Powered by ViewVC 1.1.5