/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 613 by ph10, Sat Jul 2 16:59:52 2011 UTC revision 635 by ph10, Sat Jul 23 16:19:50 2011 UTC
# Line 409  static const char error_texts[] = Line 409  static const char error_texts[] =
409    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
410    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with PCRE_UCP support\0"
411    "\\c must be followed by an ASCII character\0"    "\\c must be followed by an ASCII character\0"
412      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
413    ;    ;
414    
415  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 1694  _pcre_find_bracket(const uschar *code, B Line 1695  _pcre_find_bracket(const uschar *code, B
1695  for (;;)  for (;;)
1696    {    {
1697    register int c = *code;    register int c = *code;
1698    
1699    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1700    
1701    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1974  for (code = first_significant_code(code Line 1976  for (code = first_significant_code(code
1976      }      }
1977    
1978    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
1979    implies a subroutine call, we can scan it. */    implies a backward reference subroutine call, we can scan it. If it's a
1980      forward reference subroutine call, we can't. To detect forward reference
1981      we have to scan up the list that is kept in the workspace. This function is
1982      called only when doing the real compile, not during the pre-compile that
1983      measures the size of the compiled pattern. */
1984    
1985    if (c == OP_RECURSE)    if (c == OP_RECURSE)
1986      {      {
1987      BOOL empty_branch = FALSE;      const uschar *scode;
1988      const uschar *scode = cd->start_code + GET(code, 1);      BOOL empty_branch;
1989    
1990        /* Test for forward reference */
1991    
1992        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
1993          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
1994    
1995        /* Not a forward reference, test for completed backward reference */
1996    
1997        empty_branch = FALSE;
1998        scode = cd->start_code + GET(code, 1);
1999      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2000    
2001        /* Completed backwards reference */
2002    
2003      do      do
2004        {        {
2005        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf8, cd))
# Line 1991  for (code = first_significant_code(code Line 2010  for (code = first_significant_code(code
2010        scode += GET(scode, 1);        scode += GET(scode, 1);
2011        }        }
2012      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2013    
2014      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2015      continue;      continue;
2016      }      }
# Line 2216  return TRUE; Line 2236  return TRUE;
2236  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2237  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2238  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2239    This function is called only during the real compile, not during the
2240    pre-compile.
2241    
2242  Arguments:  Arguments:
2243    code        points to start of the recursion    code        points to start of the recursion
# Line 3017  int greedy_default, greedy_non_default; Line 3039  int greedy_default, greedy_non_default;
3039  int firstbyte, reqbyte;  int firstbyte, reqbyte;
3040  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
3041  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
3042  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3043  int after_manual_callout = 0;  int after_manual_callout = 0;
3044  int length_prevgroup = 0;  int length_prevgroup = 0;
3045  register int c;  register int c;
# Line 3035  uschar *previous_callout = NULL; Line 3057  uschar *previous_callout = NULL;
3057  uschar *save_hwm = NULL;  uschar *save_hwm = NULL;
3058  uschar classbits[32];  uschar classbits[32];
3059    
3060    /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3061    must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3062    dynamically as we process the pattern. */
3063    
3064  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3065  BOOL class_utf8;  BOOL class_utf8;
3066  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
# Line 3215  for (;; ptr++) Line 3241  for (;; ptr++)
3241      previous_callout = NULL;      previous_callout = NULL;
3242      }      }
3243    
3244    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3245    
3246    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3247      {      {
# Line 4207  for (;; ptr++) Line 4233  for (;; ptr++)
4233        ptr++;        ptr++;
4234        }        }
4235      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4236    
4237        /* If previous was a recursion call, wrap it in atomic brackets so that
4238        previous becomes the atomic group. All recursions were so wrapped in the
4239        past, but it no longer happens for non-repeated recursions. In fact, the
4240        repeated ones could be re-implemented independently so as not to need this,
4241        but for the moment we rely on the code for repeating groups. */
4242    
4243        if (*previous == OP_RECURSE)
4244          {
4245          memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
4246          *previous = OP_ONCE;
4247          PUT(previous, 1, 2 + 2*LINK_SIZE);
4248          previous[2 + 2*LINK_SIZE] = OP_KET;
4249          PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4250          code += 2 + 2 * LINK_SIZE;
4251          length_prevgroup = 3 + 3*LINK_SIZE;
4252    
4253          /* When actually compiling, we need to check whether this was a forward
4254          reference, and if so, adjust the offset. */
4255    
4256          if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4257            {
4258            int offset = GET(cd->hwm, -LINK_SIZE);
4259            if (offset == previous + 1 - cd->start_code)
4260              PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4261            }
4262          }
4263    
4264        /* Now handle repetition for the different types of item. */
4265    
4266      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4267      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
# Line 4726  for (;; ptr++) Line 4781  for (;; ptr++)
4781          }          }
4782    
4783        /* If the maximum is unlimited, set a repeater in the final copy. For        /* If the maximum is unlimited, set a repeater in the final copy. For
4784        ONCE brackets, that's all we need to do.        ONCE brackets, that's all we need to do. However, possessively repeated
4785          ONCE brackets can be converted into non-capturing brackets, as the
4786          behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
4787          deal with possessive ONCEs specially.
4788    
4789        Otherwise, if the quantifier was possessive, we convert the BRA code to        Otherwise, if the quantifier was possessive, we convert the BRA code to
4790        the POS form, and the KET code to KETRPOS. (It turns out to be convenient        the POS form, and the KET code to KETRPOS. (It turns out to be convenient
# Line 4747  for (;; ptr++) Line 4805  for (;; ptr++)
4805          {          {
4806          uschar *ketcode = code - 1 - LINK_SIZE;          uschar *ketcode = code - 1 - LINK_SIZE;
4807          uschar *bracode = ketcode - GET(ketcode, 1);          uschar *bracode = ketcode - GET(ketcode, 1);
4808    
4809          if (*bracode == OP_ONCE)          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
4810            if (*bracode == OP_ONCE)
4811            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
4812          else          else
4813            {            {
# Line 5517  for (;; ptr++) Line 5576  for (;; ptr++)
5576    
5577            temp = cd->end_pattern;            temp = cd->end_pattern;
5578            cd->end_pattern = ptr;            cd->end_pattern = ptr;
5579            recno = find_parens(cd, name, namelen,            recno = find_parens(cd, name, namelen,
5580              (options & PCRE_EXTENDED) != 0, utf8);              (options & PCRE_EXTENDED) != 0, utf8);
5581            cd->end_pattern = temp;            cd->end_pattern = temp;
5582            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
# Line 5664  for (;; ptr++) Line 5723  for (;; ptr++)
5723    
5724                /* Fudge the value of "called" so that when it is inserted as an                /* Fudge the value of "called" so that when it is inserted as an
5725                offset below, what is actually inserted is the reference number                offset below, what is actually inserted is the reference number
5726                of the group. */                of the group. Then remember the forward reference. */
5727    
5728                called = cd->start_code + recno;                called = cd->start_code + recno;
5729                PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
5730                }                }
5731    
5732              /* If not a forward reference, and the subpattern is still open,              /* If not a forward reference, and the subpattern is still open,
# Line 5682  for (;; ptr++) Line 5741  for (;; ptr++)
5741                }                }
5742              }              }
5743    
5744            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item. */
5745            "once" brackets. Set up a "previous group" length so that a  
           subsequent quantifier will work. */  
   
           *code = OP_ONCE;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
5746            *code = OP_RECURSE;            *code = OP_RECURSE;
5747            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
5748            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
   
           *code = OP_KET;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
           length_prevgroup = 3 + 3*LINK_SIZE;  
5749            }            }
5750    
5751          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 6083  for (;; ptr++) Line 6130  for (;; ptr++)
6130          }          }
6131    
6132        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6133        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax).  */
6134    
6135        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||        if (-c == ESC_k)
           ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))  
6136          {          {
6137            if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
6138              ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
6139              {
6140              *errorcodeptr = ERR69;
6141              break;
6142              }
6143          is_recurse = FALSE;          is_recurse = FALSE;
6144          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6145            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6146            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6147          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
6148          }          }
6149    
6150        /* Back references are handled specially; must disable firstbyte if        /* Back references are handled specially; must disable firstbyte if
6151        not set to cope with cases like (?=(\w+))\1: which would otherwise set        not set to cope with cases like (?=(\w+))\1: which would otherwise set

Legend:
Removed from v.613  
changed lines
  Added in v.635

  ViewVC Help
Powered by ViewVC 1.1.5