/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 608 by ph10, Sun Jun 12 16:25:55 2011 UTC revision 615 by ph10, Mon Jul 11 14:23:06 2011 UTC
# Line 276  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM Line 276  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM
276         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
277         RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,         RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
278         RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,         RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
279         RM61,  RM62, RM63, RM64 };         RM61,  RM62, RM63};
280    
281  /* These versions of the macros use the stack, as normal. There are debugging  /* These versions of the macros use the stack, as normal. There are debugging
282  versions and production versions. Note that the "rw" argument of RMATCH isn't  versions and production versions. Note that the "rw" argument of RMATCH isn't
# Line 858  for (;;) Line 858  for (;;)
858        md->offset_vector[offset+1] = save_offset2;        md->offset_vector[offset+1] = save_offset2;
859        md->offset_vector[md->offset_end - number] = save_offset3;        md->offset_vector[md->offset_end - number] = save_offset3;
860    
861        if (rrc != MATCH_THEN) md->mark = markptr;        if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
862        RRETURN(MATCH_NOMATCH);        RRETURN(MATCH_NOMATCH);
863        }        }
864    
# Line 875  for (;;) Line 875  for (;;)
875    
876      /* Non-capturing bracket, except for possessive with unlimited repeat. Loop      /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877      for all the alternatives. When we get to the final alternative within the      for all the alternatives. When we get to the final alternative within the
878      brackets, we would return the result of a recursive call to match()      brackets, we used to return the result of a recursive call to match()
879      whatever happened. We can reduce stack usage by turning this into a tail      whatever happened so it was possible to reduce stack usage by turning this
880      recursion, except in the case of a possibly empty group.*/      into a tail recursion, except in the case of a possibly empty group.
881        However, now that there is the possibility of (*THEN) occurring in the final
882        alternative, this optimization is no longer possible. */
883    
884      case OP_BRA:      case OP_BRA:
885      case OP_SBRA:      case OP_SBRA:
886      DPRINTF(("start non-capturing bracket\n"));      DPRINTF(("start non-capturing bracket\n"));
887      for (;;)      for (;;)
888        {        {
       if (ecode[GET(ecode, 1)] != OP_ALT)   /* Final alternative */  
         {  
         if (op >= OP_SBRA)   /* Possibly empty group */  
           {  
           md->match_function_type = MATCH_CBEGROUP;  
           RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,  
             RM48);  
           if (rrc == MATCH_NOMATCH) md->mark = markptr;  
           RRETURN(rrc);  
           }  
         /* Not a possibly empty group; use tail recursion */  
         ecode += _pcre_OP_lengths[*ecode];  
         DPRINTF(("bracket 0 tail recursion\n"));  
         goto TAIL_RECURSE;  
         }  
   
       /* For non-final alternatives, continue the loop for a NOMATCH result;  
       otherwise return. */  
   
889        if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;        if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
890        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
891          RM2);          RM2);
# Line 910  for (;;) Line 893  for (;;)
893            (rrc != MATCH_THEN || md->start_match_ptr != ecode))            (rrc != MATCH_THEN || md->start_match_ptr != ecode))
894          RRETURN(rrc);          RRETURN(rrc);
895        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
896          if (*ecode != OP_ALT) break;
897        }        }
898      /* Control never reaches here. */  
899        if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
900        RRETURN(MATCH_NOMATCH);
901    
902      /* Handle possessive capturing brackets with an unlimited repeat. We come      /* Handle possessive capturing brackets with an unlimited repeat. We come
903      here from BRAZERO with allow_zero set TRUE. The offset_vector values are      here from BRAZERO with allow_zero set TRUE. The offset_vector values are
# Line 980  for (;;) Line 966  for (;;)
966          ecode += GET(ecode, 1);          ecode += GET(ecode, 1);
967          if (*ecode != OP_ALT) break;          if (*ecode != OP_ALT) break;
968          }          }
969    
970        if (!matched_once)        if (!matched_once)
971          {          {
972          md->offset_vector[offset] = save_offset1;          md->offset_vector[offset] = save_offset1;
# Line 988  for (;;) Line 974  for (;;)
974          md->offset_vector[md->offset_end - number] = save_offset3;          md->offset_vector[md->offset_end - number] = save_offset3;
975          }          }
976    
977        if (rrc != MATCH_THEN) md->mark = markptr;        if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
978        if (allow_zero || matched_once)        if (allow_zero || matched_once)
979          {          {
980          ecode += 1 + LINK_SIZE;          ecode += 1 + LINK_SIZE;
# Line 1026  for (;;) Line 1012  for (;;)
1012        {        {
1013        if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;        if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1014        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1015          eptrb, RM64);          eptrb, RM48);
1016        if (rrc == MATCH_KETRPOS)        if (rrc == MATCH_KETRPOS)
1017          {          {
1018            offset_top = md->end_offset_top;
1019          eptr = md->end_match_ptr;          eptr = md->end_match_ptr;
1020          ecode = md->start_code + code_offset;          ecode = md->start_code + code_offset;
1021          matched_once = TRUE;          matched_once = TRUE;
# Line 1040  for (;;) Line 1027  for (;;)
1027        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
1028        if (*ecode != OP_ALT) break;        if (*ecode != OP_ALT) break;
1029        }        }
1030    
1031      if (matched_once || allow_zero)      if (matched_once || allow_zero)
1032        {        {
1033        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
# Line 1053  for (;;) Line 1040  for (;;)
1040      /* Conditional group: compilation checked that there are no more than      /* Conditional group: compilation checked that there are no more than
1041      two branches. If the condition is false, skipping the first branch takes us      two branches. If the condition is false, skipping the first branch takes us
1042      past the end if there is only one branch, but that's OK because that is      past the end if there is only one branch, but that's OK because that is
1043      exactly what going to the ket would do. As there is only one branch to be      exactly what going to the ket would do. */
     obeyed, we can use tail recursion to avoid using another stack frame. */  
1044    
1045      case OP_COND:      case OP_COND:
1046      case OP_SCOND:      case OP_SCOND:
# Line 1259  for (;;) Line 1245  for (;;)
1245        }        }
1246    
1247      /* We are now at the branch that is to be obeyed. As there is only one,      /* We are now at the branch that is to be obeyed. As there is only one,
1248      we can use tail recursion to avoid using another stack frame, except when      we used to use tail recursion to avoid using another stack frame, except
1249      we have an unlimited repeat of a possibly empty group. If the second      when there was an unlimited repeat of a possibly empty group. However, that
1250      alternative doesn't exist, we can just plough on. */      strategy no longer works because of the possibility of (*THEN) being
1251        encountered in the branch. A recursive call to match() is always required,
1252        unless the second alternative doesn't exist, in which case we can just
1253        plough on. */
1254    
1255      if (condition || *ecode == OP_ALT)      if (condition || *ecode == OP_ALT)
1256        {        {
1257        ecode += 1 + LINK_SIZE;        if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1258        if (op == OP_SCOND)        /* Possibly empty group */        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1259          {        if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1260          md->match_function_type = MATCH_CBEGROUP;          rrc = MATCH_NOMATCH;
1261          RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);        RRETURN(rrc);
         RRETURN(rrc);  
         }  
       else goto TAIL_RECURSE;  
1262        }        }
1263      else                         /* Condition false & no alternative */      else                         /* Condition false & no alternative */
1264        {        {
# Line 1310  for (;;) Line 1296  for (;;)
1296      recursion, continue from after the call. */      recursion, continue from after the call. */
1297    
1298      case OP_ACCEPT:      case OP_ACCEPT:
1299        case OP_ASSERT_ACCEPT:
1300      case OP_END:      case OP_END:
1301      if (md->recursive != NULL)      if (md->recursive != NULL)
1302        {        {
# Line 1325  for (;;) Line 1312  for (;;)
1312          }          }
1313        }        }
1314    
1315      /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is      /* Otherwise, if we have matched an empty string, fail if not in an
1316      set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of      assertion and if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1317      the subject. In both cases, backtracking will then try other alternatives,      is set and we have matched at the start of the subject. In both cases,
1318      if any. */      backtracking will then try other alternatives, if any. */
1319    
1320      else if (eptr == mstart &&      else if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1321          (md->notempty ||          (md->notempty ||
1322            (md->notempty_atstart &&            (md->notempty_atstart &&
1323              mstart == md->start_subject + md->start_offset)))              mstart == md->start_subject + md->start_offset)))
# Line 1614  for (;;) Line 1601  for (;;)
1601    
1602      if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);      if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1603    
1604      /* Continue as from after the assertion, updating the offsets high water      /* Continue after the group, updating the offsets high water mark, since
1605      mark, since extracts may have been taken. */      extracts may have been taken. */
1606    
1607      do ecode += GET(ecode, 1); while (*ecode == OP_ALT);      do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1608    
# Line 1753  for (;;) Line 1740  for (;;)
1740        md->capture_last = number;        md->capture_last = number;
1741        if (offset >= md->offset_max) md->offset_overflow = TRUE; else        if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1742          {          {
1743            /* If offset is greater than offset_top, it means that we are
1744            "skipping" a capturing group, and that group's offsets must be marked
1745            unset. In earlier versions of PCRE, all the offsets were unset at the
1746            start of matching, but this doesn't work because atomic groups and
1747            assertions can cause a value to be set that should later be unset.
1748            Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1749            part of the atomic group, but this is not on the final matching path,
1750            so must be unset when 2 is set. (If there is no group 2, there is no
1751            problem, because offset_top will then be 2, indicating no capture.) */
1752    
1753            if (offset > offset_top)
1754              {
1755              register int *iptr = md->offset_vector + offset_top;
1756              register int *iend = md->offset_vector + offset;
1757              while (iptr < iend) *iptr++ = -1;
1758              }
1759    
1760            /* Now make the extraction */
1761    
1762          md->offset_vector[offset] =          md->offset_vector[offset] =
1763            md->offset_vector[md->offset_end - number];            md->offset_vector[md->offset_end - number];
1764          md->offset_vector[offset+1] = (int)(eptr - md->start_subject);          md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
# Line 5703  switch (frame->Xwhere) Line 5709  switch (frame->Xwhere)
5709    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5710    LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)    LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5711    LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)    LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5712    LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)    LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5713  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5714    LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)    LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5715    LBL(32) LBL(34) LBL(42) LBL(46)    LBL(32) LBL(34) LBL(42) LBL(46)
# Line 5805  pcre_exec(const pcre *argument_re, const Line 5811  pcre_exec(const pcre *argument_re, const
5811    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5812    int offsetcount)    int offsetcount)
5813  {  {
5814  int rc, resetcount, ocount;  int rc, ocount;
5815  int first_byte = -1;  int first_byte = -1;
5816  int req_byte = -1;  int req_byte = -1;
5817  int req_byte2 = -1;  int req_byte2 = -1;
# Line 5913  utf8 = md->utf8 = (re->options & PCRE_UT Line 5919  utf8 = md->utf8 = (re->options & PCRE_UT
5919  md->use_ucp = (re->options & PCRE_UCP) != 0;  md->use_ucp = (re->options & PCRE_UCP) != 0;
5920  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5921    
5922    /* Some options are unpacked into BOOL variables in the hope that testing
5923    them will be faster than individual option bits. */
5924    
5925  md->notbol = (options & PCRE_NOTBOL) != 0;  md->notbol = (options & PCRE_NOTBOL) != 0;
5926  md->noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
5927  md->notempty = (options & PCRE_NOTEMPTY) != 0;  md->notempty = (options & PCRE_NOTEMPTY) != 0;
5928  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5929  md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :  md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5930                ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;                ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5931    
5932    
5933  md->hitend = FALSE;  md->hitend = FALSE;
5934  md->mark = NULL;                        /* In case never set */  md->mark = NULL;                        /* In case never set */
5935    
# Line 6049  md->offset_max = (2*ocount)/3; Line 6060  md->offset_max = (2*ocount)/3;
6060  md->offset_overflow = FALSE;  md->offset_overflow = FALSE;
6061  md->capture_last = -1;  md->capture_last = -1;
6062    
 /* Compute the minimum number of offsets that we need to reset each time. Doing  
 this makes a huge difference to execution time when there aren't many brackets  
 in the pattern. */  
   
 resetcount = 2 + re->top_bracket * 2;  
 if (resetcount > offsetcount) resetcount = ocount;  
   
6063  /* Reset the working variable associated with each extraction. These should  /* Reset the working variable associated with each extraction. These should
6064  never be used unless previously set, but they get saved and restored, and so we  never be used unless previously set, but they get saved and restored, and so we
6065  initialize them to avoid reading uninitialized locations. */  initialize them to avoid reading uninitialized locations. Also, unset the
6066    offsets for the matched string. This is really just for tidiness with callouts,
6067    in case they inspect these fields. */
6068    
6069  if (md->offset_vector != NULL)  if (md->offset_vector != NULL)
6070    {    {
6071    register int *iptr = md->offset_vector + ocount;    register int *iptr = md->offset_vector + ocount;
6072    register int *iend = iptr - resetcount/2 + 1;    register int *iend = iptr - re->top_bracket;
6073      if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6074    while (--iptr >= iend) *iptr = -1;    while (--iptr >= iend) *iptr = -1;
6075      md->offset_vector[0] = md->offset_vector[1] = -1;
6076    }    }
6077    
6078  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 6098  if ((re->flags & PCRE_REQCHSET) != 0) Line 6106  if ((re->flags & PCRE_REQCHSET) != 0)
6106    }    }
6107    
6108    
6109    
6110    
6111  /* ==========================================================================*/  /* ==========================================================================*/
6112    
6113  /* Loop for handling unanchored repeated matching attempts; for anchored regexs  /* Loop for handling unanchored repeated matching attempts; for anchored regexs
# Line 6108  for(;;) Line 6118  for(;;)
6118    USPTR save_end_subject = end_subject;    USPTR save_end_subject = end_subject;
6119    USPTR new_start_match;    USPTR new_start_match;
6120    
   /* Reset the maximum number of extractions we might see. */  
   
   if (md->offset_vector != NULL)  
     {  
     register int *iptr = md->offset_vector;  
     register int *iend = iptr + resetcount;  
     while (iptr < iend) *iptr++ = -1;  
     }  
   
6121    /* If firstline is TRUE, the start of the match is constrained to the first    /* If firstline is TRUE, the start of the match is constrained to the first
6122    line of a multiline string. That is, the match must be before or at the first    line of a multiline string. That is, the match must be before or at the first
6123    newline. Implement this by temporarily adjusting end_subject so that we stop    newline. Implement this by temporarily adjusting end_subject so that we stop
# Line 6306  for(;;) Line 6307  for(;;)
6307    md->start_used_ptr = start_match;    md->start_used_ptr = start_match;
6308    md->match_call_count = 0;    md->match_call_count = 0;
6309    md->match_function_type = 0;    md->match_function_type = 0;
6310      md->end_offset_top = 0;
6311    rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);    rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6312    if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;    if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6313    

Legend:
Removed from v.608  
changed lines
  Added in v.615

  ViewVC Help
Powered by ViewVC 1.1.5