/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 608 by ph10, Sun Jun 12 16:25:55 2011 UTC revision 618 by ph10, Sat Jul 16 17:24:16 2011 UTC
# Line 76  negative to avoid the external error cod Line 76  negative to avoid the external error cod
76  #define MATCH_ACCEPT       (-999)  #define MATCH_ACCEPT       (-999)
77  #define MATCH_COMMIT       (-998)  #define MATCH_COMMIT       (-998)
78  #define MATCH_KETRPOS      (-997)  #define MATCH_KETRPOS      (-997)
79  #define MATCH_PRUNE        (-996)  #define MATCH_ONCE         (-996)
80  #define MATCH_SKIP         (-995)  #define MATCH_PRUNE        (-995)
81  #define MATCH_SKIP_ARG     (-994)  #define MATCH_SKIP         (-994)
82  #define MATCH_THEN         (-993)  #define MATCH_SKIP_ARG     (-993)
83    #define MATCH_THEN         (-992)
84    
85  /* This is a convenience macro for code that occurs many times. */  /* This is a convenience macro for code that occurs many times. */
86    
# Line 276  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM Line 277  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM
277         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278         RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,         RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279         RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,         RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280         RM61,  RM62, RM63, RM64 };         RM61,  RM62, RM63, RM64, RM65, RM66 };
281    
282  /* These versions of the macros use the stack, as normal. There are debugging  /* These versions of the macros use the stack, as normal. There are debugging
283  versions and production versions. Note that the "rw" argument of RMATCH isn't  versions and production versions. Note that the "rw" argument of RMATCH isn't
# Line 808  for (;;) Line 809  for (;;)
809      subject position in the working slot at the top of the vector. We mustn't      subject position in the working slot at the top of the vector. We mustn't
810      change the current values of the data slot, because they may be set from a      change the current values of the data slot, because they may be set from a
811      previous iteration of this group, and be referred to by a reference inside      previous iteration of this group, and be referred to by a reference inside
812      the group. If we fail to match, we need to restore this value and also the      the group. A failure to match might occur after the group has succeeded,
813      values of the final offsets, in case they were set by a previous iteration      if something later on doesn't match. For this reason, we need to restore
814      of the same bracket.      the working value and also the values of the final offsets, in case they
815        were set by a previous iteration of the same bracket.
816    
817      If there isn't enough space in the offset vector, treat this as if it were      If there isn't enough space in the offset vector, treat this as if it were
818      a non-capturing bracket. Don't worry about setting the flag for the error      a non-capturing bracket. Don't worry about setting the flag for the error
# Line 844  for (;;) Line 846  for (;;)
846          if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;          if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
847          RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,          RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
848            eptrb, RM1);            eptrb, RM1);
849            if (rrc == MATCH_ONCE) break;  /* Backing up through an atomic group */
850          if (rrc != MATCH_NOMATCH &&          if (rrc != MATCH_NOMATCH &&
851              (rrc != MATCH_THEN || md->start_match_ptr != ecode))              (rrc != MATCH_THEN || md->start_match_ptr != ecode))
852            RRETURN(rrc);            RRETURN(rrc);
# Line 853  for (;;) Line 856  for (;;)
856          }          }
857    
858        DPRINTF(("bracket %d failed\n", number));        DPRINTF(("bracket %d failed\n", number));
   
859        md->offset_vector[offset] = save_offset1;        md->offset_vector[offset] = save_offset1;
860        md->offset_vector[offset+1] = save_offset2;        md->offset_vector[offset+1] = save_offset2;
861        md->offset_vector[md->offset_end - number] = save_offset3;        md->offset_vector[md->offset_end - number] = save_offset3;
862    
863          /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
864          MATCH_THEN. */
865    
866        if (rrc != MATCH_THEN) md->mark = markptr;        if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
867        RRETURN(MATCH_NOMATCH);        RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
868        }        }
869    
870      /* FALL THROUGH ... Insufficient room for saving captured contents. Treat      /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
# Line 873  for (;;) Line 878  for (;;)
878      /* VVVVVVVVVVVVVVVVVVVVVVVVV */      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
879      /* VVVVVVVVVVVVVVVVVVVVVVVVV */      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
880    
881      /* Non-capturing bracket, except for possessive with unlimited repeat. Loop      /* Non-capturing or atomic group, except for possessive with unlimited
882      for all the alternatives. When we get to the final alternative within the      repeat. Loop for all the alternatives. When we get to the final alternative
883      brackets, we would return the result of a recursive call to match()      within the brackets, we used to return the result of a recursive call to
884      whatever happened. We can reduce stack usage by turning this into a tail      match() whatever happened so it was possible to reduce stack usage by
885      recursion, except in the case of a possibly empty group.*/      turning this into a tail recursion, except in the case of a possibly empty
886        group. However, now that there is the possiblity of (*THEN) occurring in
887        the final alternative, this optimization is no longer possible.
888    
889        MATCH_ONCE is returned when the end of an atomic group is successfully
890        reached, but subsequent matching fails. It passes back up the tree (causing
891        captured values to be reset) until the original atomic group level is
892        reached. This is tested by comparing md->once_target with the start of the
893        group. At this point, the return is converted into MATCH_NOMATCH so that
894        previous backup points can be taken. */
895    
896        case OP_ONCE:
897      case OP_BRA:      case OP_BRA:
898      case OP_SBRA:      case OP_SBRA:
899      DPRINTF(("start non-capturing bracket\n"));      DPRINTF(("start non-capturing bracket\n"));
900    
901      for (;;)      for (;;)
902        {        {
903        if (ecode[GET(ecode, 1)] != OP_ALT)   /* Final alternative */        if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
         {  
         if (op >= OP_SBRA)   /* Possibly empty group */  
           {  
           md->match_function_type = MATCH_CBEGROUP;  
           RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,  
             RM48);  
           if (rrc == MATCH_NOMATCH) md->mark = markptr;  
           RRETURN(rrc);  
           }  
         /* Not a possibly empty group; use tail recursion */  
         ecode += _pcre_OP_lengths[*ecode];  
         DPRINTF(("bracket 0 tail recursion\n"));  
         goto TAIL_RECURSE;  
         }  
   
       /* For non-final alternatives, continue the loop for a NOMATCH result;  
       otherwise return. */  
   
       if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;  
904        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
905          RM2);          RM2);
906        if (rrc != MATCH_NOMATCH &&        if (rrc != MATCH_NOMATCH &&
907            (rrc != MATCH_THEN || md->start_match_ptr != ecode))            (rrc != MATCH_THEN || md->start_match_ptr != ecode))
908            {
909            if (rrc == MATCH_ONCE)
910              {
911              const uschar *scode = ecode;
912              if (*scode != OP_ONCE)           /* If not at start, find it */
913                {
914                while (*scode == OP_ALT) scode += GET(scode, 1);
915                scode -= GET(scode, 1);
916                }
917              if (md->once_target == scode) rrc = MATCH_NOMATCH;
918              }
919          RRETURN(rrc);          RRETURN(rrc);
920            }
921        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
922          if (*ecode != OP_ALT) break;
923        }        }
924      /* Control never reaches here. */      if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
925        RRETURN(MATCH_NOMATCH);
926    
927      /* Handle possessive capturing brackets with an unlimited repeat. We come      /* Handle possessive capturing brackets with an unlimited repeat. We come
928      here from BRAZERO with allow_zero set TRUE. The offset_vector values are      here from BRAZERO with allow_zero set TRUE. The offset_vector values are
# Line 980  for (;;) Line 991  for (;;)
991          ecode += GET(ecode, 1);          ecode += GET(ecode, 1);
992          if (*ecode != OP_ALT) break;          if (*ecode != OP_ALT) break;
993          }          }
994    
995        if (!matched_once)        if (!matched_once)
996          {          {
997          md->offset_vector[offset] = save_offset1;          md->offset_vector[offset] = save_offset1;
# Line 988  for (;;) Line 999  for (;;)
999          md->offset_vector[md->offset_end - number] = save_offset3;          md->offset_vector[md->offset_end - number] = save_offset3;
1000          }          }
1001    
1002        if (rrc != MATCH_THEN) md->mark = markptr;        if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
1003        if (allow_zero || matched_once)        if (allow_zero || matched_once)
1004          {          {
1005          ecode += 1 + LINK_SIZE;          ecode += 1 + LINK_SIZE;
# Line 1026  for (;;) Line 1037  for (;;)
1037        {        {
1038        if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;        if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1039        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1040          eptrb, RM64);          eptrb, RM48);
1041        if (rrc == MATCH_KETRPOS)        if (rrc == MATCH_KETRPOS)
1042          {          {
1043            offset_top = md->end_offset_top;
1044          eptr = md->end_match_ptr;          eptr = md->end_match_ptr;
1045          ecode = md->start_code + code_offset;          ecode = md->start_code + code_offset;
1046          matched_once = TRUE;          matched_once = TRUE;
# Line 1040  for (;;) Line 1052  for (;;)
1052        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
1053        if (*ecode != OP_ALT) break;        if (*ecode != OP_ALT) break;
1054        }        }
1055    
1056      if (matched_once || allow_zero)      if (matched_once || allow_zero)
1057        {        {
1058        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
# Line 1053  for (;;) Line 1065  for (;;)
1065      /* Conditional group: compilation checked that there are no more than      /* Conditional group: compilation checked that there are no more than
1066      two branches. If the condition is false, skipping the first branch takes us      two branches. If the condition is false, skipping the first branch takes us
1067      past the end if there is only one branch, but that's OK because that is      past the end if there is only one branch, but that's OK because that is
1068      exactly what going to the ket would do. As there is only one branch to be      exactly what going to the ket would do. */
     obeyed, we can use tail recursion to avoid using another stack frame. */  
1069    
1070      case OP_COND:      case OP_COND:
1071      case OP_SCOND:      case OP_SCOND:
# Line 1259  for (;;) Line 1270  for (;;)
1270        }        }
1271    
1272      /* We are now at the branch that is to be obeyed. As there is only one,      /* We are now at the branch that is to be obeyed. As there is only one,
1273      we can use tail recursion to avoid using another stack frame, except when      we used to use tail recursion to avoid using another stack frame, except
1274      we have an unlimited repeat of a possibly empty group. If the second      when there was unlimited repeat of a possibly empty group. However, that
1275      alternative doesn't exist, we can just plough on. */      strategy no longer works because of the possibilty of (*THEN) being
1276        encountered in the branch. A recursive call to match() is always required,
1277        unless the second alternative doesn't exist, in which case we can just
1278        plough on. */
1279    
1280      if (condition || *ecode == OP_ALT)      if (condition || *ecode == OP_ALT)
1281        {        {
1282        ecode += 1 + LINK_SIZE;        if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1283        if (op == OP_SCOND)        /* Possibly empty group */        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1284          {        if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1285          md->match_function_type = MATCH_CBEGROUP;          rrc = MATCH_NOMATCH;
1286          RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);        RRETURN(rrc);
         RRETURN(rrc);  
         }  
       else goto TAIL_RECURSE;  
1287        }        }
1288      else                         /* Condition false & no alternative */      else                         /* Condition false & no alternative */
1289        {        {
# Line 1310  for (;;) Line 1321  for (;;)
1321      recursion, continue from after the call. */      recursion, continue from after the call. */
1322    
1323      case OP_ACCEPT:      case OP_ACCEPT:
1324        case OP_ASSERT_ACCEPT:
1325      case OP_END:      case OP_END:
1326    
1327    /*
1328      if (md->recursive != NULL)      if (md->recursive != NULL)
1329        {        {
1330        recursion_info *rec = md->recursive;        recursion_info *rec = md->recursive;
1331    
1332        md->recursive = rec->prevrec;        md->recursive = rec->prevrec;
1333    
1334        memmove(md->offset_vector, rec->offset_save,        memmove(md->offset_vector, rec->offset_save,
1335          rec->saved_max * sizeof(int));          rec->saved_max * sizeof(int));
1336        offset_top = rec->save_offset_top;        offset_top = rec->save_offset_top;
# Line 1324  for (;;) Line 1340  for (;;)
1340          break;          break;
1341          }          }
1342        }        }
1343    */
1344        /* Otherwise, if we have matched an empty string, fail if not in an
1345        assertion and if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1346        is set and we have matched at the start of the subject. In both cases,
1347        backtracking will then try other alternatives, if any. */
1348    
1349    /*    else */ if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1350    
1351      /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is           md->recursive == NULL &&
     set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of  
     the subject. In both cases, backtracking will then try other alternatives,  
     if any. */  
1352    
     else if (eptr == mstart &&  
1353          (md->notempty ||          (md->notempty ||
1354            (md->notempty_atstart &&            (md->notempty_atstart &&
1355              mstart == md->start_subject + md->start_offset)))              mstart == md->start_subject + md->start_offset)))
# Line 1493  for (;;) Line 1512  for (;;)
1512      /* Recursion either matches the current regex, or some subexpression. The      /* Recursion either matches the current regex, or some subexpression. The
1513      offset data is the offset to the starting bracket from the start of the      offset data is the offset to the starting bracket from the start of the
1514      whole pattern. (This is so that it works from duplicated subpatterns.)      whole pattern. (This is so that it works from duplicated subpatterns.)
1515    
1516      If there are any capturing brackets started but not finished, we have to      The state of the capturing groups is preserved over recursion, and
1517      save their starting points and reinstate them after the recursion. However,      re-instated afterwards. We don't know how many are started and not yet
1518      we don't know how many such there are (offset_top records the completed      finished (offset_top records the completed total) so we just have to save
1519      total) so we just have to save all the potential data. There may be up to      all the potential data. There may be up to 65535 such values, which is too
1520      65535 such values, which is too large to put on the stack, but using malloc      large to put on the stack, but using malloc for small numbers seems
1521      for small numbers seems expensive. As a compromise, the stack is used when      expensive. As a compromise, the stack is used when there are no more than
1522      there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc      REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
     is used. A problem is what to do if the malloc fails ... there is no way of  
     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX  
     values on the stack, and accept that the rest may be wrong.  
1523    
1524      There are also other values that have to be saved. We use a chained      There are also other values that have to be saved. We use a chained
1525      sequence of blocks that actually live on the stack. Thanks to Robin Houston      sequence of blocks that actually live on the stack. Thanks to Robin Houston
1526      for the original version of this logic. */      for the original version of this logic. It has, however, been hacked around
1527        a lot, so he is not to blame for the current way it works. */
1528    
1529      case OP_RECURSE:      case OP_RECURSE:
1530        {        {
# Line 1520  for (;;) Line 1537  for (;;)
1537        new_recursive.prevrec = md->recursive;        new_recursive.prevrec = md->recursive;
1538        md->recursive = &new_recursive;        md->recursive = &new_recursive;
1539    
1540        /* Find where to continue from afterwards */        /* Where to continue from afterwards */
1541    
1542        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
       new_recursive.after_call = ecode;  
1543    
1544        /* Now save the offset data. */        /* Now save the offset data */
1545    
1546        new_recursive.saved_max = md->offset_end;        new_recursive.saved_max = md->offset_end;
1547        if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)        if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
# Line 1536  for (;;) Line 1552  for (;;)
1552            (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));            (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1553          if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);          if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1554          }          }
   
1555        memcpy(new_recursive.offset_save, md->offset_vector,        memcpy(new_recursive.offset_save, md->offset_vector,
1556              new_recursive.saved_max * sizeof(int));              new_recursive.saved_max * sizeof(int));
       new_recursive.save_offset_top = offset_top;  
1557    
1558        /* OK, now we can do the recursion. For each top-level alternative we        /* OK, now we can do the recursion. After processing each alternative,
1559        restore the offset and recursion data. */        restore the offset data. If there were nested recursions, md->recursive
1560          might be changed, so reset it before looping. */
1561    
1562        DPRINTF(("Recursing into group %d\n", new_recursive.group_num));        DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1563        cbegroup = (*callpat >= OP_SBRA);        cbegroup = (*callpat >= OP_SBRA);
# Line 1551  for (;;) Line 1566  for (;;)
1566          if (cbegroup) md->match_function_type = MATCH_CBEGROUP;          if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1567          RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,          RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1568            md, eptrb, RM6);            md, eptrb, RM6);
1569            memcpy(md->offset_vector, new_recursive.offset_save,
1570                new_recursive.saved_max * sizeof(int));
1571          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1572            {            {
1573            DPRINTF(("Recursion matched\n"));            DPRINTF(("Recursion matched\n"));
1574            md->recursive = new_recursive.prevrec;            md->recursive = new_recursive.prevrec;
1575            if (new_recursive.offset_save != stacksave)            if (new_recursive.offset_save != stacksave)
1576              (pcre_free)(new_recursive.offset_save);              (pcre_free)(new_recursive.offset_save);
1577            MRRETURN(MATCH_MATCH);  
1578              /* Set where we got to in the subject, and reset the start in case
1579              it was changed by \K. This *is* propagated back out of a recursion,
1580              for Perl compatibility. */
1581    
1582              eptr = md->end_match_ptr;
1583              mstart = md->start_match_ptr;
1584              goto RECURSION_MATCHED;        /* Exit loop; end processing */
1585            }            }
1586          else if (rrc != MATCH_NOMATCH &&          else if (rrc != MATCH_NOMATCH &&
1587                  (rrc != MATCH_THEN || md->start_match_ptr != ecode))                  (rrc != MATCH_THEN || md->start_match_ptr != ecode))
# Line 1569  for (;;) Line 1593  for (;;)
1593            }            }
1594    
1595          md->recursive = &new_recursive;          md->recursive = &new_recursive;
         memcpy(md->offset_vector, new_recursive.offset_save,  
             new_recursive.saved_max * sizeof(int));  
1596          callpat += GET(callpat, 1);          callpat += GET(callpat, 1);
1597          }          }
1598        while (*callpat == OP_ALT);        while (*callpat == OP_ALT);
# Line 1581  for (;;) Line 1603  for (;;)
1603          (pcre_free)(new_recursive.offset_save);          (pcre_free)(new_recursive.offset_save);
1604        MRRETURN(MATCH_NOMATCH);        MRRETURN(MATCH_NOMATCH);
1605        }        }
1606      /* Control never reaches here */  
1607        RECURSION_MATCHED:
1608      /* "Once" brackets are like assertion brackets except that after a match,      break;
     the point in the subject string is not moved back. Thus there can never be  
     a move back into the brackets. Friedl calls these "atomic" subpatterns.  
     Check the alternative branches in turn - the matching won't pass the KET  
     for this kind of subpattern. If any one branch matches, we carry on as at  
     the end of a normal bracket, leaving the subject pointer, but resetting  
     the start-of-match value in case it was changed by \K. */  
   
     case OP_ONCE:  
     prev = ecode;  
     saved_eptr = eptr;  
   
     do  
       {  
       RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);  
       if (rrc == MATCH_MATCH)  /* Note: _not_ MATCH_ACCEPT */  
         {  
         mstart = md->start_match_ptr;  
         break;  
         }  
       if (rrc != MATCH_NOMATCH &&  
           (rrc != MATCH_THEN || md->start_match_ptr != ecode))  
         RRETURN(rrc);  
       ecode += GET(ecode,1);  
       }  
     while (*ecode == OP_ALT);  
   
     /* If hit the end of the group (which could be repeated), fail */  
   
     if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);  
   
     /* Continue as from after the assertion, updating the offsets high water  
     mark, since extracts may have been taken. */  
   
     do ecode += GET(ecode, 1); while (*ecode == OP_ALT);  
   
     offset_top = md->end_offset_top;  
     eptr = md->end_match_ptr;  
   
     /* For a non-repeating ket, just continue at this level. This also  
     happens for a repeating ket if no characters were matched in the group.  
     This is the forcible breaking of infinite loops as implemented in Perl  
     5.005. If there is an options reset, it will get obeyed in the normal  
     course of events. */  
   
     if (*ecode == OP_KET || eptr == saved_eptr)  
       {  
       ecode += 1+LINK_SIZE;  
       break;  
       }  
   
     /* The repeating kets try the rest of the pattern or restart from the  
     preceding bracket, in the appropriate order. The second "call" of match()  
     uses tail recursion, to avoid using another stack frame. */  
   
     if (*ecode == OP_KETRMIN)  
       {  
       RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);  
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
       ecode = prev;  
       goto TAIL_RECURSE;  
       }  
     else  /* OP_KETRMAX */  
       {  
       md->match_function_type = MATCH_CBEGROUP;  
       RMATCH(eptr, prev, offset_top, md, eptrb, RM9);  
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
       ecode += 1 + LINK_SIZE;  
       goto TAIL_RECURSE;  
       }  
     /* Control never gets here */  
1609    
1610      /* An alternation is the end of a branch; scan along to find the end of the      /* An alternation is the end of a branch; scan along to find the end of the
1611      bracketed group and go to there. */      bracketed group and go to there. */
# Line 1706  for (;;) Line 1658  for (;;)
1658      case OP_KETRMAX:      case OP_KETRMAX:
1659      case OP_KETRPOS:      case OP_KETRPOS:
1660      prev = ecode - GET(ecode, 1);      prev = ecode - GET(ecode, 1);
1661    
1662      /* If this was a group that remembered the subject start, in order to break      /* If this was a group that remembered the subject start, in order to break
1663      infinite repeats of empty string matches, retrieve the subject start from      infinite repeats of empty string matches, retrieve the subject start from
1664      the chain. Otherwise, set it NULL. */      the chain. Otherwise, set it NULL. */
1665    
1666      if (*prev >= OP_SBRA)      if (*prev >= OP_SBRA || *prev == OP_ONCE)
1667        {        {
1668        saved_eptr = eptrb->epb_saved_eptr;   /* Value at start of group */        saved_eptr = eptrb->epb_saved_eptr;   /* Value at start of group */
1669        eptrb = eptrb->epb_prev;              /* Backup to previous group */        eptrb = eptrb->epb_prev;              /* Backup to previous group */
1670        }        }
1671      else saved_eptr = NULL;      else saved_eptr = NULL;
1672    
1673      /* If we are at the end of an assertion group or an atomic group, stop      /* If we are at the end of an assertion group, stop matching and return
1674      matching and return MATCH_MATCH, but record the current high water mark for      MATCH_MATCH, but record the current high water mark for use by positive
1675      use by positive assertions. We also need to record the match start in case      assertions. We also need to record the match start in case it was changed
1676      it was changed by \K. */      by \K. */
1677    
1678      if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||      if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1679          *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||          *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
         *prev == OP_ONCE)  
1680        {        {
1681        md->end_match_ptr = eptr;      /* For ONCE */        md->end_match_ptr = eptr;      /* For ONCE */
1682        md->end_offset_top = offset_top;        md->end_offset_top = offset_top;
# Line 1735  for (;;) Line 1686  for (;;)
1686    
1687      /* For capturing groups we have to check the group number back at the start      /* For capturing groups we have to check the group number back at the start
1688      and if necessary complete handling an extraction by setting the offsets and      and if necessary complete handling an extraction by setting the offsets and
1689      bumping the high water mark. Note that whole-pattern recursion is coded as      bumping the high water mark. Whole-pattern recursion is coded as a recurse
1690      a recurse into group 0, so it won't be picked up here. Instead, we catch it      into group 0, so it won't be picked up here. Instead, we catch it when the
1691      when the OP_END is reached. Other recursion is handled here. */      OP_END is reached. Other recursion is handled here. We just have to record
1692        the current subject position and start match pointer and give a MATCH
1693        return. */
1694    
1695      if (*prev == OP_CBRA || *prev == OP_SCBRA ||      if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1696          *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)          *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
# Line 1750  for (;;) Line 1703  for (;;)
1703        printf("\n");        printf("\n");
1704  #endif  #endif
1705    
1706          /* Handle a recursively called group. */
1707    
1708          if (md->recursive != NULL && md->recursive->group_num == number)
1709            {
1710            md->end_match_ptr = eptr;
1711            md->start_match_ptr = mstart;
1712            RRETURN(MATCH_MATCH);
1713            }
1714    
1715          /* Deal with capturing */
1716    
1717        md->capture_last = number;        md->capture_last = number;
1718        if (offset >= md->offset_max) md->offset_overflow = TRUE; else        if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1719          {          {
1720            /* If offset is greater than offset_top, it means that we are
1721            "skipping" a capturing group, and that group's offsets must be marked
1722            unset. In earlier versions of PCRE, all the offsets were unset at the
1723            start of matching, but this doesn't work because atomic groups and
1724            assertions can cause a value to be set that should later be unset.
1725            Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1726            part of the atomic group, but this is not on the final matching path,
1727            so must be unset when 2 is set. (If there is no group 2, there is no
1728            problem, because offset_top will then be 2, indicating no capture.) */
1729    
1730            if (offset > offset_top)
1731              {
1732              register int *iptr = md->offset_vector + offset_top;
1733              register int *iend = md->offset_vector + offset;
1734              while (iptr < iend) *iptr++ = -1;
1735              }
1736    
1737            /* Now make the extraction */
1738    
1739          md->offset_vector[offset] =          md->offset_vector[offset] =
1740            md->offset_vector[md->offset_end - number];            md->offset_vector[md->offset_end - number];
1741          md->offset_vector[offset+1] = (int)(eptr - md->start_subject);          md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1742          if (offset_top <= offset) offset_top = offset + 2;          if (offset_top <= offset) offset_top = offset + 2;
1743          }          }
   
       /* Handle a recursively called group. Restore the offsets  
       appropriately and continue from after the call. */  
   
       if (md->recursive != NULL && md->recursive->group_num == number)  
         {  
         recursion_info *rec = md->recursive;  
         DPRINTF(("Recursion (%d) succeeded - continuing\n", number));  
         md->recursive = rec->prevrec;  
         memcpy(md->offset_vector, rec->offset_save,  
           rec->saved_max * sizeof(int));  
         offset_top = rec->save_offset_top;  
         ecode = rec->after_call;  
         break;  
         }  
1744        }        }
1745    
1746      /* For a non-repeating ket, just continue at this level. This also      /* For an ordinary non-repeating ket, just continue at this level. This
1747      happens for a repeating ket if no characters were matched in the group.      also happens for a repeating ket if no characters were matched in the
1748      This is the forcible breaking of infinite loops as implemented in Perl      group. This is the forcible breaking of infinite loops as implemented in
1749      5.005. If there is an options reset, it will get obeyed in the normal      Perl 5.005. For a non-repeating atomic group, establish a backup point by
1750      course of events. */      processing the rest of the pattern at a lower level. If this results in a
1751        NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1752        bypassing intermediate backup points, but resetting any captures that
1753        happened along the way. */
1754    
1755      if (*ecode == OP_KET || eptr == saved_eptr)      if (*ecode == OP_KET || eptr == saved_eptr)
1756        {        {
1757        ecode += 1 + LINK_SIZE;        if (*prev == OP_ONCE)
1758            {
1759            RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1760            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1761            md->once_target = prev;  /* Level at which to change to MATCH_NOMATCH */
1762            RRETURN(MATCH_ONCE);
1763            }
1764          ecode += 1 + LINK_SIZE;    /* Carry on at this level */
1765        break;        break;
1766        }        }
1767    
# Line 1801  for (;;) Line 1779  for (;;)
1779      /* The normal repeating kets try the rest of the pattern or restart from      /* The normal repeating kets try the rest of the pattern or restart from
1780      the preceding bracket, in the appropriate order. In the second case, we can      the preceding bracket, in the appropriate order. In the second case, we can
1781      use tail recursion to avoid using another stack frame, unless we have an      use tail recursion to avoid using another stack frame, unless we have an
1782      unlimited repeat of a group that can match an empty string. */      an atomic group or an unlimited repeat of a group that can match an empty
1783        string. */
1784    
1785      if (*ecode == OP_KETRMIN)      if (*ecode == OP_KETRMIN)
1786        {        {
1787        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
1788        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1789          if (*prev == OP_ONCE)
1790            {
1791            RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
1792            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1793            md->once_target = prev;  /* Level at which to change to MATCH_NOMATCH */
1794            RRETURN(MATCH_ONCE);
1795            }
1796        if (*prev >= OP_SBRA)    /* Could match an empty string */        if (*prev >= OP_SBRA)    /* Could match an empty string */
1797          {          {
1798          md->match_function_type = MATCH_CBEGROUP;          md->match_function_type = MATCH_CBEGROUP;
# Line 1820  for (;;) Line 1806  for (;;)
1806        {        {
1807        if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;        if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1808        RMATCH(eptr, prev, offset_top, md, eptrb, RM13);        RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1809          if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1810        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1811          if (*prev == OP_ONCE)
1812            {
1813            RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
1814            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1815            md->once_target = prev;
1816            RRETURN(MATCH_ONCE);
1817            }
1818        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1819        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1820        }        }
# Line 5704  switch (frame->Xwhere) Line 5698  switch (frame->Xwhere)
5698    LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)    LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5699    LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)    LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5700    LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)    LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5701      LBL(65) LBL(66)
5702  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5703    LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)    LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5704    LBL(32) LBL(34) LBL(42) LBL(46)    LBL(32) LBL(34) LBL(42) LBL(46)
# Line 5805  pcre_exec(const pcre *argument_re, const Line 5800  pcre_exec(const pcre *argument_re, const
5800    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5801    int offsetcount)    int offsetcount)
5802  {  {
5803  int rc, resetcount, ocount;  int rc, ocount;
5804  int first_byte = -1;  int first_byte = -1;
5805  int req_byte = -1;  int req_byte = -1;
5806  int req_byte2 = -1;  int req_byte2 = -1;
# Line 5913  utf8 = md->utf8 = (re->options & PCRE_UT Line 5908  utf8 = md->utf8 = (re->options & PCRE_UT
5908  md->use_ucp = (re->options & PCRE_UCP) != 0;  md->use_ucp = (re->options & PCRE_UCP) != 0;
5909  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5910    
5911    /* Some options are unpacked into BOOL variables in the hope that testing
5912    them will be faster than individual option bits. */
5913    
5914  md->notbol = (options & PCRE_NOTBOL) != 0;  md->notbol = (options & PCRE_NOTBOL) != 0;
5915  md->noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
5916  md->notempty = (options & PCRE_NOTEMPTY) != 0;  md->notempty = (options & PCRE_NOTEMPTY) != 0;
5917  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5918  md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :  md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5919                ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;                ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5920    
5921    
5922  md->hitend = FALSE;  md->hitend = FALSE;
5923  md->mark = NULL;                        /* In case never set */  md->mark = NULL;                        /* In case never set */
5924    
# Line 6049  md->offset_max = (2*ocount)/3; Line 6049  md->offset_max = (2*ocount)/3;
6049  md->offset_overflow = FALSE;  md->offset_overflow = FALSE;
6050  md->capture_last = -1;  md->capture_last = -1;
6051    
 /* Compute the minimum number of offsets that we need to reset each time. Doing  
 this makes a huge difference to execution time when there aren't many brackets  
 in the pattern. */  
   
 resetcount = 2 + re->top_bracket * 2;  
 if (resetcount > offsetcount) resetcount = ocount;  
   
6052  /* Reset the working variable associated with each extraction. These should  /* Reset the working variable associated with each extraction. These should
6053  never be used unless previously set, but they get saved and restored, and so we  never be used unless previously set, but they get saved and restored, and so we
6054  initialize them to avoid reading uninitialized locations. */  initialize them to avoid reading uninitialized locations. Also, unset the
6055    offsets for the matched string. This is really just for tidiness with callouts,
6056    in case they inspect these fields. */
6057    
6058  if (md->offset_vector != NULL)  if (md->offset_vector != NULL)
6059    {    {
6060    register int *iptr = md->offset_vector + ocount;    register int *iptr = md->offset_vector + ocount;
6061    register int *iend = iptr - resetcount/2 + 1;    register int *iend = iptr - re->top_bracket;
6062      if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6063    while (--iptr >= iend) *iptr = -1;    while (--iptr >= iend) *iptr = -1;
6064      md->offset_vector[0] = md->offset_vector[1] = -1;
6065    }    }
6066    
6067  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 6098  if ((re->flags & PCRE_REQCHSET) != 0) Line 6095  if ((re->flags & PCRE_REQCHSET) != 0)
6095    }    }
6096    
6097    
6098    
6099    
6100  /* ==========================================================================*/  /* ==========================================================================*/
6101    
6102  /* Loop for handling unanchored repeated matching attempts; for anchored regexs  /* Loop for handling unanchored repeated matching attempts; for anchored regexs
# Line 6108  for(;;) Line 6107  for(;;)
6107    USPTR save_end_subject = end_subject;    USPTR save_end_subject = end_subject;
6108    USPTR new_start_match;    USPTR new_start_match;
6109    
   /* Reset the maximum number of extractions we might see. */  
   
   if (md->offset_vector != NULL)  
     {  
     register int *iptr = md->offset_vector;  
     register int *iend = iptr + resetcount;  
     while (iptr < iend) *iptr++ = -1;  
     }  
   
6110    /* If firstline is TRUE, the start of the match is constrained to the first    /* If firstline is TRUE, the start of the match is constrained to the first
6111    line of a multiline string. That is, the match must be before or at the first    line of a multiline string. That is, the match must be before or at the first
6112    newline. Implement this by temporarily adjusting end_subject so that we stop    newline. Implement this by temporarily adjusting end_subject so that we stop
# Line 6306  for(;;) Line 6296  for(;;)
6296    md->start_used_ptr = start_match;    md->start_used_ptr = start_match;
6297    md->match_call_count = 0;    md->match_call_count = 0;
6298    md->match_function_type = 0;    md->match_function_type = 0;
6299      md->end_offset_top = 0;
6300    rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);    rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6301    if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;    if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6302    

Legend:
Removed from v.608  
changed lines
  Added in v.618

  ViewVC Help
Powered by ViewVC 1.1.5