/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 649 by ph10, Mon Aug 1 11:23:52 2011 UTC revision 723 by ph10, Sat Oct 8 15:55:23 2011 UTC
# Line 277  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM Line 277  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM
277         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278         RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,         RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279         RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,         RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280         RM61,  RM62, RM63 };         RM61,  RM62, RM63, RM64, RM65, RM66 };
281    
282  /* These versions of the macros use the stack, as normal. There are debugging  /* These versions of the macros use the stack, as normal. There are debugging
283  versions and production versions. Note that the "rw" argument of RMATCH isn't  versions and production versions. Note that the "rw" argument of RMATCH isn't
# Line 775  for (;;) Line 775  for (;;)
775      md->start_match_ptr = ecode + 2;      md->start_match_ptr = ecode + 2;
776      RRETURN(MATCH_SKIP_ARG);      RRETURN(MATCH_SKIP_ARG);
777    
778      /* For THEN (and THEN_ARG) we pass back the address of the bracket or      /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779      the alt that is at the start of the current branch. This makes it possible      the branch in which it occurs can be determined. Overload the start of
780      to skip back past alternatives that precede the THEN within the current      match pointer to do this. */
     branch. */  
781    
782      case OP_THEN:      case OP_THEN:
783      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
784        eptrb, RM54);        eptrb, RM54);
785      if (rrc != MATCH_NOMATCH) RRETURN(rrc);      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786      md->start_match_ptr = ecode - GET(ecode, 1);      md->start_match_ptr = ecode;
787      MRRETURN(MATCH_THEN);      MRRETURN(MATCH_THEN);
788    
789      case OP_THEN_ARG:      case OP_THEN_ARG:
790      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
791        offset_top, md, eptrb, RM58);        md, eptrb, RM58);
792      if (rrc != MATCH_NOMATCH) RRETURN(rrc);      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793      md->start_match_ptr = ecode - GET(ecode, 1);      md->start_match_ptr = ecode;
794      md->mark = ecode + LINK_SIZE + 2;      md->mark = ecode + 2;
795      RRETURN(MATCH_THEN);      RRETURN(MATCH_THEN);
796    
797        /* Handle an atomic group that does not contain any capturing parentheses.
798        This can be handled like an assertion. Prior to 8.13, all atomic groups
799        were handled this way. In 8.13, the code was changed as below for ONCE, so
800        that backups pass through the group and thereby reset captured values.
801        However, this uses a lot more stack, so in 8.20, atomic groups that do not
802        contain any captures generate OP_ONCE_NC, which can be handled in the old,
803        less stack intensive way.
804    
805        Check the alternative branches in turn - the matching won't pass the KET
806        for this kind of subpattern. If any one branch matches, we carry on as at
807        the end of a normal bracket, leaving the subject pointer, but resetting
808        the start-of-match value in case it was changed by \K. */
809    
810        case OP_ONCE_NC:
811        prev = ecode;
812        saved_eptr = eptr;
813        do
814          {
815          RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
816          if (rrc == MATCH_MATCH)  /* Note: _not_ MATCH_ACCEPT */
817            {
818            mstart = md->start_match_ptr;
819            break;
820            }
821          if (rrc == MATCH_THEN)
822            {
823            next = ecode + GET(ecode,1);
824            if (md->start_match_ptr < next &&
825                (*ecode == OP_ALT || *next == OP_ALT))
826              rrc = MATCH_NOMATCH;
827            }
828    
829          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
830          ecode += GET(ecode,1);
831          }
832        while (*ecode == OP_ALT);
833    
834        /* If hit the end of the group (which could be repeated), fail */
835    
836        if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
837    
838        /* Continue as from after the group, updating the offsets high water
839        mark, since extracts may have been taken. */
840    
841        do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
842    
843        offset_top = md->end_offset_top;
844        eptr = md->end_match_ptr;
845    
846        /* For a non-repeating ket, just continue at this level. This also
847        happens for a repeating ket if no characters were matched in the group.
848        This is the forcible breaking of infinite loops as implemented in Perl
849        5.005. */
850    
851        if (*ecode == OP_KET || eptr == saved_eptr)
852          {
853          ecode += 1+LINK_SIZE;
854          break;
855          }
856    
857        /* The repeating kets try the rest of the pattern or restart from the
858        preceding bracket, in the appropriate order. The second "call" of match()
859        uses tail recursion, to avoid using another stack frame. */
860    
861        if (*ecode == OP_KETRMIN)
862          {
863          RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
864          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
865          ecode = prev;
866          goto TAIL_RECURSE;
867          }
868        else  /* OP_KETRMAX */
869          {
870          md->match_function_type = MATCH_CBEGROUP;
871          RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
872          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
873          ecode += 1 + LINK_SIZE;
874          goto TAIL_RECURSE;
875          }
876        /* Control never gets here */
877    
878      /* Handle a capturing bracket, other than those that are possessive with an      /* Handle a capturing bracket, other than those that are possessive with an
879      unlimited repeat. If there is space in the offset vector, save the current      unlimited repeat. If there is space in the offset vector, save the current
# Line 838  for (;;) Line 918  for (;;)
918          RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,          RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
919            eptrb, RM1);            eptrb, RM1);
920          if (rrc == MATCH_ONCE) break;  /* Backing up through an atomic group */          if (rrc == MATCH_ONCE) break;  /* Backing up through an atomic group */
921          if (rrc != MATCH_NOMATCH &&  
922              (rrc != MATCH_THEN || md->start_match_ptr != ecode))          /* If we backed up to a THEN, check whether it is within the current
923            RRETURN(rrc);          branch by comparing the address of the THEN that is passed back with
924            the end of the branch. If it is within the current branch, and the
925            branch is one of two or more alternatives (it either starts or ends
926            with OP_ALT), we have reached the limit of THEN's action, so convert
927            the return code to NOMATCH, which will cause normal backtracking to
928            happen from now on. Otherwise, THEN is passed back to an outer
929            alternative. This implements Perl's treatment of parenthesized groups,
930            where a group not containing | does not affect the current alternative,
931            that is, (X) is NOT the same as (X|(*F)). */
932    
933            if (rrc == MATCH_THEN)
934              {
935              next = ecode + GET(ecode,1);
936              if (md->start_match_ptr < next &&
937                  (*ecode == OP_ALT || *next == OP_ALT))
938                rrc = MATCH_NOMATCH;
939              }
940    
941            /* Anything other than NOMATCH is passed back. */
942    
943            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
944          md->capture_last = save_capture_last;          md->capture_last = save_capture_last;
945          ecode += GET(ecode, 1);          ecode += GET(ecode, 1);
946          if (*ecode != OP_ALT) break;          if (*ecode != OP_ALT) break;
# Line 851  for (;;) Line 951  for (;;)
951        md->offset_vector[offset+1] = save_offset2;        md->offset_vector[offset+1] = save_offset2;
952        md->offset_vector[md->offset_end - number] = save_offset3;        md->offset_vector[md->offset_end - number] = save_offset3;
953    
954        /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or        /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
       MATCH_THEN. */  
955    
956        if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;        if (md->mark == NULL) md->mark = markptr;
957        RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));        RRETURN(rrc);
958        }        }
959    
960      /* FALL THROUGH ... Insufficient room for saving captured contents. Treat      /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
# Line 870  for (;;) Line 969  for (;;)
969      /* VVVVVVVVVVVVVVVVVVVVVVVVV */      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
970    
971      /* Non-capturing or atomic group, except for possessive with unlimited      /* Non-capturing or atomic group, except for possessive with unlimited
972      repeat. Loop for all the alternatives. When we get to the final alternative      repeat and ONCE group with no captures. Loop for all the alternatives.
973      within the brackets, we used to return the result of a recursive call to  
974      match() whatever happened so it was possible to reduce stack usage by      When we get to the final alternative within the brackets, we used to return
975      turning this into a tail recursion, except in the case of a possibly empty      the result of a recursive call to match() whatever happened so it was
976      group. However, now that there is the possibility of (*THEN) occurring in      possible to reduce stack usage by turning this into a tail recursion,
977      the final alternative, this optimization is no longer possible.      except in the case of a possibly empty group. However, now that there is
978        the possibility of (*THEN) occurring in the final alternative, this
979        optimization is no longer always possible.
980    
981        We can optimize if we know there are no (*THEN)s in the pattern; at present
982        this is the best that can be done.
983    
984      MATCH_ONCE is returned when the end of an atomic group is successfully      MATCH_ONCE is returned when the end of an atomic group is successfully
985      reached, but subsequent matching fails. It passes back up the tree (causing      reached, but subsequent matching fails. It passes back up the tree (causing
# Line 892  for (;;) Line 996  for (;;)
996      for (;;)      for (;;)
997        {        {
998        if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;        if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
999    
1000          /* If this is not a possibly empty group, and there are no (*THEN)s in
1001          the pattern, and this is the final alternative, optimize as described
1002          above. */
1003    
1004          else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1005            {
1006            ecode += _pcre_OP_lengths[*ecode];
1007            goto TAIL_RECURSE;
1008            }
1009    
1010          /* In all other cases, we have to make another call to match(). */
1011    
1012        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
1013          RM2);          RM2);
1014        if (rrc != MATCH_NOMATCH &&  
1015            (rrc != MATCH_THEN || md->start_match_ptr != ecode))        /* See comment in the code for capturing groups above about handling
1016          THEN. */
1017    
1018          if (rrc == MATCH_THEN)
1019            {
1020            next = ecode + GET(ecode,1);
1021            if (md->start_match_ptr < next &&
1022                (*ecode == OP_ALT || *next == OP_ALT))
1023              rrc = MATCH_NOMATCH;
1024            }
1025    
1026          if (rrc != MATCH_NOMATCH)
1027          {          {
1028          if (rrc == MATCH_ONCE)          if (rrc == MATCH_ONCE)
1029            {            {
# Line 912  for (;;) Line 1040  for (;;)
1040        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
1041        if (*ecode != OP_ALT) break;        if (*ecode != OP_ALT) break;
1042        }        }
1043      if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;  
1044        if (md->mark == NULL) md->mark = markptr;
1045      RRETURN(MATCH_NOMATCH);      RRETURN(MATCH_NOMATCH);
1046    
1047      /* Handle possessive capturing brackets with an unlimited repeat. We come      /* Handle possessive capturing brackets with an unlimited repeat. We come
# Line 975  for (;;) Line 1104  for (;;)
1104            matched_once = TRUE;            matched_once = TRUE;
1105            continue;            continue;
1106            }            }
1107          if (rrc != MATCH_NOMATCH &&  
1108              (rrc != MATCH_THEN || md->start_match_ptr != ecode))          /* See comment in the code for capturing groups above about handling
1109            RRETURN(rrc);          THEN. */
1110    
1111            if (rrc == MATCH_THEN)
1112              {
1113              next = ecode + GET(ecode,1);
1114              if (md->start_match_ptr < next &&
1115                  (*ecode == OP_ALT || *next == OP_ALT))
1116                rrc = MATCH_NOMATCH;
1117              }
1118    
1119            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120          md->capture_last = save_capture_last;          md->capture_last = save_capture_last;
1121          ecode += GET(ecode, 1);          ecode += GET(ecode, 1);
1122          if (*ecode != OP_ALT) break;          if (*ecode != OP_ALT) break;
# Line 990  for (;;) Line 1129  for (;;)
1129          md->offset_vector[md->offset_end - number] = save_offset3;          md->offset_vector[md->offset_end - number] = save_offset3;
1130          }          }
1131    
1132        if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;        if (md->mark == NULL) md->mark = markptr;
1133        if (allow_zero || matched_once)        if (allow_zero || matched_once)
1134          {          {
1135          ecode += 1 + LINK_SIZE;          ecode += 1 + LINK_SIZE;
# Line 1037  for (;;) Line 1176  for (;;)
1176          matched_once = TRUE;          matched_once = TRUE;
1177          continue;          continue;
1178          }          }
1179        if (rrc != MATCH_NOMATCH &&  
1180            (rrc != MATCH_THEN || md->start_match_ptr != ecode))        /* See comment in the code for capturing groups above about handling
1181          RRETURN(rrc);        THEN. */
1182    
1183          if (rrc == MATCH_THEN)
1184            {
1185            next = ecode + GET(ecode,1);
1186            if (md->start_match_ptr < next &&
1187                (*ecode == OP_ALT || *next == OP_ALT))
1188              rrc = MATCH_NOMATCH;
1189            }
1190    
1191          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
1193        if (*ecode != OP_ALT) break;        if (*ecode != OP_ALT) break;
1194        }        }
# Line 1082  for (;;) Line 1231  for (;;)
1231          cb.capture_top      = offset_top/2;          cb.capture_top      = offset_top/2;
1232          cb.capture_last     = md->capture_last;          cb.capture_last     = md->capture_last;
1233          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
1234          cb.mark             = markptr;          cb.mark             = markptr;
1235          if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);          if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1236          if (rrc < 0) RRETURN(rrc);          if (rrc < 0) RRETURN(rrc);
1237          }          }
# Line 1251  for (;;) Line 1400  for (;;)
1400          ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);          ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1401          while (*ecode == OP_ALT) ecode += GET(ecode, 1);          while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1402          }          }
1403        else if (rrc != MATCH_NOMATCH &&  
1404                (rrc != MATCH_THEN || md->start_match_ptr != ecode))        /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1405          assertion; it is therefore treated as NOMATCH. */
1406    
1407          else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1408          {          {
1409          RRETURN(rrc);         /* Need braces because of following else */          RRETURN(rrc);         /* Need braces because of following else */
1410          }          }
# Line 1263  for (;;) Line 1415  for (;;)
1415          }          }
1416        }        }
1417    
1418      /* We are now at the branch that is to be obeyed. As there is only one,      /* We are now at the branch that is to be obeyed. As there is only one, we can
1419      we used to use tail recursion to avoid using another stack frame, except      use tail recursion to avoid using another stack frame, except when there is
1420      when there was unlimited repeat of a possibly empty group. However, that      unlimited repeat of a possibly empty group. In the latter case, a recursive
1421      strategy no longer works because of the possibility of (*THEN) being      call to match() is always required, unless the second alternative doesn't
1422      encountered in the branch. A recursive call to match() is always required,      exist, in which case we can just plough on. Note that, for compatibility
1423      unless the second alternative doesn't exist, in which case we can just      with Perl, the | in a conditional group is NOT treated as creating two
1424      plough on. */      alternatives. If a THEN is encountered in the branch, it propagates out to
1425        the enclosing alternative (unless nested in a deeper set of alternatives,
1426        of course). */
1427    
1428      if (condition || *ecode == OP_ALT)      if (condition || *ecode == OP_ALT)
1429        {        {
1430        if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;        if (op != OP_SCOND)
1431            {
1432            ecode += 1 + LINK_SIZE;
1433            goto TAIL_RECURSE;
1434            }
1435    
1436          md->match_function_type = MATCH_CBEGROUP;
1437        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
       if (rrc == MATCH_THEN && md->start_match_ptr == ecode)  
         rrc = MATCH_NOMATCH;  
1438        RRETURN(rrc);        RRETURN(rrc);
1439        }        }
1440      else                         /* Condition false & no alternative */  
1441         /* Condition false & no alternative; continue after the group. */
1442    
1443        else
1444        {        {
1445        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1446        }        }
# Line 1369  for (;;) Line 1530  for (;;)
1530          markptr = md->mark;          markptr = md->mark;
1531          break;          break;
1532          }          }
1533        if (rrc != MATCH_NOMATCH &&  
1534            (rrc != MATCH_THEN || md->start_match_ptr != ecode))        /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1535          RRETURN(rrc);        as NOMATCH. */
1536    
1537          if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1538        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
1539        }        }
1540      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 1412  for (;;) Line 1575  for (;;)
1575          do ecode += GET(ecode,1); while (*ecode == OP_ALT);          do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1576          break;          break;
1577          }          }
1578        if (rrc != MATCH_NOMATCH &&  
1579            (rrc != MATCH_THEN || md->start_match_ptr != ecode))        /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1580          RRETURN(rrc);        as NOMATCH. */
1581    
1582          if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1583        ecode += GET(ecode,1);        ecode += GET(ecode,1);
1584        }        }
1585      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 1477  for (;;) Line 1642  for (;;)
1642        cb.capture_top      = offset_top/2;        cb.capture_top      = offset_top/2;
1643        cb.capture_last     = md->capture_last;        cb.capture_last     = md->capture_last;
1644        cb.callout_data     = md->callout_data;        cb.callout_data     = md->callout_data;
1645        cb.mark             = markptr;        cb.mark             = markptr;
1646        if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);        if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1647        if (rrc < 0) RRETURN(rrc);        if (rrc < 0) RRETURN(rrc);
1648        }        }
# Line 1505  for (;;) Line 1670  for (;;)
1670        {        {
1671        recursion_info *ri;        recursion_info *ri;
1672        int recno;        int recno;
1673    
1674        callpat = md->start_code + GET(ecode, 1);        callpat = md->start_code + GET(ecode, 1);
1675        recno = (callpat == md->start_code)? 0 :        recno = (callpat == md->start_code)? 0 :
1676          GET2(callpat, 1 + LINK_SIZE);          GET2(callpat, 1 + LINK_SIZE);
1677    
1678        /* Check for repeating a recursion without advancing the subject pointer.        /* Check for repeating a recursion without advancing the subject pointer.
1679        This should catch convoluted mutual recursions. (Some simple cases are        This should catch convoluted mutual recursions. (Some simple cases are
1680        caught at compile time.) */        caught at compile time.) */
1681    
1682        for (ri = md->recursive; ri != NULL; ri = ri->prevrec)        for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1683          if (recno == ri->group_num && eptr == ri->subject_position)          if (recno == ri->group_num && eptr == ri->subject_position)
1684            RRETURN(PCRE_ERROR_RECURSELOOP);            RRETURN(PCRE_ERROR_RECURSELOOP);
1685    
1686        /* Add to "recursing stack" */        /* Add to "recursing stack" */
# Line 1556  for (;;) Line 1721  for (;;)
1721            md, eptrb, RM6);            md, eptrb, RM6);
1722          memcpy(md->offset_vector, new_recursive.offset_save,          memcpy(md->offset_vector, new_recursive.offset_save,
1723              new_recursive.saved_max * sizeof(int));              new_recursive.saved_max * sizeof(int));
1724            md->recursive = new_recursive.prevrec;
1725          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1726            {            {
1727            DPRINTF(("Recursion matched\n"));            DPRINTF(("Recursion matched\n"));
           md->recursive = new_recursive.prevrec;  
1728            if (new_recursive.offset_save != stacksave)            if (new_recursive.offset_save != stacksave)
1729              (pcre_free)(new_recursive.offset_save);              (pcre_free)(new_recursive.offset_save);
1730    
# Line 1571  for (;;) Line 1736  for (;;)
1736            mstart = md->start_match_ptr;            mstart = md->start_match_ptr;
1737            goto RECURSION_MATCHED;        /* Exit loop; end processing */            goto RECURSION_MATCHED;        /* Exit loop; end processing */
1738            }            }
1739          else if (rrc != MATCH_NOMATCH &&  
1740                  (rrc != MATCH_THEN || md->start_match_ptr != ecode))          /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1741            as NOMATCH. */
1742    
1743            else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1744            {            {
1745            DPRINTF(("Recursion gave error %d\n", rrc));            DPRINTF(("Recursion gave error %d\n", rrc));
1746            if (new_recursive.offset_save != stacksave)            if (new_recursive.offset_save != stacksave)
# Line 1658  for (;;) Line 1826  for (;;)
1826        }        }
1827      else saved_eptr = NULL;      else saved_eptr = NULL;
1828    
1829      /* If we are at the end of an assertion group, stop matching and return      /* If we are at the end of an assertion group or a non-capturing atomic
1830      MATCH_MATCH, but record the current high water mark for use by positive      group, stop matching and return MATCH_MATCH, but record the current high
1831      assertions. We also need to record the match start in case it was changed      water mark for use by positive assertions. We also need to record the match
1832      by \K. */      start in case it was changed by \K. */
1833    
1834      if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||      if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1835          *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)           *prev == OP_ONCE_NC)
1836        {        {
1837        md->end_match_ptr = eptr;      /* For ONCE */        md->end_match_ptr = eptr;      /* For ONCE_NC */
1838        md->end_offset_top = offset_top;        md->end_offset_top = offset_top;
1839        md->start_match_ptr = mstart;        md->start_match_ptr = mstart;
1840        MRRETURN(MATCH_MATCH);         /* Sets md->mark */        MRRETURN(MATCH_MATCH);         /* Sets md->mark */
# Line 1734  for (;;) Line 1902  for (;;)
1902      /* For an ordinary non-repeating ket, just continue at this level. This      /* For an ordinary non-repeating ket, just continue at this level. This
1903      also happens for a repeating ket if no characters were matched in the      also happens for a repeating ket if no characters were matched in the
1904      group. This is the forcible breaking of infinite loops as implemented in      group. This is the forcible breaking of infinite loops as implemented in
1905      Perl 5.005. For a non-repeating atomic group, establish a backup point by      Perl 5.005. For a non-repeating atomic group that includes captures,
1906      processing the rest of the pattern at a lower level. If this results in a      establish a backup point by processing the rest of the pattern at a lower
1907      NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby      level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1908      bypassing intermediate backup points, but resetting any captures that      original OP_ONCE level, thereby bypassing intermediate backup points, but
1909      happened along the way. */      resetting any captures that happened along the way. */
1910    
1911      if (*ecode == OP_KET || eptr == saved_eptr)      if (*ecode == OP_KET || eptr == saved_eptr)
1912        {        {
# Line 2033  for (;;) Line 2201  for (;;)
2201        SCHECK_PARTIAL();        SCHECK_PARTIAL();
2202        MRRETURN(MATCH_NOMATCH);        MRRETURN(MATCH_NOMATCH);
2203        }        }
2204      eptr++;      eptr++;
2205      ecode++;      ecode++;
2206      break;      break;
2207    
# Line 5182  for (;;) Line 5350  for (;;)
5350                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5351                }                }
5352              }              }
5353            else            else
5354              {              {
5355              eptr = md->end_subject;   /* Unlimited UTF-8 repeat */              eptr = md->end_subject;   /* Unlimited UTF-8 repeat */
5356              SCHECK_PARTIAL();              SCHECK_PARTIAL();
5357              }              }
5358            break;            break;
5359    
5360            /* The byte case is the same as non-UTF8 */            /* The byte case is the same as non-UTF8 */
# Line 5659  switch (frame->Xwhere) Line 5827  switch (frame->Xwhere)
5827    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5828    LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)    LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5829    LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)    LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5830    LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)    LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5831      LBL(65) LBL(66)
5832  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5833    LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)    LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5834    LBL(32) LBL(34) LBL(42) LBL(46)    LBL(32) LBL(34) LBL(42) LBL(46)
# Line 5761  pcre_exec(const pcre *argument_re, const Line 5930  pcre_exec(const pcre *argument_re, const
5930    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5931    int offsetcount)    int offsetcount)
5932  {  {
5933  int rc, ocount;  int rc, ocount, arg_offset_max;
5934  int first_byte = -1;  int first_byte = -1;
5935  int req_byte = -1;  int req_byte = -1;
5936  int req_byte2 = -1;  int req_byte2 = -1;
# Line 5797  if (re == NULL || subject == NULL || Line 5966  if (re == NULL || subject == NULL ||
5966  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5967  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5968    
5969  /* This information is for finding all the numbers associated with a given  /* These two settings are used in the code for checking a UTF-8 string that
5970  name, for condition testing. */  follows immediately afterwards. Other values in the md block are used only
5971    during "normal" pcre_exec() processing, not when the JIT support is in use,
5972    so they are set up later. */
5973    
5974    utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5975    md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5976                  ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5977    
5978    /* Check a UTF-8 string if required. Pass back the character offset and error
5979    code for an invalid string if a results vector is available. */
5980    
5981    #ifdef SUPPORT_UTF8
5982    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5983      {
5984      int erroroffset;
5985      int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5986      if (errorcode != 0)
5987        {
5988        if (offsetcount >= 2)
5989          {
5990          offsets[0] = erroroffset;
5991          offsets[1] = errorcode;
5992          }
5993        return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5994          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5995        }
5996    
5997      /* Check that a start_offset points to the start of a UTF-8 character. */
5998      if (start_offset > 0 && start_offset < length &&
5999          (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6000        return PCRE_ERROR_BADUTF8_OFFSET;
6001      }
6002    #endif
6003    
6004    /* If the pattern was successfully studied with JIT support, run the JIT
6005    executable instead of the rest of this function. Most options must be set at
6006    compile time for the JIT code to be usable. Fall back to the normal code path if
6007    an unsupported flag is set. In particular, JIT does not support partial
6008    matching. */
6009    
6010    #ifdef SUPPORT_JIT
6011    if (extra_data != NULL
6012        && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6013        && extra_data->executable_jit != NULL
6014        && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6015                        PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6016      return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
6017        start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6018        ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6019    #endif
6020    
6021    /* Carry on with non-JIT matching. This information is for finding all the
6022    numbers associated with a given name, for condition testing. */
6023    
6024  md->name_table = (uschar *)re + re->name_table_offset;  md->name_table = (uschar *)re + re->name_table_offset;
6025  md->name_count = re->name_count;  md->name_count = re->name_count;
# Line 5865  md->end_subject = md->start_subject + le Line 6086  md->end_subject = md->start_subject + le
6086  end_subject = md->end_subject;  end_subject = md->end_subject;
6087    
6088  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;  
6089  md->use_ucp = (re->options & PCRE_UCP) != 0;  md->use_ucp = (re->options & PCRE_UCP) != 0;
6090  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6091    
# Line 5876  md->notbol = (options & PCRE_NOTBOL) != Line 6096  md->notbol = (options & PCRE_NOTBOL) !=
6096  md->noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
6097  md->notempty = (options & PCRE_NOTEMPTY) != 0;  md->notempty = (options & PCRE_NOTEMPTY) != 0;
6098  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :  
               ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;  
   
6099    
6100  md->hitend = FALSE;  md->hitend = FALSE;
6101  md->mark = NULL;                        /* In case never set */  md->mark = NULL;                        /* In case never set */
6102    
6103  md->recursive = NULL;                   /* No recursion at top level */  md->recursive = NULL;                   /* No recursion at top level */
6104    md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6105    
6106  md->lcc = tables + lcc_offset;  md->lcc = tables + lcc_offset;
6107  md->ctypes = tables + ctypes_offset;  md->ctypes = tables + ctypes_offset;
# Line 5961  defined (though never set). So there's n Line 6179  defined (though never set). So there's n
6179  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6180    return PCRE_ERROR_BADPARTIAL;    return PCRE_ERROR_BADPARTIAL;
6181    
 /* Check a UTF-8 string if required. Pass back the character offset and error  
 code for an invalid string if a results vector is available. */  
   
 #ifdef SUPPORT_UTF8  
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  
   {  
   int erroroffset;  
   int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);  
   if (errorcode != 0)  
     {  
     if (offsetcount >= 2)  
       {  
       offsets[0] = erroroffset;  
       offsets[1] = errorcode;  
       }  
     return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?  
       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;  
     }  
   
   /* Check that a start_offset points to the start of a UTF-8 character. */  
   
   if (start_offset > 0 && start_offset < length &&  
       (((USPTR)subject)[start_offset] & 0xc0) == 0x80)  
     return PCRE_ERROR_BADUTF8_OFFSET;  
   }  
 #endif  
   
6182  /* If the expression has got more back references than the offsets supplied can  /* If the expression has got more back references than the offsets supplied can
6183  hold, we get a temporary chunk of working store to use during the matching.  hold, we get a temporary chunk of working store to use during the matching.
6184  Otherwise, we can use the vector supplied, rounding down its size to a multiple  Otherwise, we can use the vector supplied, rounding down its size to a multiple
6185  of 3. */  of 3. */
6186    
6187  ocount = offsetcount - (offsetcount % 3);  ocount = offsetcount - (offsetcount % 3);
6188    arg_offset_max = (2*ocount)/3;
6189    
6190  if (re->top_backref > 0 && re->top_backref >= ocount/3)  if (re->top_backref > 0 && re->top_backref >= ocount/3)
6191    {    {
# Line 6368  if (rc == MATCH_MATCH || rc == MATCH_ACC Line 6560  if (rc == MATCH_MATCH || rc == MATCH_ACC
6560    {    {
6561    if (using_temporary_offsets)    if (using_temporary_offsets)
6562      {      {
6563      if (offsetcount >= 4)      if (arg_offset_max >= 4)
6564        {        {
6565        memcpy(offsets + 2, md->offset_vector + 2,        memcpy(offsets + 2, md->offset_vector + 2,
6566          (offsetcount - 2) * sizeof(int));          (arg_offset_max - 2) * sizeof(int));
6567        DPRINTF(("Copied offsets from temporary memory\n"));        DPRINTF(("Copied offsets from temporary memory\n"));
6568        }        }
6569      if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;      if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6570      DPRINTF(("Freeing temporary memory\n"));      DPRINTF(("Freeing temporary memory\n"));
6571      (pcre_free)(md->offset_vector);      (pcre_free)(md->offset_vector);
6572      }      }
6573    
6574    /* Set the return code to the number of captured strings, or 0 if there are    /* Set the return code to the number of captured strings, or 0 if there were
6575    too many to fit into the vector. */    too many to fit into the vector. */
6576    
6577    rc = md->offset_overflow? 0 : md->end_offset_top/2;    rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6578        0 : md->end_offset_top/2;
6579    
6580    /* If there is space in the offset vector, set any unused pairs at the end of    /* If there is space in the offset vector, set any unused pairs at the end of
6581    the pattern to -1 for backwards compatibility. It is documented that this    the pattern to -1 for backwards compatibility. It is documented that this
6582    happens. In earlier versions, the whole set of potential capturing offsets    happens. In earlier versions, the whole set of potential capturing offsets
6583    was set to -1 each time round the loop, but this is handled differently now.    was set to -1 each time round the loop, but this is handled differently now.
6584    "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only    "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6585    those at the end that need unsetting here. We can't just unset them all at    those at the end that need unsetting here. We can't just unset them all at
6586    the start of the whole thing because they may get set in one branch that is    the start of the whole thing because they may get set in one branch that is
6587    not the final matching branch. */    not the final matching branch. */

Legend:
Removed from v.649  
changed lines
  Added in v.723

  ViewVC Help
Powered by ViewVC 1.1.5