/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 649 by ph10, Mon Aug 1 11:23:52 2011 UTC | revision 721 by ph10, Fri Oct 7 15:51:39 2011 UTC
# Line 775  for (;;) Line 775  for (;;)
775      md->start_match_ptr = ecode + 2;      md->start_match_ptr = ecode + 2;
776      RRETURN(MATCH_SKIP_ARG);      RRETURN(MATCH_SKIP_ARG);
777    
778      /* For THEN (and THEN_ARG) we pass back the address of the bracket or      /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779      the alt that is at the start of the current branch. This makes it possible      the branch in which it occurs can be determined. Overload the start of
780      to skip back past alternatives that precede the THEN within the current      match pointer to do this. */
     branch. */  
781    
782      case OP_THEN:      case OP_THEN:
783      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
784        eptrb, RM54);        eptrb, RM54);
785      if (rrc != MATCH_NOMATCH) RRETURN(rrc);      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786      md->start_match_ptr = ecode - GET(ecode, 1);      md->start_match_ptr = ecode;
787      MRRETURN(MATCH_THEN);      MRRETURN(MATCH_THEN);
788    
789      case OP_THEN_ARG:      case OP_THEN_ARG:
790      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],      RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
791        offset_top, md, eptrb, RM58);        md, eptrb, RM58);
792      if (rrc != MATCH_NOMATCH) RRETURN(rrc);      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793      md->start_match_ptr = ecode - GET(ecode, 1);      md->start_match_ptr = ecode;
794      md->mark = ecode + LINK_SIZE + 2;      md->mark = ecode + 2;
795      RRETURN(MATCH_THEN);      RRETURN(MATCH_THEN);
796    
797      /* Handle a capturing bracket, other than those that are possessive with an      /* Handle a capturing bracket, other than those that are possessive with an
# Line 838  for (;;) Line 837  for (;;)
837          RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,          RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
838            eptrb, RM1);            eptrb, RM1);
839          if (rrc == MATCH_ONCE) break;  /* Backing up through an atomic group */          if (rrc == MATCH_ONCE) break;  /* Backing up through an atomic group */
840          if (rrc != MATCH_NOMATCH &&  
841              (rrc != MATCH_THEN || md->start_match_ptr != ecode))          /* If we backed up to a THEN, check whether it is within the current
842            RRETURN(rrc);          branch by comparing the address of the THEN that is passed back with
843            the end of the branch. If it is within the current branch, and the
844            branch is one of two or more alternatives (it either starts or ends
845            with OP_ALT), we have reached the limit of THEN's action, so convert
846            the return code to NOMATCH, which will cause normal backtracking to
847            happen from now on. Otherwise, THEN is passed back to an outer
848            alternative. This implements Perl's treatment of parenthesized groups,
849            where a group not containing | does not affect the current alternative,
850            that is, (X) is NOT the same as (X|(*F)). */
851    
852            if (rrc == MATCH_THEN)
853              {
854              next = ecode + GET(ecode,1);
855              if (md->start_match_ptr < next &&
856                  (*ecode == OP_ALT || *next == OP_ALT))
857                rrc = MATCH_NOMATCH;
858              }
859    
860            /* Anything other than NOMATCH is passed back. */
861    
862            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
863          md->capture_last = save_capture_last;          md->capture_last = save_capture_last;
864          ecode += GET(ecode, 1);          ecode += GET(ecode, 1);
865          if (*ecode != OP_ALT) break;          if (*ecode != OP_ALT) break;
# Line 851  for (;;) Line 870  for (;;)
870        md->offset_vector[offset+1] = save_offset2;        md->offset_vector[offset+1] = save_offset2;
871        md->offset_vector[md->offset_end - number] = save_offset3;        md->offset_vector[md->offset_end - number] = save_offset3;
872    
873        /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or        /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
       MATCH_THEN. */  
874    
875        if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;        if (md->mark == NULL) md->mark = markptr;
876        RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));        RRETURN(rrc);
877        }        }
878    
879      /* FALL THROUGH ... Insufficient room for saving captured contents. Treat      /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
# Line 870  for (;;) Line 888  for (;;)
888      /* VVVVVVVVVVVVVVVVVVVVVVVVV */      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
889    
890      /* Non-capturing or atomic group, except for possessive with unlimited      /* Non-capturing or atomic group, except for possessive with unlimited
891      repeat. Loop for all the alternatives. When we get to the final alternative      repeat. Loop for all the alternatives.
892      within the brackets, we used to return the result of a recursive call to  
893      match() whatever happened so it was possible to reduce stack usage by      When we get to the final alternative within the brackets, we used to return
894      turning this into a tail recursion, except in the case of a possibly empty      the result of a recursive call to match() whatever happened so it was
895      group. However, now that there is the possibility of (*THEN) occurring in      possible to reduce stack usage by turning this into a tail recursion,
896      the final alternative, this optimization is no longer possible.      except in the case of a possibly empty group. However, now that there is
897        the possibility of (*THEN) occurring in the final alternative, this
898        optimization is no longer always possible.
899    
900        We can optimize if we know there are no (*THEN)s in the pattern; at present
901        this is the best that can be done.
902    
903      MATCH_ONCE is returned when the end of an atomic group is successfully      MATCH_ONCE is returned when the end of an atomic group is successfully
904      reached, but subsequent matching fails. It passes back up the tree (causing      reached, but subsequent matching fails. It passes back up the tree (causing
# Line 892  for (;;) Line 915  for (;;)
915      for (;;)      for (;;)
916        {        {
917        if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;        if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
918    
919          /* If this is not a possibly empty group, and there are no (*THEN)s in
920          the pattern, and this is the final alternative, optimize as described
921          above. */
922    
923          else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
924            {
925            ecode += _pcre_OP_lengths[*ecode];
926            goto TAIL_RECURSE;
927            }
928    
929          /* In all other cases, we have to make another call to match(). */
930    
931        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
932          RM2);          RM2);
933        if (rrc != MATCH_NOMATCH &&  
934            (rrc != MATCH_THEN || md->start_match_ptr != ecode))        /* See comment in the code for capturing groups above about handling
935          THEN. */
936    
937          if (rrc == MATCH_THEN)
938            {
939            next = ecode + GET(ecode,1);
940            if (md->start_match_ptr < next &&
941                (*ecode == OP_ALT || *next == OP_ALT))
942              rrc = MATCH_NOMATCH;
943            }
944    
945          if (rrc != MATCH_NOMATCH)
946          {          {
947          if (rrc == MATCH_ONCE)          if (rrc == MATCH_ONCE)
948            {            {
# Line 912  for (;;) Line 959  for (;;)
959        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
960        if (*ecode != OP_ALT) break;        if (*ecode != OP_ALT) break;
961        }        }
962      if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;  
963        if (md->mark == NULL) md->mark = markptr;
964      RRETURN(MATCH_NOMATCH);      RRETURN(MATCH_NOMATCH);
965    
966      /* Handle possessive capturing brackets with an unlimited repeat. We come      /* Handle possessive capturing brackets with an unlimited repeat. We come
# Line 975  for (;;) Line 1023  for (;;)
1023            matched_once = TRUE;            matched_once = TRUE;
1024            continue;            continue;
1025            }            }
1026          if (rrc != MATCH_NOMATCH &&  
1027              (rrc != MATCH_THEN || md->start_match_ptr != ecode))          /* See comment in the code for capturing groups above about handling
1028            RRETURN(rrc);          THEN. */
1029    
1030            if (rrc == MATCH_THEN)
1031              {
1032              next = ecode + GET(ecode,1);
1033              if (md->start_match_ptr < next &&
1034                  (*ecode == OP_ALT || *next == OP_ALT))
1035                rrc = MATCH_NOMATCH;
1036              }
1037    
1038            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1039          md->capture_last = save_capture_last;          md->capture_last = save_capture_last;
1040          ecode += GET(ecode, 1);          ecode += GET(ecode, 1);
1041          if (*ecode != OP_ALT) break;          if (*ecode != OP_ALT) break;
# Line 990  for (;;) Line 1048  for (;;)
1048          md->offset_vector[md->offset_end - number] = save_offset3;          md->offset_vector[md->offset_end - number] = save_offset3;
1049          }          }
1050    
1051        if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;        if (md->mark == NULL) md->mark = markptr;
1052        if (allow_zero || matched_once)        if (allow_zero || matched_once)
1053          {          {
1054          ecode += 1 + LINK_SIZE;          ecode += 1 + LINK_SIZE;
# Line 1037  for (;;) Line 1095  for (;;)
1095          matched_once = TRUE;          matched_once = TRUE;
1096          continue;          continue;
1097          }          }
1098        if (rrc != MATCH_NOMATCH &&  
1099            (rrc != MATCH_THEN || md->start_match_ptr != ecode))        /* See comment in the code for capturing groups above about handling
1100          RRETURN(rrc);        THEN. */
1101    
1102          if (rrc == MATCH_THEN)
1103            {
1104            next = ecode + GET(ecode,1);
1105            if (md->start_match_ptr < next &&
1106                (*ecode == OP_ALT || *next == OP_ALT))
1107              rrc = MATCH_NOMATCH;
1108            }
1109    
1110          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1111        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
1112        if (*ecode != OP_ALT) break;        if (*ecode != OP_ALT) break;
1113        }        }
# Line 1082  for (;;) Line 1150  for (;;)
1150          cb.capture_top      = offset_top/2;          cb.capture_top      = offset_top/2;
1151          cb.capture_last     = md->capture_last;          cb.capture_last     = md->capture_last;
1152          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
1153          cb.mark             = markptr;          cb.mark             = markptr;
1154          if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);          if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1155          if (rrc < 0) RRETURN(rrc);          if (rrc < 0) RRETURN(rrc);
1156          }          }
# Line 1251  for (;;) Line 1319  for (;;)
1319          ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);          ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1320          while (*ecode == OP_ALT) ecode += GET(ecode, 1);          while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1321          }          }
1322        else if (rrc != MATCH_NOMATCH &&  
1323                (rrc != MATCH_THEN || md->start_match_ptr != ecode))        /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1324          assertion; it is therefore treated as NOMATCH. */
1325    
1326          else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1327          {          {
1328          RRETURN(rrc);         /* Need braces because of following else */          RRETURN(rrc);         /* Need braces because of following else */
1329          }          }
# Line 1263  for (;;) Line 1334  for (;;)
1334          }          }
1335        }        }
1336    
1337      /* We are now at the branch that is to be obeyed. As there is only one,      /* We are now at the branch that is to be obeyed. As there is only one, we can
1338      we used to use tail recursion to avoid using another stack frame, except      use tail recursion to avoid using another stack frame, except when there is
1339      when there was unlimited repeat of a possibly empty group. However, that      unlimited repeat of a possibly empty group. In the latter case, a recursive
1340      strategy no longer works because of the possibility of (*THEN) being      call to match() is always required, unless the second alternative doesn't
1341      encountered in the branch. A recursive call to match() is always required,      exist, in which case we can just plough on. Note that, for compatibility
1342      unless the second alternative doesn't exist, in which case we can just      with Perl, the | in a conditional group is NOT treated as creating two
1343      plough on. */      alternatives. If a THEN is encountered in the branch, it propagates out to
1344        the enclosing alternative (unless nested in a deeper set of alternatives,
1345        of course). */
1346    
1347      if (condition || *ecode == OP_ALT)      if (condition || *ecode == OP_ALT)
1348        {        {
1349        if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;        if (op != OP_SCOND)
1350            {
1351            ecode += 1 + LINK_SIZE;
1352            goto TAIL_RECURSE;
1353            }
1354    
1355          md->match_function_type = MATCH_CBEGROUP;
1356        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
       if (rrc == MATCH_THEN && md->start_match_ptr == ecode)  
         rrc = MATCH_NOMATCH;  
1357        RRETURN(rrc);        RRETURN(rrc);
1358        }        }
1359      else                         /* Condition false & no alternative */  
1360         /* Condition false & no alternative; continue after the group. */
1361    
1362        else
1363        {        {
1364        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1365        }        }
# Line 1369  for (;;) Line 1449  for (;;)
1449          markptr = md->mark;          markptr = md->mark;
1450          break;          break;
1451          }          }
1452        if (rrc != MATCH_NOMATCH &&  
1453            (rrc != MATCH_THEN || md->start_match_ptr != ecode))        /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1454          RRETURN(rrc);        as NOMATCH. */
1455    
1456          if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1457        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
1458        }        }
1459      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 1412  for (;;) Line 1494  for (;;)
1494          do ecode += GET(ecode,1); while (*ecode == OP_ALT);          do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1495          break;          break;
1496          }          }
1497        if (rrc != MATCH_NOMATCH &&  
1498            (rrc != MATCH_THEN || md->start_match_ptr != ecode))        /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1499          RRETURN(rrc);        as NOMATCH. */
1500    
1501          if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1502        ecode += GET(ecode,1);        ecode += GET(ecode,1);
1503        }        }
1504      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 1477  for (;;) Line 1561  for (;;)
1561        cb.capture_top      = offset_top/2;        cb.capture_top      = offset_top/2;
1562        cb.capture_last     = md->capture_last;        cb.capture_last     = md->capture_last;
1563        cb.callout_data     = md->callout_data;        cb.callout_data     = md->callout_data;
1564        cb.mark             = markptr;        cb.mark             = markptr;
1565        if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);        if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1566        if (rrc < 0) RRETURN(rrc);        if (rrc < 0) RRETURN(rrc);
1567        }        }
# Line 1505  for (;;) Line 1589  for (;;)
1589        {        {
1590        recursion_info *ri;        recursion_info *ri;
1591        int recno;        int recno;
1592    
1593        callpat = md->start_code + GET(ecode, 1);        callpat = md->start_code + GET(ecode, 1);
1594        recno = (callpat == md->start_code)? 0 :        recno = (callpat == md->start_code)? 0 :
1595          GET2(callpat, 1 + LINK_SIZE);          GET2(callpat, 1 + LINK_SIZE);
1596    
1597        /* Check for repeating a recursion without advancing the subject pointer.        /* Check for repeating a recursion without advancing the subject pointer.
1598        This should catch convoluted mutual recursions. (Some simple cases are        This should catch convoluted mutual recursions. (Some simple cases are
1599        caught at compile time.) */        caught at compile time.) */
1600    
1601        for (ri = md->recursive; ri != NULL; ri = ri->prevrec)        for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1602          if (recno == ri->group_num && eptr == ri->subject_position)          if (recno == ri->group_num && eptr == ri->subject_position)
1603            RRETURN(PCRE_ERROR_RECURSELOOP);            RRETURN(PCRE_ERROR_RECURSELOOP);
1604    
1605        /* Add to "recursing stack" */        /* Add to "recursing stack" */
# Line 1556  for (;;) Line 1640  for (;;)
1640            md, eptrb, RM6);            md, eptrb, RM6);
1641          memcpy(md->offset_vector, new_recursive.offset_save,          memcpy(md->offset_vector, new_recursive.offset_save,
1642              new_recursive.saved_max * sizeof(int));              new_recursive.saved_max * sizeof(int));
1643            md->recursive = new_recursive.prevrec;
1644          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1645            {            {
1646            DPRINTF(("Recursion matched\n"));            DPRINTF(("Recursion matched\n"));
           md->recursive = new_recursive.prevrec;  
1647            if (new_recursive.offset_save != stacksave)            if (new_recursive.offset_save != stacksave)
1648              (pcre_free)(new_recursive.offset_save);              (pcre_free)(new_recursive.offset_save);
1649    
# Line 1571  for (;;) Line 1655  for (;;)
1655            mstart = md->start_match_ptr;            mstart = md->start_match_ptr;
1656            goto RECURSION_MATCHED;        /* Exit loop; end processing */            goto RECURSION_MATCHED;        /* Exit loop; end processing */
1657            }            }
1658          else if (rrc != MATCH_NOMATCH &&  
1659                  (rrc != MATCH_THEN || md->start_match_ptr != ecode))          /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1660            as NOMATCH. */
1661    
1662            else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1663            {            {
1664            DPRINTF(("Recursion gave error %d\n", rrc));            DPRINTF(("Recursion gave error %d\n", rrc));
1665            if (new_recursive.offset_save != stacksave)            if (new_recursive.offset_save != stacksave)
# Line 1663  for (;;) Line 1750  for (;;)
1750      assertions. We also need to record the match start in case it was changed      assertions. We also need to record the match start in case it was changed
1751      by \K. */      by \K. */
1752    
1753      if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||      if (*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT)
         *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)  
1754        {        {
1755        md->end_match_ptr = eptr;      /* For ONCE */        md->end_match_ptr = eptr;      /* For ONCE */
1756        md->end_offset_top = offset_top;        md->end_offset_top = offset_top;
# Line 2033  for (;;) Line 2119  for (;;)
2119        SCHECK_PARTIAL();        SCHECK_PARTIAL();
2120        MRRETURN(MATCH_NOMATCH);        MRRETURN(MATCH_NOMATCH);
2121        }        }
2122      eptr++;      eptr++;
2123      ecode++;      ecode++;
2124      break;      break;
2125    
# Line 5182  for (;;) Line 5268  for (;;)
5268                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5269                }                }
5270              }              }
5271            else            else
5272              {              {
5273              eptr = md->end_subject;   /* Unlimited UTF-8 repeat */              eptr = md->end_subject;   /* Unlimited UTF-8 repeat */
5274              SCHECK_PARTIAL();              SCHECK_PARTIAL();
5275              }              }
5276            break;            break;
5277    
5278            /* The byte case is the same as non-UTF8 */            /* The byte case is the same as non-UTF8 */
# Line 5761  pcre_exec(const pcre *argument_re, const Line 5847  pcre_exec(const pcre *argument_re, const
5847    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5848    int offsetcount)    int offsetcount)
5849  {  {
5850  int rc, ocount;  int rc, ocount, arg_offset_max;
5851  int first_byte = -1;  int first_byte = -1;
5852  int req_byte = -1;  int req_byte = -1;
5853  int req_byte2 = -1;  int req_byte2 = -1;
# Line 5797  if (re == NULL || subject == NULL || Line 5883  if (re == NULL || subject == NULL ||
5883  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5884  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5885    
5886  /* This information is for finding all the numbers associated with a given  /* These two settings are used in the code for checking a UTF-8 string that
5887  name, for condition testing. */  follows immediately afterwards. Other values in the md block are used only
5888    during "normal" pcre_exec() processing, not when the JIT support is in use,
5889    so they are set up later. */
5890    
5891    utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5892    md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5893                  ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5894    
5895    /* Check a UTF-8 string if required. Pass back the character offset and error
5896    code for an invalid string if a results vector is available. */
5897    
5898    #ifdef SUPPORT_UTF8
5899    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5900      {
5901      int erroroffset;
5902      int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5903      if (errorcode != 0)
5904        {
5905        if (offsetcount >= 2)
5906          {
5907          offsets[0] = erroroffset;
5908          offsets[1] = errorcode;
5909          }
5910        return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5911          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5912        }
5913    
5914      /* Check that a start_offset points to the start of a UTF-8 character. */
5915      if (start_offset > 0 && start_offset < length &&
5916          (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5917        return PCRE_ERROR_BADUTF8_OFFSET;
5918      }
5919    #endif
5920    
5921    /* If the pattern was successfully studied with JIT support, run the JIT
5922    executable instead of the rest of this function. Most options must be set at
5923    compile time for the JIT code to be usable. Fallback to the normal code path if
5924    an unsupported flag is set. In particular, JIT does not support partial
5925    matching. */
5926    
5927    #ifdef SUPPORT_JIT
5928    if (extra_data != NULL
5929        && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
5930        && extra_data->executable_jit != NULL
5931        && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
5932                        PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
5933      return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
5934        start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
5935        ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
5936    #endif
5937    
5938    /* Carry on with non-JIT matching. This information is for finding all the
5939    numbers associated with a given name, for condition testing. */
5940    
5941  md->name_table = (uschar *)re + re->name_table_offset;  md->name_table = (uschar *)re + re->name_table_offset;
5942  md->name_count = re->name_count;  md->name_count = re->name_count;
# Line 5865  md->end_subject = md->start_subject + le Line 6003  md->end_subject = md->start_subject + le
6003  end_subject = md->end_subject;  end_subject = md->end_subject;
6004    
6005  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;  
6006  md->use_ucp = (re->options & PCRE_UCP) != 0;  md->use_ucp = (re->options & PCRE_UCP) != 0;
6007  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6008    
# Line 5876  md->notbol = (options & PCRE_NOTBOL) != Line 6013  md->notbol = (options & PCRE_NOTBOL) !=
6013  md->noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
6014  md->notempty = (options & PCRE_NOTEMPTY) != 0;  md->notempty = (options & PCRE_NOTEMPTY) != 0;
6015  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :  
               ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;  
   
6016    
6017  md->hitend = FALSE;  md->hitend = FALSE;
6018  md->mark = NULL;                        /* In case never set */  md->mark = NULL;                        /* In case never set */
6019    
6020  md->recursive = NULL;                   /* No recursion at top level */  md->recursive = NULL;                   /* No recursion at top level */
6021    md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6022    
6023  md->lcc = tables + lcc_offset;  md->lcc = tables + lcc_offset;
6024  md->ctypes = tables + ctypes_offset;  md->ctypes = tables + ctypes_offset;
# Line 5961  defined (though never set). So there's n Line 6096  defined (though never set). So there's n
6096  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6097    return PCRE_ERROR_BADPARTIAL;    return PCRE_ERROR_BADPARTIAL;
6098    
 /* Check a UTF-8 string if required. Pass back the character offset and error  
 code for an invalid string if a results vector is available. */  
   
 #ifdef SUPPORT_UTF8  
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  
   {  
   int erroroffset;  
   int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);  
   if (errorcode != 0)  
     {  
     if (offsetcount >= 2)  
       {  
       offsets[0] = erroroffset;  
       offsets[1] = errorcode;  
       }  
     return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?  
       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;  
     }  
   
   /* Check that a start_offset points to the start of a UTF-8 character. */  
   
   if (start_offset > 0 && start_offset < length &&  
       (((USPTR)subject)[start_offset] & 0xc0) == 0x80)  
     return PCRE_ERROR_BADUTF8_OFFSET;  
   }  
 #endif  
   
6099  /* If the expression has got more back references than the offsets supplied can  /* If the expression has got more back references than the offsets supplied can
6100  hold, we get a temporary chunk of working store to use during the matching.  hold, we get a temporary chunk of working store to use during the matching.
6101  Otherwise, we can use the vector supplied, rounding down its size to a multiple  Otherwise, we can use the vector supplied, rounding down its size to a multiple
6102  of 3. */  of 3. */
6103    
6104  ocount = offsetcount - (offsetcount % 3);  ocount = offsetcount - (offsetcount % 3);
6105    arg_offset_max = (2*ocount)/3;
6106    
6107  if (re->top_backref > 0 && re->top_backref >= ocount/3)  if (re->top_backref > 0 && re->top_backref >= ocount/3)
6108    {    {
# Line 6368  if (rc == MATCH_MATCH || rc == MATCH_ACC Line 6477  if (rc == MATCH_MATCH || rc == MATCH_ACC
6477    {    {
6478    if (using_temporary_offsets)    if (using_temporary_offsets)
6479      {      {
6480      if (offsetcount >= 4)      if (arg_offset_max >= 4)
6481        {        {
6482        memcpy(offsets + 2, md->offset_vector + 2,        memcpy(offsets + 2, md->offset_vector + 2,
6483          (offsetcount - 2) * sizeof(int));          (arg_offset_max - 2) * sizeof(int));
6484        DPRINTF(("Copied offsets from temporary memory\n"));        DPRINTF(("Copied offsets from temporary memory\n"));
6485        }        }
6486      if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;      if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6487      DPRINTF(("Freeing temporary memory\n"));      DPRINTF(("Freeing temporary memory\n"));
6488      (pcre_free)(md->offset_vector);      (pcre_free)(md->offset_vector);
6489      }      }
6490    
6491    /* Set the return code to the number of captured strings, or 0 if there are    /* Set the return code to the number of captured strings, or 0 if there were
6492    too many to fit into the vector. */    too many to fit into the vector. */
6493    
6494    rc = md->offset_overflow? 0 : md->end_offset_top/2;    rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6495        0 : md->end_offset_top/2;
6496    
6497    /* If there is space in the offset vector, set any unused pairs at the end of    /* If there is space in the offset vector, set any unused pairs at the end of
6498    the pattern to -1 for backwards compatibility. It is documented that this    the pattern to -1 for backwards compatibility. It is documented that this
6499    happens. In earlier versions, the whole set of potential capturing offsets    happens. In earlier versions, the whole set of potential capturing offsets
6500    was set to -1 each time round the loop, but this is handled differently now.    was set to -1 each time round the loop, but this is handled differently now.
6501    "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only    "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6502    those at the end that need unsetting here. We can't just unset them all at    those at the end that need unsetting here. We can't just unset them all at
6503    the start of the whole thing because they may get set in one branch that is    the start of the whole thing because they may get set in one branch that is
6504    not the final matching branch. */    not the final matching branch. */

Legend:
Removed from v.649  
changed lines
  Added in v.721

  ViewVC Help
Powered by ViewVC 1.1.5