# Diff of /code/trunk/pcre_exec.c

revision 626 by ph10, Wed Jul 20 17:51:54 2011 UTC revision 713 by ph10, Tue Sep 27 11:03:15 2011 UTC
# Line 870  for (;;) Line 870  for (;;)
870      /* VVVVVVVVVVVVVVVVVVVVVVVVV */      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871
872      /* Non-capturing or atomic group, except for possessive with unlimited      /* Non-capturing or atomic group, except for possessive with unlimited
873      repeat. Loop for all the alternatives. When we get to the final alternative      repeat. Loop for all the alternatives.
874      within the brackets, we used to return the result of a recursive call to
875      match() whatever happened so it was possible to reduce stack usage by      When we get to the final alternative within the brackets, we used to return
876      turning this into a tail recursion, except in the case of a possibly empty      the result of a recursive call to match() whatever happened so it was
877      group. However, now that there is the possibility of (*THEN) occurring in      possible to reduce stack usage by turning this into a tail recursion,
878      the final alternative, this optimization is no longer possible.      except in the case of a possibly empty group. However, now that there is
879        the possibility of (*THEN) occurring in the final alternative, this
880        optimization is no longer always possible.
881
882        We can optimize if we know there are no (*THEN)s in the pattern; at present
883        this is the best that can be done.
884
885      MATCH_ONCE is returned when the end of an atomic group is successfully      MATCH_ONCE is returned when the end of an atomic group is successfully
886      reached, but subsequent matching fails. It passes back up the tree (causing      reached, but subsequent matching fails. It passes back up the tree (causing
# Line 892  for (;;) Line 897  for (;;)
897      for (;;)      for (;;)
898        {        {
899        if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;        if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
900
901          /* If this is not a possibly empty group, and there are no (*THEN)s in
902          the pattern, and this is the final alternative, optimize as described
903          above. */
904
905          else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
906            {
907            ecode += _pcre_OP_lengths[*ecode];
908            goto TAIL_RECURSE;
909            }
910
911          /* In all other cases, we have to make another call to match(). */
912
913        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
914          RM2);          RM2);
915        if (rrc != MATCH_NOMATCH &&        if (rrc != MATCH_NOMATCH &&
# Line 1070  for (;;) Line 1088  for (;;)
1088        if (pcre_callout != NULL)        if (pcre_callout != NULL)
1089          {          {
1090          pcre_callout_block cb;          pcre_callout_block cb;
1091          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 2;   /* Version 2 of the callout block */
1093          cb.offset_vector    = md->offset_vector;          cb.offset_vector    = md->offset_vector;
1094          cb.subject          = (PCRE_SPTR)md->start_subject;          cb.subject          = (PCRE_SPTR)md->start_subject;
# Line 1082  for (;;) Line 1100  for (;;)
1100          cb.capture_top      = offset_top/2;          cb.capture_top      = offset_top/2;
1101          cb.capture_last     = md->capture_last;          cb.capture_last     = md->capture_last;
1102          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
1103            cb.mark             = markptr;
1104          if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);          if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1105          if (rrc < 0) RRETURN(rrc);          if (rrc < 0) RRETURN(rrc);
1106          }          }
# Line 1263  for (;;) Line 1282  for (;;)
1282        }        }
1283
1284      /* We are now at the branch that is to be obeyed. As there is only one,      /* We are now at the branch that is to be obeyed. As there is only one,
1285      we used to use tail recursion to avoid using another stack frame, except      we used always to use tail recursion to avoid using another stack frame,
1286      when there was unlimited repeat of a possibly empty group. However, that      except when there was unlimited repeat of a possibly empty group. However,
1287      strategy no longer works because of the possibility of (*THEN) being      that strategy no longer works because of the possibility of (*THEN) being
1288      encountered in the branch. A recursive call to match() is always required,      encountered in the branch. However, we can still use tail recursion if
1289      unless the second alternative doesn't exist, in which case we can just      there are no (*THEN)s in the pattern. Otherwise, a recursive call to
1290      plough on. */      match() is always required, unless the second alternative doesn't exist, in
1291        which case we can just plough on. */
1292
1293      if (condition || *ecode == OP_ALT)      if (condition || *ecode == OP_ALT)
1294        {        {
1295        if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;        if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1296          else if (!md->hasthen)
1297            {
1298            ecode += 1 + LINK_SIZE;
1299            goto TAIL_RECURSE;
1300            }
1301
1302          /* A call to match() is required. */
1303
1304        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1305        if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1306          rrc = MATCH_NOMATCH;        /* If the result is THEN from within the "true" branch of the condition,
1307          md->start_match_ptr will point to the original OP_COND, not to the start
1308          of the branch, so we have to do work to see if it matches. If THEN comes
1309          from the "false" branch, md->start_match_ptr does point to OP_ALT. */
1310
1311          if (rrc == MATCH_THEN)
1312            {
1313            if (*ecode != OP_ALT)
1314              {
1315              do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1316              ecode -= GET(ecode, 1);
1317              }
1318            if (md->start_match_ptr == ecode) rrc = MATCH_NOMATCH;
1319            }
1320        RRETURN(rrc);        RRETURN(rrc);
1321        }        }
1322      else                         /* Condition false & no alternative */
1323         /* Condition false & no alternative; continue after the group. */
1324
1325        else
1326        {        {
1328        }        }
# Line 1365  for (;;) Line 1409  for (;;)
1409        if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)        if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1410          {          {
1411          mstart = md->start_match_ptr;   /* In case \K reset it */          mstart = md->start_match_ptr;   /* In case \K reset it */
1412            markptr = md->mark;
1413          break;          break;
1414          }          }
1415        if (rrc != MATCH_NOMATCH &&        if (rrc != MATCH_NOMATCH &&
# Line 1463  for (;;) Line 1508  for (;;)
1508      if (pcre_callout != NULL)      if (pcre_callout != NULL)
1509        {        {
1510        pcre_callout_block cb;        pcre_callout_block cb;
1511        cb.version          = 1;   /* Version 1 of the callout block */        cb.version          = 2;   /* Version 2 of the callout block */
1512        cb.callout_number   = ecode[1];        cb.callout_number   = ecode[1];
1513        cb.offset_vector    = md->offset_vector;        cb.offset_vector    = md->offset_vector;
1514        cb.subject          = (PCRE_SPTR)md->start_subject;        cb.subject          = (PCRE_SPTR)md->start_subject;
# Line 1475  for (;;) Line 1520  for (;;)
1520        cb.capture_top      = offset_top/2;        cb.capture_top      = offset_top/2;
1521        cb.capture_last     = md->capture_last;        cb.capture_last     = md->capture_last;
1522        cb.callout_data     = md->callout_data;        cb.callout_data     = md->callout_data;
1523          cb.mark             = markptr;
1524        if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);        if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1525        if (rrc < 0) RRETURN(rrc);        if (rrc < 0) RRETURN(rrc);
1526        }        }
# Line 1500  for (;;) Line 1546  for (;;)
1546
1547      case OP_RECURSE:      case OP_RECURSE:
1548        {        {
1549          recursion_info *ri;
1550          int recno;
1551
1552        callpat = md->start_code + GET(ecode, 1);        callpat = md->start_code + GET(ecode, 1);
1553        new_recursive.group_num = (callpat == md->start_code)? 0 :        recno = (callpat == md->start_code)? 0 :
1555
1556          /* Check for repeating a recursion without advancing the subject pointer.
1557          This should catch convoluted mutual recursions. (Some simple cases are
1558          caught at compile time.) */
1559
1560          for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1561            if (recno == ri->group_num && eptr == ri->subject_position)
1562              RRETURN(PCRE_ERROR_RECURSELOOP);
1563
1564        /* Add to "recursing stack" */        /* Add to "recursing stack" */
1565
1566          new_recursive.group_num = recno;
1567          new_recursive.subject_position = eptr;
1568        new_recursive.prevrec = md->recursive;        new_recursive.prevrec = md->recursive;
1569        md->recursive = &new_recursive;        md->recursive = &new_recursive;
1570
# Line 1540  for (;;) Line 1599  for (;;)
1599            md, eptrb, RM6);            md, eptrb, RM6);
1600          memcpy(md->offset_vector, new_recursive.offset_save,          memcpy(md->offset_vector, new_recursive.offset_save,
1601              new_recursive.saved_max * sizeof(int));              new_recursive.saved_max * sizeof(int));
1602            md->recursive = new_recursive.prevrec;
1603          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1604            {            {
1605            DPRINTF(("Recursion matched\n"));            DPRINTF(("Recursion matched\n"));
md->recursive = new_recursive.prevrec;
1606            if (new_recursive.offset_save != stacksave)            if (new_recursive.offset_save != stacksave)
1607              (pcre_free)(new_recursive.offset_save);              (pcre_free)(new_recursive.offset_save);
1608
# Line 1556  for (;;) Line 1615  for (;;)
1615            goto RECURSION_MATCHED;        /* Exit loop; end processing */            goto RECURSION_MATCHED;        /* Exit loop; end processing */
1616            }            }
1617          else if (rrc != MATCH_NOMATCH &&          else if (rrc != MATCH_NOMATCH &&
1618                  (rrc != MATCH_THEN || md->start_match_ptr != ecode))                  (rrc != MATCH_THEN || md->start_match_ptr != callpat))
1619            {            {
1620            DPRINTF(("Recursion gave error %d\n", rrc));            DPRINTF(("Recursion gave error %d\n", rrc));
1621            if (new_recursive.offset_save != stacksave)            if (new_recursive.offset_save != stacksave)
# Line 1653  for (;;) Line 1712  for (;;)
1712        md->end_match_ptr = eptr;      /* For ONCE */        md->end_match_ptr = eptr;      /* For ONCE */
1713        md->end_offset_top = offset_top;        md->end_offset_top = offset_top;
1714        md->start_match_ptr = mstart;        md->start_match_ptr = mstart;
1715        MRRETURN(MATCH_MATCH);        MRRETURN(MATCH_MATCH);         /* Sets md->mark */
1716        }        }
1717
1718      /* For capturing groups we have to check the group number back at the start      /* For capturing groups we have to check the group number back at the start
# Line 1998  for (;;) Line 2057  for (;;)
2057      /* Fall through */      /* Fall through */
2058
2059      case OP_ALLANY:      case OP_ALLANY:
2060      if (eptr++ >= md->end_subject)      if (eptr >= md->end_subject)   /* DO NOT merge the eptr++ here; it must */
2061        {        {                            /* not be updated before SCHECK_PARTIAL. */
2062        SCHECK_PARTIAL();        SCHECK_PARTIAL();
2063        MRRETURN(MATCH_NOMATCH);        MRRETURN(MATCH_NOMATCH);
2064        }        }
2065        eptr++;
2066      if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;      if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2067      ecode++;      ecode++;
2068      break;      break;
# Line 2011  for (;;) Line 2071  for (;;)
2071      any byte, even newline, independent of the setting of PCRE_DOTALL. */      any byte, even newline, independent of the setting of PCRE_DOTALL. */
2072
2073      case OP_ANYBYTE:      case OP_ANYBYTE:
2074      if (eptr++ >= md->end_subject)      if (eptr >= md->end_subject)   /* DO NOT merge the eptr++ here; it must */
2075        {        {                            /* not be updated before SCHECK_PARTIAL. */
2076        SCHECK_PARTIAL();        SCHECK_PARTIAL();
2077        MRRETURN(MATCH_NOMATCH);        MRRETURN(MATCH_NOMATCH);
2078        }        }
2079        eptr++;
2080      ecode++;      ecode++;
2081      break;      break;
2082
# Line 5164  for (;;) Line 5225  for (;;)
5225                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5226                }                }
5227              }              }
5228            else eptr = md->end_subject;   /* Unlimited UTF-8 repeat */            else
5229                {
5230                eptr = md->end_subject;   /* Unlimited UTF-8 repeat */
5231                SCHECK_PARTIAL();
5232                }
5233            break;            break;
5234
5235            /* The byte case is the same as non-UTF8 */            /* The byte case is the same as non-UTF8 */
# Line 5739  pcre_exec(const pcre *argument_re, const Line 5804  pcre_exec(const pcre *argument_re, const
5804    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5805    int offsetcount)    int offsetcount)
5806  {  {
5807  int rc, ocount;  int rc, ocount, arg_offset_max;
5808  int first_byte = -1;  int first_byte = -1;
5809  int req_byte = -1;  int req_byte = -1;
5810  int req_byte2 = -1;  int req_byte2 = -1;
# Line 5775  if (re == NULL || subject == NULL || Line 5840  if (re == NULL || subject == NULL ||
5840  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5841  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5842
5843  /* This information is for finding all the numbers associated with a given  /* These two settings are used in the code for checking a UTF-8 string that
5844  name, for condition testing. */  follows immediately afterwards. Other values in the md block are used only
5845    during "normal" pcre_exec() processing, not when the JIT support is in use,
5846    so they are set up later. */
5847
5848    utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5849    md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5850                  ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5851
5852    /* Check a UTF-8 string if required. Pass back the character offset and error
5853    code for an invalid string if a results vector is available. */
5854
5855    #ifdef SUPPORT_UTF8
5856    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5857      {
5858      int erroroffset;
5859      int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5860      if (errorcode != 0)
5861        {
5862        if (offsetcount >= 2)
5863          {
5864          offsets[0] = erroroffset;
5865          offsets[1] = errorcode;
5866          }
5867        return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5869        }
5870
5871      /* Check that a start_offset points to the start of a UTF-8 character. */
5872      if (start_offset > 0 && start_offset < length &&
5873          (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5875      }
5876    #endif
5877
5878    /* If the pattern was successfully studied with JIT support, run the JIT
5879    executable instead of the rest of this function. Most options must be set at
5880    compile time for the JIT code to be usable. Fall back to the normal code path if
5881    an unsupported flag is set. In particular, JIT does not support partial
5882    matching. */
5883
5884    #ifdef SUPPORT_JIT
5885    if (extra_data != NULL
5886        && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
5887        && extra_data->executable_jit != NULL
5888        && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
5889                        PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
5890      return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
5891        start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
5892        ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
5893    #endif
5894
5895    /* Carry on with non-JIT matching. This information is for finding all the
5896    numbers associated with a given name, for condition testing. */
5897
5898  md->name_table = (uschar *)re + re->name_table_offset;  md->name_table = (uschar *)re + re->name_table_offset;
5899  md->name_count = re->name_count;  md->name_count = re->name_count;
# Line 5843  md->end_subject = md->start_subject + le Line 5960  md->end_subject = md->start_subject + le
5960  end_subject = md->end_subject;  end_subject = md->end_subject;
5961
5962  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5963  md->use_ucp = (re->options & PCRE_UCP) != 0;  md->use_ucp = (re->options & PCRE_UCP) != 0;
5964  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5965
# Line 5854  md->notbol = (options & PCRE_NOTBOL) != Line 5970  md->notbol = (options & PCRE_NOTBOL) !=
5970  md->noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
5971  md->notempty = (options & PCRE_NOTEMPTY) != 0;  md->notempty = (options & PCRE_NOTEMPTY) != 0;
5972  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;

5973
5974  md->hitend = FALSE;  md->hitend = FALSE;
5975  md->mark = NULL;                        /* In case never set */  md->mark = NULL;                        /* In case never set */
5976
5977  md->recursive = NULL;                   /* No recursion at top level */  md->recursive = NULL;                   /* No recursion at top level */
5978    md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
5979
5980  md->lcc = tables + lcc_offset;  md->lcc = tables + lcc_offset;
5981  md->ctypes = tables + ctypes_offset;  md->ctypes = tables + ctypes_offset;
# Line 5939  defined (though never set). So there's n Line 6053  defined (though never set). So there's n
6053  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6055
/* Check a UTF-8 string if required. Pass back the character offset and error
code for an invalid string if a results vector is available. */

#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
int erroroffset;
int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
if (errorcode != 0)
{
if (offsetcount >= 2)
{
offsets[0] = erroroffset;
offsets[1] = errorcode;
}
return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
}

/* Check that a start_offset points to the start of a UTF-8 character. */

if (start_offset > 0 && start_offset < length &&
(((USPTR)subject)[start_offset] & 0xc0) == 0x80)
}
#endif

6056  /* If the expression has got more back references than the offsets supplied can  /* If the expression has got more back references than the offsets supplied can
6057  hold, we get a temporary chunk of working store to use during the matching.  hold, we get a temporary chunk of working store to use during the matching.
6058  Otherwise, we can use the vector supplied, rounding down its size to a multiple  Otherwise, we can use the vector supplied, rounding down its size to a multiple
6059  of 3. */  of 3. */
6060
6061  ocount = offsetcount - (offsetcount % 3);  ocount = offsetcount - (offsetcount % 3);
6062    arg_offset_max = (2*ocount)/3;
6063
6064  if (re->top_backref > 0 && re->top_backref >= ocount/3)  if (re->top_backref > 0 && re->top_backref >= ocount/3)
6065    {    {
# Line 6346  if (rc == MATCH_MATCH || rc == MATCH_ACC Line 6434  if (rc == MATCH_MATCH || rc == MATCH_ACC
6434    {    {
6435    if (using_temporary_offsets)    if (using_temporary_offsets)
6436      {      {
6437      if (offsetcount >= 4)      if (arg_offset_max >= 4)
6438        {        {
6439        memcpy(offsets + 2, md->offset_vector + 2,        memcpy(offsets + 2, md->offset_vector + 2,
6440          (offsetcount - 2) * sizeof(int));          (arg_offset_max - 2) * sizeof(int));
6441        DPRINTF(("Copied offsets from temporary memory\n"));        DPRINTF(("Copied offsets from temporary memory\n"));
6442        }        }
6443      if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;      if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6444      DPRINTF(("Freeing temporary memory\n"));      DPRINTF(("Freeing temporary memory\n"));
6445      (pcre_free)(md->offset_vector);      (pcre_free)(md->offset_vector);
6446      }      }
6447
6448    /* Set the return code to the number of captured strings, or 0 if there are    /* Set the return code to the number of captured strings, or 0 if there were
6449    too many to fit into the vector. */    too many to fit into the vector. */
6450
6451    rc = md->offset_overflow? 0 : md->end_offset_top/2;    rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6452        0 : md->end_offset_top/2;
6453
6454    /* If there is space in the offset vector, set any unused pairs at the end of    /* If there is space in the offset vector, set any unused pairs at the end of
6455    the pattern to -1 for backwards compatibility. It is documented that this    the pattern to -1 for backwards compatibility. It is documented that this
6456    happens. In earlier versions, the whole set of potential capturing offsets    happens. In earlier versions, the whole set of potential capturing offsets
6457    was set to -1 each time round the loop, but this is handled differently now.    was set to -1 each time round the loop, but this is handled differently now.
6458    "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only    "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6459    those at the end that need unsetting here. We can't just unset them all at    those at the end that need unsetting here. We can't just unset them all at
6460    the start of the whole thing because they may get set in one branch that is    the start of the whole thing because they may get set in one branch that is
6461    not the final matching branch. */    not the final matching branch. */

Legend:
 Removed from v.626 changed lines Added in v.713