/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 645 by ph10, Sun Jul 31 17:02:18 2011 UTC revision 702 by ph10, Tue Sep 20 15:45:06 2011 UTC
# Line 870  for (;;) Line 870  for (;;)
870      /* VVVVVVVVVVVVVVVVVVVVVVVVV */      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871    
872      /* Non-capturing or atomic group, except for possessive with unlimited      /* Non-capturing or atomic group, except for possessive with unlimited
873      repeat. Loop for all the alternatives. When we get to the final alternative      repeat. Loop for all the alternatives.
874      within the brackets, we used to return the result of a recursive call to  
875      match() whatever happened so it was possible to reduce stack usage by      When we get to the final alternative within the brackets, we used to return
876      turning this into a tail recursion, except in the case of a possibly empty      the result of a recursive call to match() whatever happened so it was
877      group. However, now that there is the possibility of (*THEN) occurring in      possible to reduce stack usage by turning this into a tail recursion,
878      the final alternative, this optimization is no longer possible.      except in the case of a possibly empty group. However, now that there is
879        the possibility of (*THEN) occurring in the final alternative, this
880        optimization is no longer always possible.
881    
882        We can optimize if we know there are no (*THEN)s in the pattern; at present
883        this is the best that can be done.
884    
885      MATCH_ONCE is returned when the end of an atomic group is successfully      MATCH_ONCE is returned when the end of an atomic group is successfully
886      reached, but subsequent matching fails. It passes back up the tree (causing      reached, but subsequent matching fails. It passes back up the tree (causing
# Line 892  for (;;) Line 897  for (;;)
897      for (;;)      for (;;)
898        {        {
899        if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;        if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
900        RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,  
901          /* If this is not a possibly empty group, and there are no (*THEN)s in
902          the pattern, and this is the final alternative, optimize as described
903          above. */
904    
905          else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
906            {
907            ecode += _pcre_OP_lengths[*ecode];
908            goto TAIL_RECURSE;
909            }
910    
911          /* In all other cases, we have to make another call to match(). */
912    
913          RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
914          RM2);          RM2);
915        if (rrc != MATCH_NOMATCH &&        if (rrc != MATCH_NOMATCH &&
916            (rrc != MATCH_THEN || md->start_match_ptr != ecode))            (rrc != MATCH_THEN || md->start_match_ptr != ecode))
# Line 1082  for (;;) Line 1100  for (;;)
1100          cb.capture_top      = offset_top/2;          cb.capture_top      = offset_top/2;
1101          cb.capture_last     = md->capture_last;          cb.capture_last     = md->capture_last;
1102          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
1103          cb.mark             = markptr;          cb.mark             = markptr;
1104          if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);          if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1105          if (rrc < 0) RRETURN(rrc);          if (rrc < 0) RRETURN(rrc);
1106          }          }
# Line 1264  for (;;) Line 1282  for (;;)
1282        }        }
1283    
1284      /* We are now at the branch that is to be obeyed. As there is only one,      /* We are now at the branch that is to be obeyed. As there is only one,
1285      we used to use tail recursion to avoid using another stack frame, except      we used always to use tail recursion to avoid using another stack frame,
1286      when there was unlimited repeat of a possibly empty group. However, that      except when there was unlimited repeat of a possibly empty group. However,
1287      strategy no longer works because of the possibility of (*THEN) being      that strategy no longer works because of the possibility of (*THEN) being
1288      encountered in the branch. A recursive call to match() is always required,      encountered in the branch. However, we can still use tail recursion if
1289      unless the second alternative doesn't exist, in which case we can just      there are no (*THEN)s in the pattern. Otherwise, a recursive call to
1290      plough on. */      match() is always required, unless the second alternative doesn't exist, in
1291        which case we can just plough on. */
1292    
1293      if (condition || *ecode == OP_ALT)      if (condition || *ecode == OP_ALT)
1294        {        {
1295        if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;        if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1296          else if (!md->hasthen)
1297            {
1298            ecode += 1 + LINK_SIZE;
1299            goto TAIL_RECURSE;
1300            }
1301    
1302          /* A call to match() is required. */
1303    
1304        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1305        if (rrc == MATCH_THEN && md->start_match_ptr == ecode)  
1306          rrc = MATCH_NOMATCH;        /* If the result is THEN from within the "true" branch of the condition,
1307          md->start_match_ptr will point to the original OP_COND, not to the start
1308          of the branch, so we have to do work to see if it matches. If THEN comes
1309          from the "false" branch, md->start_match_ptr does point to OP_ALT. */
1310    
1311          if (rrc == MATCH_THEN)
1312            {
1313            if (*ecode != OP_ALT)
1314              {
1315              do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1316              ecode -= GET(ecode, 1);
1317              }
1318            if (md->start_match_ptr == ecode) rrc = MATCH_NOMATCH;
1319            }
1320        RRETURN(rrc);        RRETURN(rrc);
1321        }        }
1322      else                         /* Condition false & no alternative */  
1323         /* Condition false & no alternative; continue after the group. */
1324    
1325        else
1326        {        {
1327        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1328        }        }
# Line 1477  for (;;) Line 1520  for (;;)
1520        cb.capture_top      = offset_top/2;        cb.capture_top      = offset_top/2;
1521        cb.capture_last     = md->capture_last;        cb.capture_last     = md->capture_last;
1522        cb.callout_data     = md->callout_data;        cb.callout_data     = md->callout_data;
1523        cb.mark             = markptr;        cb.mark             = markptr;
1524        if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);        if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1525        if (rrc < 0) RRETURN(rrc);        if (rrc < 0) RRETURN(rrc);
1526        }        }
# Line 1505  for (;;) Line 1548  for (;;)
1548        {        {
1549        recursion_info *ri;        recursion_info *ri;
1550        int recno;        int recno;
1551    
1552        callpat = md->start_code + GET(ecode, 1);        callpat = md->start_code + GET(ecode, 1);
1553        recno = (callpat == md->start_code)? 0 :        recno = (callpat == md->start_code)? 0 :
1554          GET2(callpat, 1 + LINK_SIZE);          GET2(callpat, 1 + LINK_SIZE);
1555    
1556        /* Check for repeating a recursion without advancing the subject pointer.        /* Check for repeating a recursion without advancing the subject pointer.
1557        This should catch convoluted mutual recursions. (Some simple cases are        This should catch convoluted mutual recursions. (Some simple cases are
1558        caught at compile time.) */        caught at compile time.) */
1559    
1560        for (ri = md->recursive; ri != NULL; ri = ri->prevrec)        for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1561          if (recno == ri->group_num && eptr == ri->subject_position)          if (recno == ri->group_num && eptr == ri->subject_position)
1562            RRETURN(PCRE_ERROR_RECURSELOOP);            RRETURN(PCRE_ERROR_RECURSELOOP);
1563    
1564        /* Add to "recursing stack" */        /* Add to "recursing stack" */
# Line 1556  for (;;) Line 1599  for (;;)
1599            md, eptrb, RM6);            md, eptrb, RM6);
1600          memcpy(md->offset_vector, new_recursive.offset_save,          memcpy(md->offset_vector, new_recursive.offset_save,
1601              new_recursive.saved_max * sizeof(int));              new_recursive.saved_max * sizeof(int));
1602            md->recursive = new_recursive.prevrec;
1603          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1604            {            {
1605            DPRINTF(("Recursion matched\n"));            DPRINTF(("Recursion matched\n"));
           md->recursive = new_recursive.prevrec;  
1606            if (new_recursive.offset_save != stacksave)            if (new_recursive.offset_save != stacksave)
1607              (pcre_free)(new_recursive.offset_save);              (pcre_free)(new_recursive.offset_save);
1608    
# Line 2014  for (;;) Line 2057  for (;;)
2057      /* Fall through */      /* Fall through */
2058    
2059      case OP_ALLANY:      case OP_ALLANY:
2060      if (eptr++ >= md->end_subject)      if (eptr >= md->end_subject)   /* DO NOT merge the eptr++ here; it must */
2061        {        {                            /* not be updated before SCHECK_PARTIAL. */
2062        SCHECK_PARTIAL();        SCHECK_PARTIAL();
2063        MRRETURN(MATCH_NOMATCH);        MRRETURN(MATCH_NOMATCH);
2064        }        }
2065        eptr++;
2066      if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;      if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2067      ecode++;      ecode++;
2068      break;      break;
# Line 2027  for (;;) Line 2071  for (;;)
2071      any byte, even newline, independent of the setting of PCRE_DOTALL. */      any byte, even newline, independent of the setting of PCRE_DOTALL. */
2072    
2073      case OP_ANYBYTE:      case OP_ANYBYTE:
2074      if (eptr++ >= md->end_subject)      if (eptr >= md->end_subject)   /* DO NOT merge the eptr++ here; it must */
2075        {        {                            /* not be updated before SCHECK_PARTIAL. */
2076        SCHECK_PARTIAL();        SCHECK_PARTIAL();
2077        MRRETURN(MATCH_NOMATCH);        MRRETURN(MATCH_NOMATCH);
2078        }        }
2079        eptr++;
2080      ecode++;      ecode++;
2081      break;      break;
2082    
# Line 5180  for (;;) Line 5225  for (;;)
5225                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5226                }                }
5227              }              }
5228            else eptr = md->end_subject;   /* Unlimited UTF-8 repeat */            else
5229                {
5230                eptr = md->end_subject;   /* Unlimited UTF-8 repeat */
5231                SCHECK_PARTIAL();
5232                }
5233            break;            break;
5234    
5235            /* The byte case is the same as non-UTF8 */            /* The byte case is the same as non-UTF8 */
# Line 5755  pcre_exec(const pcre *argument_re, const Line 5804  pcre_exec(const pcre *argument_re, const
5804    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5805    int offsetcount)    int offsetcount)
5806  {  {
5807  int rc, ocount;  int rc, ocount, arg_offset_max;
5808  int first_byte = -1;  int first_byte = -1;
5809  int req_byte = -1;  int req_byte = -1;
5810  int req_byte2 = -1;  int req_byte2 = -1;
# Line 5791  if (re == NULL || subject == NULL || Line 5840  if (re == NULL || subject == NULL ||
5840  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5841  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5842    
5843  /* This information is for finding all the numbers associated with a given  /* These two settings are used in the code for checking a UTF-8 string that
5844  name, for condition testing. */  follows immediately afterwards. Other values in the md block are used only
5845    during "normal" pcre_exec() processing, not when the JIT support is in use,
5846    so they are set up later. */
5847    
5848    utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5849    md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5850                  ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5851    
5852    /* Check a UTF-8 string if required. Pass back the character offset and error
5853    code for an invalid string if a results vector is available. */
5854    
5855    #ifdef SUPPORT_UTF8
5856    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5857      {
5858      int erroroffset;
5859      int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5860      if (errorcode != 0)
5861        {
5862        if (offsetcount >= 2)
5863          {
5864          offsets[0] = erroroffset;
5865          offsets[1] = errorcode;
5866          }
5867        return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5868          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5869        }
5870    
5871      /* Check that a start_offset points to the start of a UTF-8 character. */
5872      if (start_offset > 0 && start_offset < length &&
5873          (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5874        return PCRE_ERROR_BADUTF8_OFFSET;
5875      }
5876    #endif
5877    
5878    /* If the pattern was successfully studied with JIT support, run the JIT
5879    executable instead of the rest of this function. Most options must be set at
5880    compile time for the JIT code to be usable. Fall back to the normal code path if
5881    an unsupported flag is set. In particular, JIT does not support partial
5882    matching. */
5883    
5884    #ifdef SUPPORT_JIT
5885    if (extra_data != NULL
5886        && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
5887        && extra_data->executable_jit != NULL
5888        && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
5889                        PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
5890      return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
5891        start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
5892        ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
5893    #endif
5894    
5895    /* Carry on with non-JIT matching. This information is for finding all the
5896    numbers associated with a given name, for condition testing. */
5897    
5898  md->name_table = (uschar *)re + re->name_table_offset;  md->name_table = (uschar *)re + re->name_table_offset;
5899  md->name_count = re->name_count;  md->name_count = re->name_count;
# Line 5859  md->end_subject = md->start_subject + le Line 5960  md->end_subject = md->start_subject + le
5960  end_subject = md->end_subject;  end_subject = md->end_subject;
5961    
5962  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;  
5963  md->use_ucp = (re->options & PCRE_UCP) != 0;  md->use_ucp = (re->options & PCRE_UCP) != 0;
5964  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;  md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5965    
# Line 5870  md->notbol = (options & PCRE_NOTBOL) != Line 5970  md->notbol = (options & PCRE_NOTBOL) !=
5970  md->noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
5971  md->notempty = (options & PCRE_NOTEMPTY) != 0;  md->notempty = (options & PCRE_NOTEMPTY) != 0;
5972  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;  md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :  
               ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;  
   
5973    
5974  md->hitend = FALSE;  md->hitend = FALSE;
5975  md->mark = NULL;                        /* In case never set */  md->mark = NULL;                        /* In case never set */
5976    
5977  md->recursive = NULL;                   /* No recursion at top level */  md->recursive = NULL;                   /* No recursion at top level */
5978    md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
5979    
5980  md->lcc = tables + lcc_offset;  md->lcc = tables + lcc_offset;
5981  md->ctypes = tables + ctypes_offset;  md->ctypes = tables + ctypes_offset;
# Line 5955  defined (though never set). So there's n Line 6053  defined (though never set). So there's n
6053  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6054    return PCRE_ERROR_BADPARTIAL;    return PCRE_ERROR_BADPARTIAL;
6055    
 /* Check a UTF-8 string if required. Pass back the character offset and error  
 code for an invalid string if a results vector is available. */  
   
 #ifdef SUPPORT_UTF8  
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  
   {  
   int erroroffset;  
   int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);  
   if (errorcode != 0)  
     {  
     if (offsetcount >= 2)  
       {  
       offsets[0] = erroroffset;  
       offsets[1] = errorcode;  
       }  
     return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?  
       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;  
     }  
   
   /* Check that a start_offset points to the start of a UTF-8 character. */  
   
   if (start_offset > 0 && start_offset < length &&  
       (((USPTR)subject)[start_offset] & 0xc0) == 0x80)  
     return PCRE_ERROR_BADUTF8_OFFSET;  
   }  
 #endif  
   
6056  /* If the expression has got more back references than the offsets supplied can  /* If the expression has got more back references than the offsets supplied can
6057  hold, we get a temporary chunk of working store to use during the matching.  hold, we get a temporary chunk of working store to use during the matching.
6058  Otherwise, we can use the vector supplied, rounding down its size to a multiple  Otherwise, we can use the vector supplied, rounding down its size to a multiple
6059  of 3. */  of 3. */
6060    
6061  ocount = offsetcount - (offsetcount % 3);  ocount = offsetcount - (offsetcount % 3);
6062    arg_offset_max = (2*ocount)/3;
6063    
6064  if (re->top_backref > 0 && re->top_backref >= ocount/3)  if (re->top_backref > 0 && re->top_backref >= ocount/3)
6065    {    {
# Line 6362  if (rc == MATCH_MATCH || rc == MATCH_ACC Line 6434  if (rc == MATCH_MATCH || rc == MATCH_ACC
6434    {    {
6435    if (using_temporary_offsets)    if (using_temporary_offsets)
6436      {      {
6437      if (offsetcount >= 4)      if (arg_offset_max >= 4)
6438        {        {
6439        memcpy(offsets + 2, md->offset_vector + 2,        memcpy(offsets + 2, md->offset_vector + 2,
6440          (offsetcount - 2) * sizeof(int));          (arg_offset_max - 2) * sizeof(int));
6441        DPRINTF(("Copied offsets from temporary memory\n"));        DPRINTF(("Copied offsets from temporary memory\n"));
6442        }        }
6443      if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;      if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6444      DPRINTF(("Freeing temporary memory\n"));      DPRINTF(("Freeing temporary memory\n"));
6445      (pcre_free)(md->offset_vector);      (pcre_free)(md->offset_vector);
6446      }      }
6447    
6448    /* Set the return code to the number of captured strings, or 0 if there are    /* Set the return code to the number of captured strings, or 0 if there were
6449    too many to fit into the vector. */    too many to fit into the vector. */
6450    
6451    rc = md->offset_overflow? 0 : md->end_offset_top/2;    rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6452        0 : md->end_offset_top/2;
6453    
6454    /* If there is space in the offset vector, set any unused pairs at the end of    /* If there is space in the offset vector, set any unused pairs at the end of
6455    the pattern to -1 for backwards compatibility. It is documented that this    the pattern to -1 for backwards compatibility. It is documented that this
6456    happens. In earlier versions, the whole set of potential capturing offsets    happens. In earlier versions, the whole set of potential capturing offsets
6457    was set to -1 each time round the loop, but this is handled differently now.    was set to -1 each time round the loop, but this is handled differently now.
6458    "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only    "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6459    those at the end that need unsetting here. We can't just unset them all at    those at the end that need unsetting here. We can't just unset them all at
6460    the start of the whole thing because they may get set in one branch that is    the start of the whole thing because they may get set in one branch that is
6461    not the final matching branch. */    not the final matching branch. */

Legend:
Removed from v.645  
changed lines
  Added in v.702

  ViewVC Help
Powered by ViewVC 1.1.5