/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 381 by ph10, Tue Mar 3 16:08:23 2009 UTC revision 399 by ph10, Sat Mar 21 12:34:15 2009 UTC
# Line 561  int oclength; Line 561  int oclength;
561  uschar occhars[8];  uschar occhars[8];
562  #endif  #endif
563    
564    int codelink;
565    int condcode;
566  int ctype;  int ctype;
567  int length;  int length;
568  int max;  int max;
# Line 635  for (;;) Line 637  for (;;)
637    {    {
638    minimize = possessive = FALSE;    minimize = possessive = FALSE;
639    op = *ecode;    op = *ecode;
640    
641    /* For partial matching, remember if we ever hit the end of the subject after    /* For partial matching, remember if we ever hit the end of the subject after
642    matching at least one subject character. */    matching at least one subject character. */
643    
# Line 787  for (;;) Line 789  for (;;)
789    
790      case OP_COND:      case OP_COND:
791      case OP_SCOND:      case OP_SCOND:
792        codelink= GET(ecode, 1);
793    
794      /* Because of the way auto-callout works during compile, a callout item is      /* Because of the way auto-callout works during compile, a callout item is
795      inserted between OP_COND and an assertion condition. */      inserted between OP_COND and an assertion condition. */
796    
797      if (ecode[LINK_SIZE+1] == OP_CALLOUT)      if (ecode[LINK_SIZE+1] == OP_CALLOUT)
798        {        {
799        if (pcre_callout != NULL)        if (pcre_callout != NULL)
# Line 812  for (;;) Line 816  for (;;)
816          }          }
817        ecode += _pcre_OP_lengths[OP_CALLOUT];        ecode += _pcre_OP_lengths[OP_CALLOUT];
818        }        }
819    
820        condcode = ecode[LINK_SIZE+1];
821    
822      /* Now see what the actual condition is */      /* Now see what the actual condition is */
823    
824      if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */      if (condcode == OP_RREF)         /* Recursion test */
825        {        {
826        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
827        condition = md->recursive != NULL &&        condition = md->recursive != NULL &&
# Line 823  for (;;) Line 829  for (;;)
829        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
830        }        }
831    
832      else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */      else if (condcode == OP_CREF)    /* Group used test */
833        {        {
834        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
835        condition = offset < offset_top && md->offset_vector[offset] >= 0;        condition = offset < offset_top && md->offset_vector[offset] >= 0;
836        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
837        }        }
838    
839      else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */      else if (condcode == OP_DEF)     /* DEFINE - always false */
840        {        {
841        condition = FALSE;        condition = FALSE;
842        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
# Line 857  for (;;) Line 863  for (;;)
863        else        else
864          {          {
865          condition = FALSE;          condition = FALSE;
866          ecode += GET(ecode, 1);          ecode += codelink;
867          }          }
868        }        }
869    
# Line 880  for (;;) Line 886  for (;;)
886          goto TAIL_RECURSE;          goto TAIL_RECURSE;
887          }          }
888        }        }
889      else                         /* Condition false & no 2nd alternative */      else                         /* Condition false & no alternative */
890        {        {
891        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
892        }        }
# Line 1707  for (;;) Line 1713  for (;;)
1713      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1714      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
1715        {        {
1716        const ucd_record * prop = GET_UCD(c);        const ucd_record *prop = GET_UCD(c);
1717    
1718        switch(ecode[1])        switch(ecode[1])
1719          {          {
# Line 2075  for (;;) Line 2081  for (;;)
2081    
2082    
2083      /* Match an extended character class. This opcode is encountered only      /* Match an extended character class. This opcode is encountered only
2084      in UTF-8 mode, because that's the only time it is compiled. */      when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8
2085        mode, because Unicode properties are supported in non-UTF-8 mode. */
2086    
2087  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2088      case OP_XCLASS:      case OP_XCLASS:
# Line 2117  for (;;) Line 2124  for (;;)
2124        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
2125          {          {
2126          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2127          GETCHARINC(c, eptr);          GETCHARINCTEST(c, eptr);
2128          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2129          }          }
2130    
# Line 2136  for (;;) Line 2143  for (;;)
2143            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2144            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2145            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2146            GETCHARINC(c, eptr);            GETCHARINCTEST(c, eptr);
2147            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2148            }            }
2149          /* Control never gets here */          /* Control never gets here */
# Line 2151  for (;;) Line 2158  for (;;)
2158            {            {
2159            int len = 1;            int len = 1;
2160            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
2161            GETCHARLEN(c, eptr, len);            GETCHARLENTEST(c, eptr, len);
2162            if (!_pcre_xclass(c, data)) break;            if (!_pcre_xclass(c, data)) break;
2163            eptr += len;            eptr += len;
2164            }            }
# Line 4561  switch ((((options & PCRE_NEWLINE_BITS) Line 4568  switch ((((options & PCRE_NEWLINE_BITS)
4568          (pcre_uint32)options) & PCRE_NEWLINE_BITS)          (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4569    {    {
4570    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
4571    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4572    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4573    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
4574         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4575    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
4576    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4577    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 4716  for(;;) Line 4723  for(;;)
4723      while (iptr < iend) *iptr++ = -1;      while (iptr < iend) *iptr++ = -1;
4724      }      }
4725    
4726    /* Advance to a unique first char if possible. If firstline is TRUE, the    /* If firstline is TRUE, the start of the match is constrained to the first
4727    start of the match is constrained to the first line of a multiline string.    line of a multiline string. That is, the match must be before or at the first
4728    That is, the match must be before or at the first newline. Implement this by    newline. Implement this by temporarily adjusting end_subject so that we stop
4729    temporarily adjusting end_subject so that we stop scanning at a newline. If    scanning at a newline. If the match fails at the newline, later code breaks
4730    the match fails at the newline, later code breaks this loop. */    this loop. */
4731    
4732    if (firstline)    if (firstline)
4733      {      {
# Line 4740  for(;;) Line 4747  for(;;)
4747      end_subject = t;      end_subject = t;
4748      }      }
4749    
4750    /* Now advance to a unique first byte if there is one. */    /* There are some optimizations that avoid running the match if a known
4751      starting point is not found, or if a known later character is not present.
4752      However, there is an option that disables these, for testing and for ensuring
4753      that all callouts do actually occur. */
4754    
4755    if (first_byte >= 0)    if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4756      {      {
4757      if (first_byte_caseless)      /* Advance to a unique first byte if there is one. */
4758        while (start_match < end_subject && md->lcc[*start_match] != first_byte)  
4759          start_match++;      if (first_byte >= 0)
4760      else        {
4761        while (start_match < end_subject && *start_match != first_byte)        if (first_byte_caseless)
4762          start_match++;          while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4763      }            start_match++;
4764          else
4765            while (start_match < end_subject && *start_match != first_byte)
4766              start_match++;
4767          }
4768    
4769    /* Or to just after a linebreak for a multiline match */      /* Or to just after a linebreak for a multiline match */
4770    
4771    else if (startline)      else if (startline)
     {  
     if (start_match > md->start_subject + start_offset)  
4772        {        {
4773  #ifdef SUPPORT_UTF8        if (start_match > md->start_subject + start_offset)
       if (utf8)  
4774          {          {
4775          while (start_match < end_subject && !WAS_NEWLINE(start_match))  #ifdef SUPPORT_UTF8
4776            if (utf8)
4777            {            {
4778            start_match++;            while (start_match < end_subject && !WAS_NEWLINE(start_match))
4779            while(start_match < end_subject && (*start_match & 0xc0) == 0x80)              {
4780              start_match++;              start_match++;
4781                while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4782                  start_match++;
4783                }
4784            }            }
4785          }          else
       else  
4786  #endif  #endif
4787        while (start_match < end_subject && !WAS_NEWLINE(start_match))          while (start_match < end_subject && !WAS_NEWLINE(start_match))
4788          start_match++;            start_match++;
4789    
4790        /* If we have just passed a CR and the newline option is ANY or ANYCRLF,          /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4791        and we are now at a LF, advance the match position by one more character.          and we are now at a LF, advance the match position by one more character.
4792        */          */
4793    
4794        if (start_match[-1] == '\r' &&          if (start_match[-1] == CHAR_CR &&
4795             (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4796             start_match < end_subject &&               start_match < end_subject &&
4797             *start_match == '\n')               *start_match == CHAR_NL)
4798          start_match++;            start_match++;
4799            }
4800        }        }
     }  
4801    
4802    /* Or to a non-unique first byte after study */      /* Or to a non-unique first byte after study */
4803    
4804    else if (start_bits != NULL)      else if (start_bits != NULL)
     {  
     while (start_match < end_subject)  
4805        {        {
4806        register unsigned int c = *start_match;        while (start_match < end_subject)
4807        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;          {
4808          else break;          register unsigned int c = *start_match;
4809            if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4810              else break;
4811            }
4812        }        }
4813      }      }   /* Starting optimizations */
4814    
4815    /* Restore fudged end_subject */    /* Restore fudged end_subject */
4816    
# Line 4807  for(;;) Line 4822  for(;;)
4822    printf("\n");    printf("\n");
4823  #endif  #endif
4824    
4825    /* If req_byte is set, we know that that character must appear in the subject    /* If req_byte is set, we know that that character must appear in the
4826    for the match to succeed. If the first character is set, req_byte must be    subject for the match to succeed. If the first character is set, req_byte
4827    later in the subject; otherwise the test starts at the match point. This    must be later in the subject; otherwise the test starts at the match point.
4828    optimization can save a huge amount of backtracking in patterns with nested    This optimization can save a huge amount of backtracking in patterns with
4829    unlimited repeats that aren't going to match. Writing separate code for    nested unlimited repeats that aren't going to match. Writing separate code
4830    cased/caseless versions makes it go faster, as does using an autoincrement    for cased/caseless versions makes it go faster, as does using an
4831    and backing off on a match.    autoincrement and backing off on a match.
4832    
4833    HOWEVER: when the subject string is very, very long, searching to its end can    HOWEVER: when the subject string is very, very long, searching to its end
4834    take a long time, and give bad performance on quite ordinary patterns. This    can take a long time, and give bad performance on quite ordinary patterns.
4835    showed up when somebody was matching something like /^\d+C/ on a 32-megabyte    This showed up when somebody was matching something like /^\d+C/ on a
4836    string... so we don't do this when the string is sufficiently long.    32-megabyte string... so we don't do this when the string is sufficiently
4837      long.
4838    
4839    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, or if
4840    */    disabling is explicitly requested. */
4841    
4842    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4843          req_byte >= 0 &&
4844        end_subject - start_match < REQ_BYTE_MAX &&        end_subject - start_match < REQ_BYTE_MAX &&
4845        !md->partial)        !md->partial)
4846      {      {
# Line 4931  for(;;) Line 4948  for(;;)
4948    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
4949    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
4950    
4951    if (start_match[-1] == '\r' &&    if (start_match[-1] == CHAR_CR &&
4952        start_match < end_subject &&        start_match < end_subject &&
4953        *start_match == '\n' &&        *start_match == CHAR_NL &&
4954        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
4955          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
4956           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.381  
changed lines
  Added in v.399

  ViewVC Help
Powered by ViewVC 1.1.5