/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 365 by ph10, Fri Jul 11 17:06:55 2008 UTC revision 399 by ph10, Sat Mar 21 12:34:15 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 561  int oclength; Line 561  int oclength;
561  uschar occhars[8];  uschar occhars[8];
562  #endif  #endif
563    
564    int codelink;
565    int condcode;
566  int ctype;  int ctype;
567  int length;  int length;
568  int max;  int max;
# Line 635  for (;;) Line 637  for (;;)
637    {    {
638    minimize = possessive = FALSE;    minimize = possessive = FALSE;
639    op = *ecode;    op = *ecode;
640    
641    /* For partial matching, remember if we ever hit the end of the subject after    /* For partial matching, remember if we ever hit the end of the subject after
642    matching at least one subject character. */    matching at least one subject character. */
643    
# Line 787  for (;;) Line 789  for (;;)
789    
790      case OP_COND:      case OP_COND:
791      case OP_SCOND:      case OP_SCOND:
792      if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */      codelink= GET(ecode, 1);
793    
794        /* Because of the way auto-callout works during compile, a callout item is
795        inserted between OP_COND and an assertion condition. */
796    
797        if (ecode[LINK_SIZE+1] == OP_CALLOUT)
798          {
799          if (pcre_callout != NULL)
800            {
801            pcre_callout_block cb;
802            cb.version          = 1;   /* Version 1 of the callout block */
803            cb.callout_number   = ecode[LINK_SIZE+2];
804            cb.offset_vector    = md->offset_vector;
805            cb.subject          = (PCRE_SPTR)md->start_subject;
806            cb.subject_length   = md->end_subject - md->start_subject;
807            cb.start_match      = mstart - md->start_subject;
808            cb.current_position = eptr - md->start_subject;
809            cb.pattern_position = GET(ecode, LINK_SIZE + 3);
810            cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
811            cb.capture_top      = offset_top/2;
812            cb.capture_last     = md->capture_last;
813            cb.callout_data     = md->callout_data;
814            if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
815            if (rrc < 0) RRETURN(rrc);
816            }
817          ecode += _pcre_OP_lengths[OP_CALLOUT];
818          }
819    
820        condcode = ecode[LINK_SIZE+1];
821    
822        /* Now see what the actual condition is */
823    
824        if (condcode == OP_RREF)         /* Recursion test */
825        {        {
826        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
827        condition = md->recursive != NULL &&        condition = md->recursive != NULL &&
# Line 795  for (;;) Line 829  for (;;)
829        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
830        }        }
831    
832      else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */      else if (condcode == OP_CREF)    /* Group used test */
833        {        {
834        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
835        condition = offset < offset_top && md->offset_vector[offset] >= 0;        condition = offset < offset_top && md->offset_vector[offset] >= 0;
836        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
837        }        }
838    
839      else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */      else if (condcode == OP_DEF)     /* DEFINE - always false */
840        {        {
841        condition = FALSE;        condition = FALSE;
842        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
# Line 829  for (;;) Line 863  for (;;)
863        else        else
864          {          {
865          condition = FALSE;          condition = FALSE;
866          ecode += GET(ecode, 1);          ecode += codelink;
867          }          }
868        }        }
869    
# Line 852  for (;;) Line 886  for (;;)
886          goto TAIL_RECURSE;          goto TAIL_RECURSE;
887          }          }
888        }        }
889      else                         /* Condition false & no 2nd alternative */      else                         /* Condition false & no alternative */
890        {        {
891        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
892        }        }
# Line 1679  for (;;) Line 1713  for (;;)
1713      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1714      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
1715        {        {
1716        const ucd_record * prop = GET_UCD(c);        const ucd_record *prop = GET_UCD(c);
1717    
1718        switch(ecode[1])        switch(ecode[1])
1719          {          {
# Line 2047  for (;;) Line 2081  for (;;)
2081    
2082    
2083      /* Match an extended character class. This opcode is encountered only      /* Match an extended character class. This opcode is encountered only
2084      in UTF-8 mode, because that's the only time it is compiled. */      when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2085        mode, because Unicode properties are supported in non-UTF-8 mode. */
2086    
2087  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2088      case OP_XCLASS:      case OP_XCLASS:
# Line 2089  for (;;) Line 2124  for (;;)
2124        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
2125          {          {
2126          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2127          GETCHARINC(c, eptr);          GETCHARINCTEST(c, eptr);
2128          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2129          }          }
2130    
# Line 2108  for (;;) Line 2143  for (;;)
2143            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2144            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2145            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2146            GETCHARINC(c, eptr);            GETCHARINCTEST(c, eptr);
2147            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2148            }            }
2149          /* Control never gets here */          /* Control never gets here */
# Line 2123  for (;;) Line 2158  for (;;)
2158            {            {
2159            int len = 1;            int len = 1;
2160            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
2161            GETCHARLEN(c, eptr, len);            GETCHARLENTEST(c, eptr, len);
2162            if (!_pcre_xclass(c, data)) break;            if (!_pcre_xclass(c, data)) break;
2163            eptr += len;            eptr += len;
2164            }            }
# Line 2609  for (;;) Line 2644  for (;;)
2644              {              {
2645              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2646              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2647                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2648              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2649              if (d < 256) d = md->lcc[d];              if (d < 256) d = md->lcc[d];
2650              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
2651                RRETURN(MATCH_NOMATCH);  
2652              }              }
2653            }            }
2654          else          else
# Line 2718  for (;;) Line 2754  for (;;)
2754              {              {
2755              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2756              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2757                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2758              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2759              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
               RRETURN(MATCH_NOMATCH);  
2760              }              }
2761            }            }
2762          else          else
# Line 4532  switch ((((options & PCRE_NEWLINE_BITS) Line 4568  switch ((((options & PCRE_NEWLINE_BITS)
4568          (pcre_uint32)options) & PCRE_NEWLINE_BITS)          (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4569    {    {
4570    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
4571    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4572    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4573    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
4574         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4575    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
4576    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4577    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 4687  for(;;) Line 4723  for(;;)
4723      while (iptr < iend) *iptr++ = -1;      while (iptr < iend) *iptr++ = -1;
4724      }      }
4725    
4726    /* Advance to a unique first char if possible. If firstline is TRUE, the    /* If firstline is TRUE, the start of the match is constrained to the first
4727    start of the match is constrained to the first line of a multiline string.    line of a multiline string. That is, the match must be before or at the first
4728    That is, the match must be before or at the first newline. Implement this by    newline. Implement this by temporarily adjusting end_subject so that we stop
4729    temporarily adjusting end_subject so that we stop scanning at a newline. If    scanning at a newline. If the match fails at the newline, later code breaks
4730    the match fails at the newline, later code breaks this loop. */    this loop. */
4731    
4732    if (firstline)    if (firstline)
4733      {      {
4734      USPTR t = start_match;      USPTR t = start_match;
4735  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4736      if (utf8)      if (utf8)
4737        {        {
4738        while (t < md->end_subject && !IS_NEWLINE(t))        while (t < md->end_subject && !IS_NEWLINE(t))
4739          {          {
4740          t++;          t++;
4741          while (t < end_subject && (*t & 0xc0) == 0x80) t++;          while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4742          }          }
4743        }        }
4744      else      else
4745  #endif  #endif
4746      while (t < md->end_subject && !IS_NEWLINE(t)) t++;      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4747      end_subject = t;      end_subject = t;
4748      }      }
4749    
4750    /* Now advance to a unique first byte if there is one. */    /* There are some optimizations that avoid running the match if a known
4751      starting point is not found, or if a known later character is not present.
4752      However, there is an option that disables these, for testing and for ensuring
4753      that all callouts do actually occur. */
4754    
4755    if (first_byte >= 0)    if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4756      {      {
4757      if (first_byte_caseless)      /* Advance to a unique first byte if there is one. */
       while (start_match < end_subject && md->lcc[*start_match] != first_byte)  
         start_match++;  
     else  
       while (start_match < end_subject && *start_match != first_byte)  
         start_match++;  
     }  
4758    
4759    /* Or to just after a linebreak for a multiline match */      if (first_byte >= 0)
4760          {
4761          if (first_byte_caseless)
4762            while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4763              start_match++;
4764          else
4765            while (start_match < end_subject && *start_match != first_byte)
4766              start_match++;
4767          }
4768    
4769    else if (startline)      /* Or to just after a linebreak for a multiline match */
4770      {  
4771      if (start_match > md->start_subject + start_offset)      else if (startline)
4772        {        {
4773  #ifdef SUPPORT_UTF8        if (start_match > md->start_subject + start_offset)
       if (utf8)  
4774          {          {
4775          while (start_match < end_subject && !WAS_NEWLINE(start_match))  #ifdef SUPPORT_UTF8
4776            if (utf8)
4777            {            {
4778            start_match++;            while (start_match < end_subject && !WAS_NEWLINE(start_match))
4779            while(start_match < end_subject && (*start_match & 0xc0) == 0x80)              {
4780              start_match++;              start_match++;
4781            }              while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4782                  start_match++;
4783                }
4784              }
4785            else
4786    #endif
4787            while (start_match < end_subject && !WAS_NEWLINE(start_match))
4788              start_match++;
4789    
4790            /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4791            and we are now at a LF, advance the match position by one more character.
4792            */
4793    
4794            if (start_match[-1] == CHAR_CR &&
4795                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4796                 start_match < end_subject &&
4797                 *start_match == CHAR_NL)
4798              start_match++;
4799          }          }
       else  
 #endif  
       while (start_match < end_subject && !WAS_NEWLINE(start_match))  
         start_match++;  
   
       /* If we have just passed a CR and the newline option is ANY or ANYCRLF,  
       and we are now at a LF, advance the match position by one more character.  
       */  
   
       if (start_match[-1] == '\r' &&  
            (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&  
            start_match < end_subject &&  
            *start_match == '\n')  
         start_match++;  
4800        }        }
     }  
4801    
4802    /* Or to a non-unique first byte after study */      /* Or to a non-unique first byte after study */
4803    
4804    else if (start_bits != NULL)      else if (start_bits != NULL)
     {  
     while (start_match < end_subject)  
4805        {        {
4806        register unsigned int c = *start_match;        while (start_match < end_subject)
4807        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;          {
4808          else break;          register unsigned int c = *start_match;
4809            if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4810              else break;
4811            }
4812        }        }
4813      }      }   /* Starting optimizations */
4814    
4815    /* Restore fudged end_subject */    /* Restore fudged end_subject */
4816    
# Line 4778  for(;;) Line 4822  for(;;)
4822    printf("\n");    printf("\n");
4823  #endif  #endif
4824    
4825    /* If req_byte is set, we know that that character must appear in the subject    /* If req_byte is set, we know that that character must appear in the
4826    for the match to succeed. If the first character is set, req_byte must be    subject for the match to succeed. If the first character is set, req_byte
4827    later in the subject; otherwise the test starts at the match point. This    must be later in the subject; otherwise the test starts at the match point.
4828    optimization can save a huge amount of backtracking in patterns with nested    This optimization can save a huge amount of backtracking in patterns with
4829    unlimited repeats that aren't going to match. Writing separate code for    nested unlimited repeats that aren't going to match. Writing separate code
4830    cased/caseless versions makes it go faster, as does using an autoincrement    for cased/caseless versions makes it go faster, as does using an
4831    and backing off on a match.    autoincrement and backing off on a match.
4832    
4833    HOWEVER: when the subject string is very, very long, searching to its end can    HOWEVER: when the subject string is very, very long, searching to its end
4834    take a long time, and give bad performance on quite ordinary patterns. This    can take a long time, and give bad performance on quite ordinary patterns.
4835    showed up when somebody was matching something like /^\d+C/ on a 32-megabyte    This showed up when somebody was matching something like /^\d+C/ on a
4836    string... so we don't do this when the string is sufficiently long.    32-megabyte string... so we don't do this when the string is sufficiently
4837      long.
4838    
4839    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, or if
4840    */    disabling is explicitly requested. */
4841    
4842    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4843          req_byte >= 0 &&
4844        end_subject - start_match < REQ_BYTE_MAX &&        end_subject - start_match < REQ_BYTE_MAX &&
4845        !md->partial)        !md->partial)
4846      {      {
# Line 4839  for(;;) Line 4885  for(;;)
4885      }      }
4886    
4887    /* OK, we can now run the match. */    /* OK, we can now run the match. */
4888    
4889    md->start_match_ptr = start_match;    md->start_match_ptr = start_match;
4890    md->match_call_count = 0;    md->match_call_count = 0;
4891    rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);    rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
# Line 4902  for(;;) Line 4948  for(;;)
4948    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
4949    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
4950    
4951    if (start_match[-1] == '\r' &&    if (start_match[-1] == CHAR_CR &&
4952        start_match < end_subject &&        start_match < end_subject &&
4953        *start_match == '\n' &&        *start_match == CHAR_NL &&
4954        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
4955          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
4956           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.365  
changed lines
  Added in v.399

  ViewVC Help
Powered by ViewVC 1.1.5