/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 349 by ph10, Wed Jul 2 18:42:11 2008 UTC revision 399 by ph10, Sat Mar 21 12:34:15 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 158  printf("\n"); Line 158  printf("\n");
158    
159  if (length > md->end_subject - eptr) return FALSE;  if (length > md->end_subject - eptr) return FALSE;
160    
161  /* Separate the caselesss case for speed */  /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162    properly if Unicode properties are supported. Otherwise, we can check only
163    ASCII characters. */
164    
165  if ((ims & PCRE_CASELESS) != 0)  if ((ims & PCRE_CASELESS) != 0)
166    {    {
167    #ifdef SUPPORT_UTF8
168    #ifdef SUPPORT_UCP
169      if (md->utf8)
170        {
171        USPTR endptr = eptr + length;
172        while (eptr < endptr)
173          {
174          int c, d;
175          GETCHARINC(c, eptr);
176          GETCHARINC(d, p);
177          if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178          }
179        }
180      else
181    #endif
182    #endif
183    
184      /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185      is no UCP support. */
186    
187    while (length-- > 0)    while (length-- > 0)
188      if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;      { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189    }    }
190    
191    /* In the caseful case, we can just compare the bytes, whether or not we
192    are in UTF-8 mode. */
193    
194  else  else
195    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
# Line 535  int oclength; Line 561  int oclength;
561  uschar occhars[8];  uschar occhars[8];
562  #endif  #endif
563    
564    int codelink;
565    int condcode;
566  int ctype;  int ctype;
567  int length;  int length;
568  int max;  int max;
# Line 609  for (;;) Line 637  for (;;)
637    {    {
638    minimize = possessive = FALSE;    minimize = possessive = FALSE;
639    op = *ecode;    op = *ecode;
640    
641    /* For partial matching, remember if we ever hit the end of the subject after    /* For partial matching, remember if we ever hit the end of the subject after
642    matching at least one subject character. */    matching at least one subject character. */
643    
# Line 761  for (;;) Line 789  for (;;)
789    
790      case OP_COND:      case OP_COND:
791      case OP_SCOND:      case OP_SCOND:
792      if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */      codelink= GET(ecode, 1);
793    
794        /* Because of the way auto-callout works during compile, a callout item is
795        inserted between OP_COND and an assertion condition. */
796    
797        if (ecode[LINK_SIZE+1] == OP_CALLOUT)
798          {
799          if (pcre_callout != NULL)
800            {
801            pcre_callout_block cb;
802            cb.version          = 1;   /* Version 1 of the callout block */
803            cb.callout_number   = ecode[LINK_SIZE+2];
804            cb.offset_vector    = md->offset_vector;
805            cb.subject          = (PCRE_SPTR)md->start_subject;
806            cb.subject_length   = md->end_subject - md->start_subject;
807            cb.start_match      = mstart - md->start_subject;
808            cb.current_position = eptr - md->start_subject;
809            cb.pattern_position = GET(ecode, LINK_SIZE + 3);
810            cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
811            cb.capture_top      = offset_top/2;
812            cb.capture_last     = md->capture_last;
813            cb.callout_data     = md->callout_data;
814            if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
815            if (rrc < 0) RRETURN(rrc);
816            }
817          ecode += _pcre_OP_lengths[OP_CALLOUT];
818          }
819    
820        condcode = ecode[LINK_SIZE+1];
821    
822        /* Now see what the actual condition is */
823    
824        if (condcode == OP_RREF)         /* Recursion test */
825        {        {
826        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
827        condition = md->recursive != NULL &&        condition = md->recursive != NULL &&
# Line 769  for (;;) Line 829  for (;;)
829        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
830        }        }
831    
832      else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */      else if (condcode == OP_CREF)    /* Group used test */
833        {        {
834        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
835        condition = offset < offset_top && md->offset_vector[offset] >= 0;        condition = offset < offset_top && md->offset_vector[offset] >= 0;
836        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
837        }        }
838    
839      else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */      else if (condcode == OP_DEF)     /* DEFINE - always false */
840        {        {
841        condition = FALSE;        condition = FALSE;
842        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
# Line 803  for (;;) Line 863  for (;;)
863        else        else
864          {          {
865          condition = FALSE;          condition = FALSE;
866          ecode += GET(ecode, 1);          ecode += codelink;
867          }          }
868        }        }
869    
# Line 826  for (;;) Line 886  for (;;)
886          goto TAIL_RECURSE;          goto TAIL_RECURSE;
887          }          }
888        }        }
889      else                         /* Condition false & no 2nd alternative */      else                         /* Condition false & no alternative */
890        {        {
891        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
892        }        }
# Line 1653  for (;;) Line 1713  for (;;)
1713      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1714      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
1715        {        {
1716        const ucd_record * prop = GET_UCD(c);        const ucd_record *prop = GET_UCD(c);
1717    
1718        switch(ecode[1])        switch(ecode[1])
1719          {          {
# Line 1669  for (;;) Line 1729  for (;;)
1729           break;           break;
1730    
1731          case PT_GC:          case PT_GC:
1732          if ((ecode[2] != ucp_gentype[prop->chartype]) == (op == OP_PROP))          if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1733            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1734          break;          break;
1735    
# Line 2021  for (;;) Line 2081  for (;;)
2081    
2082    
2083      /* Match an extended character class. This opcode is encountered only      /* Match an extended character class. This opcode is encountered only
2084      in UTF-8 mode, because that's the only time it is compiled. */      when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2085        mode, because Unicode properties are supported in non-UTF-8 mode. */
2086    
2087  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2088      case OP_XCLASS:      case OP_XCLASS:
# Line 2063  for (;;) Line 2124  for (;;)
2124        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
2125          {          {
2126          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2127          GETCHARINC(c, eptr);          GETCHARINCTEST(c, eptr);
2128          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2129          }          }
2130    
# Line 2082  for (;;) Line 2143  for (;;)
2143            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2144            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2145            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2146            GETCHARINC(c, eptr);            GETCHARINCTEST(c, eptr);
2147            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2148            }            }
2149          /* Control never gets here */          /* Control never gets here */
# Line 2097  for (;;) Line 2158  for (;;)
2158            {            {
2159            int len = 1;            int len = 1;
2160            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
2161            GETCHARLEN(c, eptr, len);            GETCHARLENTEST(c, eptr, len);
2162            if (!_pcre_xclass(c, data)) break;            if (!_pcre_xclass(c, data)) break;
2163            eptr += len;            eptr += len;
2164            }            }
# Line 2583  for (;;) Line 2644  for (;;)
2644              {              {
2645              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2646              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2647                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2648              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2649              if (d < 256) d = md->lcc[d];              if (d < 256) d = md->lcc[d];
2650              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
2651                RRETURN(MATCH_NOMATCH);  
2652              }              }
2653            }            }
2654          else          else
# Line 2692  for (;;) Line 2754  for (;;)
2754              {              {
2755              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2756              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2757                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2758              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2759              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
               RRETURN(MATCH_NOMATCH);  
2760              }              }
2761            }            }
2762          else          else
# Line 4358  Returns:          > 0 => success; value Line 4420  Returns:          > 0 => success; value
4420                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
4421  */  */
4422    
4423  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4424  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4425    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4426    int offsetcount)    int offsetcount)
# Line 4506  switch ((((options & PCRE_NEWLINE_BITS) Line 4568  switch ((((options & PCRE_NEWLINE_BITS)
4568          (pcre_uint32)options) & PCRE_NEWLINE_BITS)          (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4569    {    {
4570    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
4571    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4572    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4573    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
4574         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4575    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
4576    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4577    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 4661  for(;;) Line 4723  for(;;)
4723      while (iptr < iend) *iptr++ = -1;      while (iptr < iend) *iptr++ = -1;
4724      }      }
4725    
4726    /* Advance to a unique first char if possible. If firstline is TRUE, the    /* If firstline is TRUE, the start of the match is constrained to the first
4727    start of the match is constrained to the first line of a multiline string.    line of a multiline string. That is, the match must be before or at the first
4728    That is, the match must be before or at the first newline. Implement this by    newline. Implement this by temporarily adjusting end_subject so that we stop
4729    temporarily adjusting end_subject so that we stop scanning at a newline. If    scanning at a newline. If the match fails at the newline, later code breaks
4730    the match fails at the newline, later code breaks this loop. */    this loop. */
4731    
4732    if (firstline)    if (firstline)
4733      {      {
4734      USPTR t = start_match;      USPTR t = start_match;
4735    #ifdef SUPPORT_UTF8
4736        if (utf8)
4737          {
4738          while (t < md->end_subject && !IS_NEWLINE(t))
4739            {
4740            t++;
4741            while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4742            }
4743          }
4744        else
4745    #endif
4746      while (t < md->end_subject && !IS_NEWLINE(t)) t++;      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4747      end_subject = t;      end_subject = t;
4748      }      }
4749    
4750    /* Now test for a unique first byte */    /* There are some optimizations that avoid running the match if a known
4751      starting point is not found, or if a known later character is not present.
4752      However, there is an option that disables these, for testing and for ensuring
4753      that all callouts do actually occur. */
4754    
4755    if (first_byte >= 0)    if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4756      {      {
4757      if (first_byte_caseless)      /* Advance to a unique first byte if there is one. */
4758        while (start_match < end_subject &&  
4759               md->lcc[*start_match] != first_byte)      if (first_byte >= 0)
4760          { NEXTCHAR(start_match); }        {
4761      else        if (first_byte_caseless)
4762        while (start_match < end_subject && *start_match != first_byte)          while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4763          { NEXTCHAR(start_match); }            start_match++;
4764      }        else
4765            while (start_match < end_subject && *start_match != first_byte)
4766              start_match++;
4767          }
4768    
4769    /* Or to just after a linebreak for a multiline match if possible */      /* Or to just after a linebreak for a multiline match */
4770    
4771    else if (startline)      else if (startline)
     {  
     if (start_match > md->start_subject + start_offset)  
4772        {        {
4773        while (start_match <= end_subject && !WAS_NEWLINE(start_match))        if (start_match > md->start_subject + start_offset)
4774          { NEXTCHAR(start_match); }          {
4775    #ifdef SUPPORT_UTF8
4776            if (utf8)
4777              {
4778              while (start_match < end_subject && !WAS_NEWLINE(start_match))
4779                {
4780                start_match++;
4781                while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4782                  start_match++;
4783                }
4784              }
4785            else
4786    #endif
4787            while (start_match < end_subject && !WAS_NEWLINE(start_match))
4788              start_match++;
4789    
4790            /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4791            and we are now at a LF, advance the match position by one more character.
4792            */
4793    
4794        /* If we have just passed a CR and the newline option is ANY or ANYCRLF,          if (start_match[-1] == CHAR_CR &&
4795        and we are now at a LF, advance the match position by one more character.               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4796        */               start_match < end_subject &&
4797                 *start_match == CHAR_NL)
4798        if (start_match[-1] == '\r' &&            start_match++;
4799             (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&          }
            start_match < end_subject &&  
            *start_match == '\n')  
         start_match++;  
4800        }        }
     }  
4801    
4802    /* Or to a non-unique first char after study */      /* Or to a non-unique first byte after study */
4803    
4804    else if (start_bits != NULL)      else if (start_bits != NULL)
     {  
     while (start_match < end_subject)  
4805        {        {
4806        register unsigned int c = *start_match;        while (start_match < end_subject)
4807        if ((start_bits[c/8] & (1 << (c&7))) == 0)          {
4808          { NEXTCHAR(start_match); }          register unsigned int c = *start_match;
4809        else break;          if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4810              else break;
4811            }
4812        }        }
4813      }      }   /* Starting optimizations */
4814    
4815    /* Restore fudged end_subject */    /* Restore fudged end_subject */
4816    
# Line 4731  for(;;) Line 4822  for(;;)
4822    printf("\n");    printf("\n");
4823  #endif  #endif
4824    
4825    /* If req_byte is set, we know that that character must appear in the subject    /* If req_byte is set, we know that that character must appear in the
4826    for the match to succeed. If the first character is set, req_byte must be    subject for the match to succeed. If the first character is set, req_byte
4827    later in the subject; otherwise the test starts at the match point. This    must be later in the subject; otherwise the test starts at the match point.
4828    optimization can save a huge amount of backtracking in patterns with nested    This optimization can save a huge amount of backtracking in patterns with
4829    unlimited repeats that aren't going to match. Writing separate code for    nested unlimited repeats that aren't going to match. Writing separate code
4830    cased/caseless versions makes it go faster, as does using an autoincrement    for cased/caseless versions makes it go faster, as does using an
4831    and backing off on a match.    autoincrement and backing off on a match.
4832    
4833    HOWEVER: when the subject string is very, very long, searching to its end can    HOWEVER: when the subject string is very, very long, searching to its end
4834    take a long time, and give bad performance on quite ordinary patterns. This    can take a long time, and give bad performance on quite ordinary patterns.
4835    showed up when somebody was matching something like /^\d+C/ on a 32-megabyte    This showed up when somebody was matching something like /^\d+C/ on a
4836    string... so we don't do this when the string is sufficiently long.    32-megabyte string... so we don't do this when the string is sufficiently
4837      long.
4838    
4839    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, or if
4840    */    disabling is explicitly requested. */
4841    
4842    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4843          req_byte >= 0 &&
4844        end_subject - start_match < REQ_BYTE_MAX &&        end_subject - start_match < REQ_BYTE_MAX &&
4845        !md->partial)        !md->partial)
4846      {      {
# Line 4855  for(;;) Line 4948  for(;;)
4948    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
4949    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
4950    
4951    if (start_match[-1] == '\r' &&    if (start_match[-1] == CHAR_CR &&
4952        start_match < end_subject &&        start_match < end_subject &&
4953        *start_match == '\n' &&        *start_match == CHAR_NL &&
4954        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
4955          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
4956           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.349  
changed lines
  Added in v.399

  ViewVC Help
Powered by ViewVC 1.1.5