/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 231 by ph10, Tue Sep 11 11:15:33 2007 UTC | revision 342 by ph10, Sun Apr 20 17:10:13 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 43  pattern matching using an NFA algorithm, Line 43  pattern matching using an NFA algorithm,
43  possible. There are also some static supporting functions. */  possible. There are also some static supporting functions. */
44    
45  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
46  #include <config.h>  #include "config.h"
47  #endif  #endif
48    
49  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
# Line 1148  for (;;) Line 1148  for (;;)
1148      do ecode += GET(ecode,1); while (*ecode == OP_ALT);      do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149      break;      break;
1150    
1151      /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating      /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1152      that it may occur zero times. It may repeat infinitely, or not at all -      indicating that it may occur zero times. It may repeat infinitely, or not
1153      i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper      at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1154      repeat limits are compiled as a number of copies, with the optional ones      with fixed upper repeat limits are compiled as a number of copies, with the
1155      preceded by BRAZERO or BRAMINZERO. */      optional ones preceded by BRAZERO or BRAMINZERO. */
1156    
1157      case OP_BRAZERO:      case OP_BRAZERO:
1158        {        {
# Line 1174  for (;;) Line 1174  for (;;)
1174        }        }
1175      break;      break;
1176    
1177        case OP_SKIPZERO:
1178          {
1179          next = ecode+1;
1180          do next += GET(next,1); while (*next == OP_ALT);
1181          ecode = next + 1 + LINK_SIZE;
1182          }
1183        break;
1184    
1185      /* End of a group, repeated or non-repeating. */      /* End of a group, repeated or non-repeating. */
1186    
1187      case OP_KET:      case OP_KET:
# Line 1421  for (;;) Line 1429  for (;;)
1429      /* Match a single character type; inline for speed */      /* Match a single character type; inline for speed */
1430    
1431      case OP_ANY:      case OP_ANY:
1432      if ((ims & PCRE_DOTALL) == 0)      if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1433        {      /* Fall through */
1434        if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);  
1435        }      case OP_ALLANY:
1436      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1437      if (utf8)      if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
1438      ecode++;      ecode++;
1439      break;      break;
1440    
# Line 1723  for (;;) Line 1730  for (;;)
1730      case OP_REF:      case OP_REF:
1731        {        {
1732        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
1733        ecode += 3;                                 /* Advance past item */        ecode += 3;
1734    
1735        /* If the reference is unset, set the length to be longer than the amount        /* If the reference is unset, there are two possibilities:
1736        of subject left; this ensures that every attempt at a match fails. We  
1737        can't just fail here, because of the possibility of quantifiers with zero        (a) In the default, Perl-compatible state, set the length to be longer
1738        minima. */        than the amount of subject left; this ensures that every attempt at a
1739          match fails. We can't just fail here, because of the possibility of
1740        length = (offset >= offset_top || md->offset_vector[offset] < 0)?        quantifiers with zero minima.
1741          md->end_subject - eptr + 1 :  
1742          md->offset_vector[offset+1] - md->offset_vector[offset];        (b) If the JavaScript compatibility flag is set, set the length to zero
1743          so that the back reference matches an empty string.
1744    
1745          Otherwise, set the length to the length of what was matched by the
1746          referenced subpattern. */
1747    
1748          if (offset >= offset_top || md->offset_vector[offset] < 0)
1749            length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1750          else
1751            length = md->offset_vector[offset+1] - md->offset_vector[offset];
1752    
1753        /* Set up for repetition, or handle the non-repeated case */        /* Set up for repetition, or handle the non-repeated case */
1754    
# Line 2935  for (;;) Line 2951  for (;;)
2951          case OP_ANY:          case OP_ANY:
2952          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2953            {            {
2954            if (eptr >= md->end_subject ||            if (eptr >= md->end_subject || IS_NEWLINE(eptr))
                ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))  
2955              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
2956            eptr++;            eptr++;
2957            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2958            }            }
2959          break;          break;
2960    
2961            case OP_ALLANY:
2962            for (i = 1; i <= min; i++)
2963              {
2964              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2965              eptr++;
2966              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2967              }
2968            break;
2969    
2970          case OP_ANYBYTE:          case OP_ANYBYTE:
2971          eptr += min;          eptr += min;
2972          break;          break;
# Line 3151  for (;;) Line 3175  for (;;)
3175        switch(ctype)        switch(ctype)
3176          {          {
3177          case OP_ANY:          case OP_ANY:
3178          if ((ims & PCRE_DOTALL) == 0)          for (i = 1; i <= min; i++)
3179            {            {
3180            for (i = 1; i <= min; i++)            if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3181              {            eptr++;
             if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);  
             eptr++;  
             }  
3182            }            }
3183          else eptr += min;          break;
3184    
3185            case OP_ALLANY:
3186            eptr += min;
3187          break;          break;
3188    
3189          case OP_ANYBYTE:          case OP_ANYBYTE:
# Line 3416  for (;;) Line 3440  for (;;)
3440            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3441            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3442            if (fi >= max || eptr >= md->end_subject ||            if (fi >= max || eptr >= md->end_subject ||
3443                 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&                 (ctype == OP_ANY && IS_NEWLINE(eptr)))
                 IS_NEWLINE(eptr)))  
3444              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3445    
3446            GETCHARINC(c, eptr);            GETCHARINC(c, eptr);
3447            switch(ctype)            switch(ctype)
3448              {              {
3449              case OP_ANY:        /* This is the DOTALL case */              case OP_ANY:        /* This is the non-NL case */
3450              break;              case OP_ALLANY:
   
3451              case OP_ANYBYTE:              case OP_ANYBYTE:
3452              break;              break;
3453    
# Line 3577  for (;;) Line 3599  for (;;)
3599            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3600            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3601            if (fi >= max || eptr >= md->end_subject ||            if (fi >= max || eptr >= md->end_subject ||
3602                 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))                 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3603              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3604    
3605            c = *eptr++;            c = *eptr++;
3606            switch(ctype)            switch(ctype)
3607              {              {
3608              case OP_ANY:   /* This is the DOTALL case */              case OP_ANY:     /* This is the non-NL case */
3609              break;              case OP_ALLANY:
   
3610              case OP_ANYBYTE:              case OP_ANYBYTE:
3611              break;              break;
3612    
# Line 3839  for (;;) Line 3860  for (;;)
3860            case OP_ANY:            case OP_ANY:
3861            if (max < INT_MAX)            if (max < INT_MAX)
3862              {              {
3863              if ((ims & PCRE_DOTALL) == 0)              for (i = min; i < max; i++)
               {  
               for (i = min; i < max; i++)  
                 {  
                 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;  
                 eptr++;  
                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
                 }  
               }  
             else  
3864                {                {
3865                for (i = min; i < max; i++)                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3866                  {                eptr++;
3867                  if (eptr >= md->end_subject) break;                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
                 eptr++;  
                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
                 }  
3868                }                }
3869              }              }
3870    
# Line 3863  for (;;) Line 3872  for (;;)
3872    
3873            else            else
3874              {              {
3875              if ((ims & PCRE_DOTALL) == 0)              for (i = min; i < max; i++)
3876                {                {
3877                for (i = min; i < max; i++)                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3878                  {                eptr++;
3879                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
                 eptr++;  
                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
                 }  
3880                }                }
3881              else              }
3882              break;
3883    
3884              case OP_ALLANY:
3885              if (max < INT_MAX)
3886                {
3887                for (i = min; i < max; i++)
3888                {                {
3889                eptr = md->end_subject;                if (eptr >= md->end_subject) break;
3890                  eptr++;
3891                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3892                }                }
3893              }              }
3894              else eptr = md->end_subject;   /* Unlimited UTF-8 repeat */
3895            break;            break;
3896    
3897            /* The byte case is the same as non-UTF8 */            /* The byte case is the same as non-UTF8 */
# Line 4064  for (;;) Line 4079  for (;;)
4079          switch(ctype)          switch(ctype)
4080            {            {
4081            case OP_ANY:            case OP_ANY:
4082            if ((ims & PCRE_DOTALL) == 0)            for (i = min; i < max; i++)
4083              {              {
4084              for (i = min; i < max; i++)              if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4085                {              eptr++;
               if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;  
               eptr++;  
               }  
             break;  
4086              }              }
4087            /* For DOTALL case, fall through and treat as \C */            break;
4088    
4089              case OP_ALLANY:
4090            case OP_ANYBYTE:            case OP_ANYBYTE:
4091            c = max - min;            c = max - min;
4092            if (c > (unsigned int)(md->end_subject - eptr))            if (c > (unsigned int)(md->end_subject - eptr))
# Line 4246  HEAP_RETURN: Line 4258  HEAP_RETURN:
4258  switch (frame->Xwhere)  switch (frame->Xwhere)
4259    {    {
4260    LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)    LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4261    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4262    LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)    LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4263    LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)    LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4264    LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)    LBL(53) LBL(54)
4265    LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47) LBL(48)  #ifdef SUPPORT_UTF8
4266    LBL(49) LBL(50) LBL(51) LBL(52) LBL(53) LBL(54)    LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4267      LBL(32) LBL(34) LBL(42) LBL(46)
4268    #ifdef SUPPORT_UCP
4269      LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4270    #endif  /* SUPPORT_UCP */
4271    #endif  /* SUPPORT_UTF8 */
4272    default:    default:
4273    DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));    DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4274    return PCRE_ERROR_INTERNAL;    return PCRE_ERROR_INTERNAL;
# Line 4445  end_subject = md->end_subject; Line 4462  end_subject = md->end_subject;
4462    
4463  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4464  utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;  utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4465    md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4466    
4467  md->notbol = (options & PCRE_NOTBOL) != 0;  md->notbol = (options & PCRE_NOTBOL) != 0;
4468  md->noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
# Line 4469  switch (options & (PCRE_BSR_ANYCRLF|PCRE Line 4487  switch (options & (PCRE_BSR_ANYCRLF|PCRE
4487    md->bsr_anycrlf = TRUE;    md->bsr_anycrlf = TRUE;
4488  #else  #else
4489    md->bsr_anycrlf = FALSE;    md->bsr_anycrlf = FALSE;
4490  #endif  #endif
4491    break;    break;
4492    
4493    case PCRE_BSR_ANYCRLF:    case PCRE_BSR_ANYCRLF:
# Line 4665  for(;;) Line 4683  for(;;)
4683      if (first_byte_caseless)      if (first_byte_caseless)
4684        while (start_match < end_subject &&        while (start_match < end_subject &&
4685               md->lcc[*start_match] != first_byte)               md->lcc[*start_match] != first_byte)
4686          start_match++;          { NEXTCHAR(start_match); }
4687      else      else
4688        while (start_match < end_subject && *start_match != first_byte)        while (start_match < end_subject && *start_match != first_byte)
4689          start_match++;          { NEXTCHAR(start_match); }
4690      }      }
4691    
4692    /* Or to just after a linebreak for a multiline match if possible */    /* Or to just after a linebreak for a multiline match if possible */
# Line 4678  for(;;) Line 4696  for(;;)
4696      if (start_match > md->start_subject + start_offset)      if (start_match > md->start_subject + start_offset)
4697        {        {
4698        while (start_match <= end_subject && !WAS_NEWLINE(start_match))        while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4699          start_match++;          { NEXTCHAR(start_match); }
4700    
4701        /* If we have just passed a CR and the newline option is ANY or ANYCRLF,        /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4702        and we are now at a LF, advance the match position by one more character.        and we are now at a LF, advance the match position by one more character.
# Line 4699  for(;;) Line 4717  for(;;)
4717      while (start_match < end_subject)      while (start_match < end_subject)
4718        {        {
4719        register unsigned int c = *start_match;        register unsigned int c = *start_match;
4720        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;        if ((start_bits[c/8] & (1 << (c&7))) == 0)
4721            { NEXTCHAR(start_match); }
4722          else break;
4723        }        }
4724      }      }
4725    

Legend:
Removed from v.231  
changed lines
  Added in v.342

  ViewVC Help
Powered by ViewVC 1.1.5