/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 365 by ph10, Fri Jul 11 17:06:55 2008 UTC | revision 397 by ph10, Fri Mar 20 19:40:08 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 511  for (;;) Line 512  for (;;)
512      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
513      const uschar *code;      const uschar *code;
514      int state_offset = current_state->offset;      int state_offset = current_state->offset;
515      int count, codevalue;      int count, codevalue, rrc;
516    
517  #ifdef DEBUG  #ifdef DEBUG
518      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 757  for (;;) Line 758  for (;;)
758        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
759          {          {
760          if (clen == 0 ||          if (clen == 0 ||
761              (IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
762                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
763              ))              ))
764            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 2200  for (;;) Line 2201  for (;;)
2201          {          {
2202          int local_offsets[1000];          int local_offsets[1000];
2203          int local_workspace[1000];          int local_workspace[1000];
2204          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2205            int condcode;
2206    
2207            /* Because of the way auto-callout works during compile, a callout item
2208            is inserted between OP_COND and an assertion condition. */
2209    
2210            if (code[LINK_SIZE+1] == OP_CALLOUT)
2211              {
2212              if (pcre_callout != NULL)
2213                {
2214                int rrc;
2215                pcre_callout_block cb;
2216                cb.version          = 1;   /* Version 1 of the callout block */
2217                cb.callout_number   = code[LINK_SIZE+2];
2218                cb.offset_vector    = offsets;
2219                cb.subject          = (PCRE_SPTR)start_subject;
2220                cb.subject_length   = end_subject - start_subject;
2221                cb.start_match      = current_subject - start_subject;
2222                cb.current_position = ptr - start_subject;
2223                cb.pattern_position = GET(code, LINK_SIZE + 3);
2224                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2225                cb.capture_top      = 1;
2226                cb.capture_last     = -1;
2227                cb.callout_data     = md->callout_data;
2228                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2229                if (rrc == 0) { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2230                }
2231              code += _pcre_OP_lengths[OP_CALLOUT];
2232              }
2233    
2234            condcode = code[LINK_SIZE+1];
2235    
2236          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2237    
2238          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
# Line 2210  for (;;) Line 2241  for (;;)
2241    
2242          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2243            {            {
2244            ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);            ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0);
2245            }            }
2246    
2247          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
# Line 2222  for (;;) Line 2253  for (;;)
2253            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2254            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2255            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2256              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2257            }            }
2258    
2259          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2250  for (;;) Line 2281  for (;;)
2281    
2282            if ((rc >= 0) ==            if ((rc >= 0) ==
2283                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2284              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              {
2285                ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0);
2286                }
2287            else            else
2288              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2289            }            }
2290          }          }
2291        break;        break;
# Line 2404  for (;;) Line 2437  for (;;)
2437        /* Handle callouts */        /* Handle callouts */
2438    
2439        case OP_CALLOUT:        case OP_CALLOUT:
2440          rrc = 0;
2441        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2442          {          {
         int rrc;  
2443          pcre_callout_block cb;          pcre_callout_block cb;
2444          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2445          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 2421  for (;;) Line 2454  for (;;)
2454          cb.capture_last     = -1;          cb.capture_last     = -1;
2455          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2456          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2457          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          }
2458          }        if (rrc == 0)
2459            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2460        break;        break;
2461    
2462    
# Line 2614  switch ((((options & PCRE_NEWLINE_BITS) Line 2648  switch ((((options & PCRE_NEWLINE_BITS)
2648           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2649    {    {
2650    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2651    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2652    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2653    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2654         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2655    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2656    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2657    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2713  if ((re->flags & PCRE_REQCHSET) != 0) Line 2747  if ((re->flags & PCRE_REQCHSET) != 0)
2747    }    }
2748    
2749  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2750  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2751  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2752    
2753  for (;;)  for (;;)
2754    {    {
# Line 2725  for (;;) Line 2758  for (;;)
2758      {      {
2759      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2760    
2761      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2762      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2763      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2764      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2765    
2766      if (firstline)      if (firstline)
2767        {        {
2768        USPTR t = current_subject;        USPTR t = current_subject;
2769  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2770        if (utf8)        if (utf8)
2771          {          {
2772          while (t < md->end_subject && !IS_NEWLINE(t))          while (t < md->end_subject && !IS_NEWLINE(t))
2773            {            {
2774            t++;            t++;
2775            while (t < end_subject && (*t & 0xc0) == 0x80) t++;            while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2776            }            }
2777          }          }
2778        else        else
2779  #endif  #endif
2780        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2781        end_subject = t;        end_subject = t;
2782        }        }
2783    
2784      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2785        starting point is not found, or if a known later character is not present.
2786        However, there is an option that disables these, for testing and for
2787        ensuring that all callouts do actually occur. */
2788    
2789        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2790        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2791    
2792      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2793    
2794      else if (startline)        if (first_byte >= 0)
       {  
       if (current_subject > md->start_subject + start_offset)  
2795          {          {
2796  #ifdef SUPPORT_UTF8          if (first_byte_caseless)
2797          if (utf8)            while (current_subject < end_subject &&
2798                     lcc[*current_subject] != first_byte)
2799                current_subject++;
2800            else
2801              while (current_subject < end_subject &&
2802                     *current_subject != first_byte)
2803                current_subject++;
2804            }
2805    
2806          /* Or to just after a linebreak for a multiline match if possible */
2807    
2808          else if (startline)
2809            {
2810            if (current_subject > md->start_subject + start_offset)
2811            {            {
2812            while (current_subject < end_subject && !WAS_NEWLINE(current_subject))  #ifdef SUPPORT_UTF8
2813              if (utf8)
2814              {              {
2815              current_subject++;              while (current_subject < end_subject &&
2816              while(current_subject < end_subject &&                     !WAS_NEWLINE(current_subject))
2817                    (*current_subject & 0xc0) == 0x80)                {
2818                current_subject++;                current_subject++;
2819              }                while(current_subject < end_subject &&
2820                        (*current_subject & 0xc0) == 0x80)
2821                    current_subject++;
2822                  }
2823                }
2824              else
2825    #endif
2826              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2827                current_subject++;
2828    
2829              /* If we have just passed a CR and the newline option is ANY or
2830              ANYCRLF, and we are now at a LF, advance the match position by one
2831              more character. */
2832    
2833              if (current_subject[-1] == CHAR_CR &&
2834                   (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2835                   current_subject < end_subject &&
2836                   *current_subject == CHAR_NL)
2837                current_subject++;
2838            }            }
         else  
 #endif  
         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))  
           current_subject++;  
   
         /* If we have just passed a CR and the newline option is ANY or  
         ANYCRLF, and we are now at a LF, advance the match position by one more  
         character. */  
   
         if (current_subject[-1] == '\r' &&  
              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&  
              current_subject < end_subject &&  
              *current_subject == '\n')  
           current_subject++;  
2839          }          }
       }  
2840    
2841      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2842    
2843      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
2844          {          {
2845          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2846          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2847            else break;            register unsigned int c = *current_subject;
2848              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2849                else break;
2850              }
2851          }          }
2852        }        }
2853    
# Line 2824  for (;;) Line 2869  for (;;)
2869    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2870    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2871    
2872    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2873    */    also be explicitly deactivated. */
2874    
2875    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2876          req_byte >= 0 &&
2877        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2878        (options & PCRE_PARTIAL) == 0)        (options & PCRE_PARTIAL) == 0)
2879      {      {
# Line 2903  for (;;) Line 2949  for (;;)
2949    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
2950    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2951    
2952    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
2953        current_subject < end_subject &&        current_subject < end_subject &&
2954        *current_subject == '\n' &&        *current_subject == CHAR_NL &&
2955        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2956          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
2957           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.365  
changed lines
  Added in v.397

  ViewVC Help
Powered by ViewVC 1.1.5