/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 365 by ph10, Fri Jul 11 17:06:55 2008 UTC | revision 398 by ph10, Fri Mar 20 20:41:29 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 60  applications. */ Line 61  applications. */
61  #define SP "                   "  #define SP "                   "
62    
63    
   
64  /*************************************************  /*************************************************
65  *      Code parameters and static tables         *  *      Code parameters and static tables         *
66  *************************************************/  *************************************************/
# Line 511  for (;;) Line 511  for (;;)
511      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
512      const uschar *code;      const uschar *code;
513      int state_offset = current_state->offset;      int state_offset = current_state->offset;
514      int count, codevalue;      int count, codevalue, rrc;
515    
516  #ifdef DEBUG  #ifdef DEBUG
517      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 757  for (;;) Line 757  for (;;)
757        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
758          {          {
759          if (clen == 0 ||          if (clen == 0 ||
760              (IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
761                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
762              ))              ))
763            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 2200  for (;;) Line 2200  for (;;)
2200          {          {
2201          int local_offsets[1000];          int local_offsets[1000];
2202          int local_workspace[1000];          int local_workspace[1000];
2203          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2204            int condcode;
2205    
2206            /* Because of the way auto-callout works during compile, a callout item
2207            is inserted between OP_COND and an assertion condition. This does not
2208            happen for the other conditions. */
2209    
2210            if (code[LINK_SIZE+1] == OP_CALLOUT)
2211              {
2212              rrc = 0;
2213              if (pcre_callout != NULL)
2214                {
2215                pcre_callout_block cb;
2216                cb.version          = 1;   /* Version 1 of the callout block */
2217                cb.callout_number   = code[LINK_SIZE+2];
2218                cb.offset_vector    = offsets;
2219                cb.subject          = (PCRE_SPTR)start_subject;
2220                cb.subject_length   = end_subject - start_subject;
2221                cb.start_match      = current_subject - start_subject;
2222                cb.current_position = ptr - start_subject;
2223                cb.pattern_position = GET(code, LINK_SIZE + 3);
2224                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2225                cb.capture_top      = 1;
2226                cb.capture_last     = -1;
2227                cb.callout_data     = md->callout_data;
2228                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2229                }
2230              if (rrc > 0) break;                      /* Fail this thread */
2231              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2232              }
2233    
2234            condcode = code[LINK_SIZE+1];
2235    
2236          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2237    
2238          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
# Line 2209  for (;;) Line 2240  for (;;)
2240          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2241    
2242          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2243            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2244    
2245          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2246          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
# Line 2221  for (;;) Line 2250  for (;;)
2250            {            {
2251            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2252            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2253            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0)
2254              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2255              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2256            }            }
2257    
2258          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2252  for (;;) Line 2282  for (;;)
2282                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2283              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2284            else            else
2285              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2286            }            }
2287          }          }
2288        break;        break;
# Line 2404  for (;;) Line 2434  for (;;)
2434        /* Handle callouts */        /* Handle callouts */
2435    
2436        case OP_CALLOUT:        case OP_CALLOUT:
2437          rrc = 0;
2438        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2439          {          {
         int rrc;  
2440          pcre_callout_block cb;          pcre_callout_block cb;
2441          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2442          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 2421  for (;;) Line 2451  for (;;)
2451          cb.capture_last     = -1;          cb.capture_last     = -1;
2452          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2453          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2454          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          }
2455          }        if (rrc == 0)
2456            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2457        break;        break;
2458    
2459    
# Line 2614  switch ((((options & PCRE_NEWLINE_BITS) Line 2645  switch ((((options & PCRE_NEWLINE_BITS)
2645           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2646    {    {
2647    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2648    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2649    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2650    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2651         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2652    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2653    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2654    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2713  if ((re->flags & PCRE_REQCHSET) != 0) Line 2744  if ((re->flags & PCRE_REQCHSET) != 0)
2744    }    }
2745    
2746  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2747  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2748  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2749    
2750  for (;;)  for (;;)
2751    {    {
# Line 2725  for (;;) Line 2755  for (;;)
2755      {      {
2756      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2757    
2758      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2759      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2760      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2761      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2762    
2763      if (firstline)      if (firstline)
2764        {        {
2765        USPTR t = current_subject;        USPTR t = current_subject;
2766  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2767        if (utf8)        if (utf8)
2768          {          {
2769          while (t < md->end_subject && !IS_NEWLINE(t))          while (t < md->end_subject && !IS_NEWLINE(t))
2770            {            {
2771            t++;            t++;
2772            while (t < end_subject && (*t & 0xc0) == 0x80) t++;            while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2773            }            }
2774          }          }
2775        else        else
2776  #endif  #endif
2777        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2778        end_subject = t;        end_subject = t;
2779        }        }
2780    
2781      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2782        starting point is not found, or if a known later character is not present.
2783        However, there is an option that disables these, for testing and for
2784        ensuring that all callouts do actually occur. */
2785    
2786        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2787        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2788    
2789      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2790    
2791      else if (startline)        if (first_byte >= 0)
       {  
       if (current_subject > md->start_subject + start_offset)  
2792          {          {
2793  #ifdef SUPPORT_UTF8          if (first_byte_caseless)
2794          if (utf8)            while (current_subject < end_subject &&
2795                     lcc[*current_subject] != first_byte)
2796                current_subject++;
2797            else
2798              while (current_subject < end_subject &&
2799                     *current_subject != first_byte)
2800                current_subject++;
2801            }
2802    
2803          /* Or to just after a linebreak for a multiline match if possible */
2804    
2805          else if (startline)
2806            {
2807            if (current_subject > md->start_subject + start_offset)
2808            {            {
2809            while (current_subject < end_subject && !WAS_NEWLINE(current_subject))  #ifdef SUPPORT_UTF8
2810              if (utf8)
2811              {              {
2812              current_subject++;              while (current_subject < end_subject &&
2813              while(current_subject < end_subject &&                     !WAS_NEWLINE(current_subject))
2814                    (*current_subject & 0xc0) == 0x80)                {
2815                current_subject++;                current_subject++;
2816              }                while(current_subject < end_subject &&
2817                        (*current_subject & 0xc0) == 0x80)
2818                    current_subject++;
2819                  }
2820                }
2821              else
2822    #endif
2823              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2824                current_subject++;
2825    
2826              /* If we have just passed a CR and the newline option is ANY or
2827              ANYCRLF, and we are now at a LF, advance the match position by one
2828              more character. */
2829    
2830              if (current_subject[-1] == CHAR_CR &&
2831                   (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2832                   current_subject < end_subject &&
2833                   *current_subject == CHAR_NL)
2834                current_subject++;
2835            }            }
         else  
 #endif  
         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))  
           current_subject++;  
   
         /* If we have just passed a CR and the newline option is ANY or  
         ANYCRLF, and we are now at a LF, advance the match position by one more  
         character. */  
   
         if (current_subject[-1] == '\r' &&  
              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&  
              current_subject < end_subject &&  
              *current_subject == '\n')  
           current_subject++;  
2836          }          }
       }  
2837    
2838      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2839    
2840      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
2841          {          {
2842          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2843          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2844            else break;            register unsigned int c = *current_subject;
2845              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2846                else break;
2847              }
2848          }          }
2849        }        }
2850    
# Line 2824  for (;;) Line 2866  for (;;)
2866    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2867    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2868    
2869    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2870    */    also be explicitly deactivated. */
2871    
2872    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2873          req_byte >= 0 &&
2874        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2875        (options & PCRE_PARTIAL) == 0)        (options & PCRE_PARTIAL) == 0)
2876      {      {
# Line 2903  for (;;) Line 2946  for (;;)
2946    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
2947    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2948    
2949    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
2950        current_subject < end_subject &&        current_subject < end_subject &&
2951        *current_subject == '\n' &&        *current_subject == CHAR_NL &&
2952        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2953          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
2954           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.365  
changed lines
  Added in v.398

  ViewVC Help
Powered by ViewVC 1.1.5