/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 365 by ph10, Fri Jul 11 17:06:55 2008 UTC revision 428 by ph10, Mon Aug 31 17:10:26 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 60  applications. */ Line 61  applications. */
61  #define SP "                   "  #define SP "                   "
62    
63    
   
64  /*************************************************  /*************************************************
65  *      Code parameters and static tables         *  *      Code parameters and static tables         *
66  *************************************************/  *************************************************/
# Line 454  for (;;) Line 454  for (;;)
454    int i, j;    int i, j;
455    int clen, dlen;    int clen, dlen;
456    unsigned int c, d;    unsigned int c, d;
457      int forced_fail = 0;
458      int reached_end = 0;
459    
460    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
461    new state list. */    new state list. */
# Line 511  for (;;) Line 513  for (;;)
513      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
514      const uschar *code;      const uschar *code;
515      int state_offset = current_state->offset;      int state_offset = current_state->offset;
516      int count, codevalue;      int count, codevalue, rrc;
517    
518  #ifdef DEBUG  #ifdef DEBUG
519      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 624  for (;;) Line 626  for (;;)
626            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
627            }            }
628          }          }
629        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
630          {          {
631          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          reached_end++;    /* Count branches that reach the end */
632            else if (match_count > 0 && ++match_count * 2 >= offsetcount)          if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
633              match_count = 0;            {
634          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
635          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));              else if (match_count > 0 && ++match_count * 2 >= offsetcount)
636          if (offsetcount >= 2)                match_count = 0;
637            {            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
638            offsets[0] = current_subject - start_subject;            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
639            offsets[1] = ptr - start_subject;            if (offsetcount >= 2)
640            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              {
641              offsets[1] - offsets[0], current_subject));              offsets[0] = current_subject - start_subject;
642            }              offsets[1] = ptr - start_subject;
643          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
644            {                offsets[1] - offsets[0], current_subject));
645            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              }
646              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
647              match_count, rlevel*2-2, SP));              {
648            return match_count;              DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
649            }                "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
650                  match_count, rlevel*2-2, SP));
651                return match_count;
652                }
653              }
654          }          }
655        break;        break;
656    
# Line 757  for (;;) Line 763  for (;;)
763        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
764          {          {
765          if (clen == 0 ||          if (clen == 0 ||
766              (IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
767                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
768              ))              ))
769            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 802  for (;;) Line 808  for (;;)
808            }            }
809          else left_word = 0;          else left_word = 0;
810    
811          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
812            else right_word = 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
813            else              /* This is a fudge to ensure that if this is the */
814              {               /* last item in the pattern, we don't count it as */
815              reached_end--;  /* reached, thus disabling a partial match. */
816              right_word = 0;
817              }
818    
819          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
820            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 2157  for (;;) Line 2168  for (;;)
2168    
2169  /* ========================================================================== */  /* ========================================================================== */
2170        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2171        to use recursion in order to handle them. The "always failing" assersion        to use recursion in order to handle them. The "always failing" assertion
2172        (?!) is optimised when compiling to OP_FAIL, so we have to support that,        (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2173        though the other "backtracking verbs" are not supported. */        though the other "backtracking verbs" are not supported. */
2174    
2175        case OP_FAIL:        case OP_FAIL:
2176          forced_fail++;    /* Count FAILs for multiple states */
2177        break;        break;
2178    
2179        case OP_ASSERT:        case OP_ASSERT:
# Line 2200  for (;;) Line 2212  for (;;)
2212          {          {
2213          int local_offsets[1000];          int local_offsets[1000];
2214          int local_workspace[1000];          int local_workspace[1000];
2215          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2216            int condcode;
2217    
2218            /* Because of the way auto-callout works during compile, a callout item
2219            is inserted between OP_COND and an assertion condition. This does not
2220            happen for the other conditions. */
2221    
2222            if (code[LINK_SIZE+1] == OP_CALLOUT)
2223              {
2224              rrc = 0;
2225              if (pcre_callout != NULL)
2226                {
2227                pcre_callout_block cb;
2228                cb.version          = 1;   /* Version 1 of the callout block */
2229                cb.callout_number   = code[LINK_SIZE+2];
2230                cb.offset_vector    = offsets;
2231                cb.subject          = (PCRE_SPTR)start_subject;
2232                cb.subject_length   = end_subject - start_subject;
2233                cb.start_match      = current_subject - start_subject;
2234                cb.current_position = ptr - start_subject;
2235                cb.pattern_position = GET(code, LINK_SIZE + 3);
2236                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2237                cb.capture_top      = 1;
2238                cb.capture_last     = -1;
2239                cb.callout_data     = md->callout_data;
2240                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2241                }
2242              if (rrc > 0) break;                      /* Fail this thread */
2243              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2244              }
2245    
2246            condcode = code[LINK_SIZE+1];
2247    
2248          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2249    
# Line 2209  for (;;) Line 2252  for (;;)
2252          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2253    
2254          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2255            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2256    
2257          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2258          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
# Line 2221  for (;;) Line 2262  for (;;)
2262            {            {
2263            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2264            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2265            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0)
2266              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2267              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2268            }            }
2269    
2270          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2252  for (;;) Line 2294  for (;;)
2294                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2295              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2296            else            else
2297              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2298            }            }
2299          }          }
2300        break;        break;
# Line 2404  for (;;) Line 2446  for (;;)
2446        /* Handle callouts */        /* Handle callouts */
2447    
2448        case OP_CALLOUT:        case OP_CALLOUT:
2449          rrc = 0;
2450        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2451          {          {
         int rrc;  
2452          pcre_callout_block cb;          pcre_callout_block cb;
2453          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2454          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 2421  for (;;) Line 2463  for (;;)
2463          cb.capture_last     = -1;          cb.capture_last     = -1;
2464          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2465          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2466          }          }
2467          if (rrc == 0)
2468            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2469        break;        break;
2470    
2471    
# Line 2438  for (;;) Line 2481  for (;;)
2481    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2482    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2483    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2484    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions. The "forced_
2485      fail" variable counts the number of (*F) encountered for the character. If it
2486      is equal to the original active_count (saved in workspace[1]) it means that
2487      (*F) was found on every active state. In this case we don't want to give a
2488      partial match. */
2489    
2490    if (new_count <= 0)    if (new_count <= 0)
2491      {      {
2492      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2493          rlevel == 1 &&                         /* Top level match function */          reached_end != workspace[1] &&               /* Not all reached end */
2494          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2495          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
2496          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2497            ||                                           /* or... */
2498            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2499             match_count < 0)                            /* no matches */
2500            ) &&                                         /* And... */
2501            ptr >= end_subject &&                     /* Reached end of subject */
2502            ptr > current_subject)                    /* Matched non-empty string */
2503        {        {
2504        if (offsetcount >= 2)        if (offsetcount >= 2)
2505          {          {
# Line 2614  switch ((((options & PCRE_NEWLINE_BITS) Line 2667  switch ((((options & PCRE_NEWLINE_BITS)
2667           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2668    {    {
2669    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2670    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2671    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2672    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2673         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2674    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2675    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2676    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2713  if ((re->flags & PCRE_REQCHSET) != 0) Line 2766  if ((re->flags & PCRE_REQCHSET) != 0)
2766    }    }
2767    
2768  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2769  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2770  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2771    
2772  for (;;)  for (;;)
2773    {    {
# Line 2725  for (;;) Line 2777  for (;;)
2777      {      {
2778      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2779    
2780      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2781      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2782      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2783      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2784    
2785      if (firstline)      if (firstline)
2786        {        {
2787        USPTR t = current_subject;        USPTR t = current_subject;
2788  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2789        if (utf8)        if (utf8)
2790          {          {
2791          while (t < md->end_subject && !IS_NEWLINE(t))          while (t < md->end_subject && !IS_NEWLINE(t))
2792            {            {
2793            t++;            t++;
2794            while (t < end_subject && (*t & 0xc0) == 0x80) t++;            while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2795            }            }
2796          }          }
2797        else        else
2798  #endif  #endif
2799        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2800        end_subject = t;        end_subject = t;
2801        }        }
2802    
2803      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2804        starting point is not found, or if a known later character is not present.
2805        However, there is an option that disables these, for testing and for
2806        ensuring that all callouts do actually occur. */
2807    
2808        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2809        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2810    
2811      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2812    
2813      else if (startline)        if (first_byte >= 0)
       {  
       if (current_subject > md->start_subject + start_offset)  
2814          {          {
2815  #ifdef SUPPORT_UTF8          if (first_byte_caseless)
2816          if (utf8)            while (current_subject < end_subject &&
2817                     lcc[*current_subject] != first_byte)
2818                current_subject++;
2819            else
2820              while (current_subject < end_subject &&
2821                     *current_subject != first_byte)
2822                current_subject++;
2823            }
2824    
2825          /* Or to just after a linebreak for a multiline match if possible */
2826    
2827          else if (startline)
2828            {
2829            if (current_subject > md->start_subject + start_offset)
2830            {            {
2831            while (current_subject < end_subject && !WAS_NEWLINE(current_subject))  #ifdef SUPPORT_UTF8
2832              if (utf8)
2833              {              {
2834              current_subject++;              while (current_subject < end_subject &&
2835              while(current_subject < end_subject &&                     !WAS_NEWLINE(current_subject))
2836                    (*current_subject & 0xc0) == 0x80)                {
2837                current_subject++;                current_subject++;
2838              }                while(current_subject < end_subject &&
2839                        (*current_subject & 0xc0) == 0x80)
2840                    current_subject++;
2841                  }
2842                }
2843              else
2844    #endif
2845              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2846                current_subject++;
2847    
2848              /* If we have just passed a CR and the newline option is ANY or
2849              ANYCRLF, and we are now at a LF, advance the match position by one
2850              more character. */
2851    
2852              if (current_subject[-1] == CHAR_CR &&
2853                   (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2854                   current_subject < end_subject &&
2855                   *current_subject == CHAR_NL)
2856                current_subject++;
2857            }            }
         else  
 #endif  
         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))  
           current_subject++;  
   
         /* If we have just passed a CR and the newline option is ANY or  
         ANYCRLF, and we are now at a LF, advance the match position by one more  
         character. */  
   
         if (current_subject[-1] == '\r' &&  
              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&  
              current_subject < end_subject &&  
              *current_subject == '\n')  
           current_subject++;  
2858          }          }
       }  
2859    
2860      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2861    
2862      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
2863          {          {
2864          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2865          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2866            else break;            register unsigned int c = *current_subject;
2867              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2868                else break;
2869              }
2870          }          }
2871        }        }
2872    
# Line 2824  for (;;) Line 2888  for (;;)
2888    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2889    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2890    
2891    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2892    */    also be explicitly deactivated. Furthermore, we have to disable when
2893      restarting after a partial match, because the required character may have
2894      already been matched. */
2895    
2896    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2897          req_byte >= 0 &&
2898        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2899        (options & PCRE_PARTIAL) == 0)        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2900      {      {
2901      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2902    
# Line 2903  for (;;) Line 2970  for (;;)
2970    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
2971    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2972    
2973    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
2974        current_subject < end_subject &&        current_subject < end_subject &&
2975        *current_subject == '\n' &&        *current_subject == CHAR_NL &&
2976        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2977          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
2978           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.365  
changed lines
  Added in v.428

  ViewVC Help
Powered by ViewVC 1.1.5