/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 389 by ph10, Sun Mar 15 18:24:05 2009 UTC revision 428 by ph10, Mon Aug 31 17:10:26 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language (but see  and semantics are as close as possible to those of the Perl 5 language (but see
7  below for why this module is different).  below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
# Line 61  applications. */ Line 61  applications. */
61  #define SP "                   "  #define SP "                   "
62    
63    
   
64  /*************************************************  /*************************************************
65  *      Code parameters and static tables         *  *      Code parameters and static tables         *
66  *************************************************/  *************************************************/
# Line 455  for (;;) Line 454  for (;;)
454    int i, j;    int i, j;
455    int clen, dlen;    int clen, dlen;
456    unsigned int c, d;    unsigned int c, d;
457      int forced_fail = 0;
458      int reached_end = 0;
459    
460    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
461    new state list. */    new state list. */
# Line 512  for (;;) Line 513  for (;;)
513      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
514      const uschar *code;      const uschar *code;
515      int state_offset = current_state->offset;      int state_offset = current_state->offset;
516      int count, codevalue;      int count, codevalue, rrc;
517    
518  #ifdef DEBUG  #ifdef DEBUG
519      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 625  for (;;) Line 626  for (;;)
626            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
627            }            }
628          }          }
629        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
630          {          {
631          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          reached_end++;    /* Count branches that reach the end */
632            else if (match_count > 0 && ++match_count * 2 >= offsetcount)          if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
633              match_count = 0;            {
634          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
635          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));              else if (match_count > 0 && ++match_count * 2 >= offsetcount)
636          if (offsetcount >= 2)                match_count = 0;
637            {            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
638            offsets[0] = current_subject - start_subject;            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
639            offsets[1] = ptr - start_subject;            if (offsetcount >= 2)
640            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              {
641              offsets[1] - offsets[0], current_subject));              offsets[0] = current_subject - start_subject;
642            }              offsets[1] = ptr - start_subject;
643          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
644            {                offsets[1] - offsets[0], current_subject));
645            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              }
646              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
647              match_count, rlevel*2-2, SP));              {
648            return match_count;              DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
649            }                "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
650                  match_count, rlevel*2-2, SP));
651                return match_count;
652                }
653              }
654          }          }
655        break;        break;
656    
# Line 803  for (;;) Line 808  for (;;)
808            }            }
809          else left_word = 0;          else left_word = 0;
810    
811          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
812            else right_word = 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
813            else              /* This is a fudge to ensure that if this is the */
814              {               /* last item in the pattern, we don't count it as */
815              reached_end--;  /* reached, thus disabling a partial match. */
816              right_word = 0;
817              }
818    
819          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
820            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 2158  for (;;) Line 2168  for (;;)
2168    
2169  /* ========================================================================== */  /* ========================================================================== */
2170        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2171        to use recursion in order to handle them. The "always failing" assersion        to use recursion in order to handle them. The "always failing" assertion
2172        (?!) is optimised when compiling to OP_FAIL, so we have to support that,        (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2173        though the other "backtracking verbs" are not supported. */        though the other "backtracking verbs" are not supported. */
2174    
2175        case OP_FAIL:        case OP_FAIL:
2176          forced_fail++;    /* Count FAILs for multiple states */
2177        break;        break;
2178    
2179        case OP_ASSERT:        case OP_ASSERT:
# Line 2201  for (;;) Line 2212  for (;;)
2212          {          {
2213          int local_offsets[1000];          int local_offsets[1000];
2214          int local_workspace[1000];          int local_workspace[1000];
2215          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2216            int condcode;
2217    
2218            /* Because of the way auto-callout works during compile, a callout item
2219            is inserted between OP_COND and an assertion condition. This does not
2220            happen for the other conditions. */
2221    
2222            if (code[LINK_SIZE+1] == OP_CALLOUT)
2223              {
2224              rrc = 0;
2225              if (pcre_callout != NULL)
2226                {
2227                pcre_callout_block cb;
2228                cb.version          = 1;   /* Version 1 of the callout block */
2229                cb.callout_number   = code[LINK_SIZE+2];
2230                cb.offset_vector    = offsets;
2231                cb.subject          = (PCRE_SPTR)start_subject;
2232                cb.subject_length   = end_subject - start_subject;
2233                cb.start_match      = current_subject - start_subject;
2234                cb.current_position = ptr - start_subject;
2235                cb.pattern_position = GET(code, LINK_SIZE + 3);
2236                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2237                cb.capture_top      = 1;
2238                cb.capture_last     = -1;
2239                cb.callout_data     = md->callout_data;
2240                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2241                }
2242              if (rrc > 0) break;                      /* Fail this thread */
2243              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2244              }
2245    
2246            condcode = code[LINK_SIZE+1];
2247    
2248          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2249    
# Line 2210  for (;;) Line 2252  for (;;)
2252          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2253    
2254          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2255            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2256    
2257          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2258          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
# Line 2222  for (;;) Line 2262  for (;;)
2262            {            {
2263            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2264            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2265            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0)
2266              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2267              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2268            }            }
2269    
2270          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2253  for (;;) Line 2294  for (;;)
2294                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2295              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2296            else            else
2297              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2298            }            }
2299          }          }
2300        break;        break;
# Line 2405  for (;;) Line 2446  for (;;)
2446        /* Handle callouts */        /* Handle callouts */
2447    
2448        case OP_CALLOUT:        case OP_CALLOUT:
2449          rrc = 0;
2450        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2451          {          {
         int rrc;  
2452          pcre_callout_block cb;          pcre_callout_block cb;
2453          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2454          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 2422  for (;;) Line 2463  for (;;)
2463          cb.capture_last     = -1;          cb.capture_last     = -1;
2464          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2465          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2466          }          }
2467          if (rrc == 0)
2468            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2469        break;        break;
2470    
2471    
# Line 2439  for (;;) Line 2481  for (;;)
2481    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2482    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2483    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2484    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions. The "forced_
2485      fail" variable counts the number of (*F) encountered for the character. If it
2486      is equal to the original active_count (saved in workspace[1]) it means that
2487      (*F) was found on every active state. In this case we don't want to give a
2488      partial match. */
2489    
2490    if (new_count <= 0)    if (new_count <= 0)
2491      {      {
2492      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2493          rlevel == 1 &&                         /* Top level match function */          reached_end != workspace[1] &&               /* Not all reached end */
2494          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2495          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
2496          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2497            ||                                           /* or... */
2498            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2499             match_count < 0)                            /* no matches */
2500            ) &&                                         /* And... */
2501            ptr >= end_subject &&                     /* Reached end of subject */
2502            ptr > current_subject)                    /* Matched non-empty string */
2503        {        {
2504        if (offsetcount >= 2)        if (offsetcount >= 2)
2505          {          {
# Line 2615  switch ((((options & PCRE_NEWLINE_BITS) Line 2667  switch ((((options & PCRE_NEWLINE_BITS)
2667           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2668    {    {
2669    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2670    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2671    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2672    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2673         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2674    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2675    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2676    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2747  for (;;) Line 2799  for (;;)
2799        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2800        end_subject = t;        end_subject = t;
2801        }        }
2802    
2803      /* There are some optimizations that avoid running the match if a known      /* There are some optimizations that avoid running the match if a known
2804      starting point is not found, or if a known later character is not present.      starting point is not found, or if a known later character is not present.
2805      However, there is an option that disables these, for testing and for      However, there is an option that disables these, for testing and for
2806      ensuring that all callouts do actually occur. */      ensuring that all callouts do actually occur. */
2807    
2808      if ((options & PCRE_NO_START_OPTIMIZE) == 0)      if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2809        {        {
2810    
2811        /* Advance to a known first byte. */        /* Advance to a known first byte. */
2812    
2813        if (first_byte >= 0)        if (first_byte >= 0)
2814          {          {
2815          if (first_byte_caseless)          if (first_byte_caseless)
# Line 2765  for (;;) Line 2817  for (;;)
2817                   lcc[*current_subject] != first_byte)                   lcc[*current_subject] != first_byte)
2818              current_subject++;              current_subject++;
2819          else          else
2820            while (current_subject < end_subject &&            while (current_subject < end_subject &&
2821                   *current_subject != first_byte)                   *current_subject != first_byte)
2822              current_subject++;              current_subject++;
2823          }          }
2824    
2825        /* Or to just after a linebreak for a multiline match if possible */        /* Or to just after a linebreak for a multiline match if possible */
2826    
2827        else if (startline)        else if (startline)
2828          {          {
2829          if (current_subject > md->start_subject + start_offset)          if (current_subject > md->start_subject + start_offset)
# Line 2779  for (;;) Line 2831  for (;;)
2831  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2832            if (utf8)            if (utf8)
2833              {              {
2834              while (current_subject < end_subject &&              while (current_subject < end_subject &&
2835                     !WAS_NEWLINE(current_subject))                     !WAS_NEWLINE(current_subject))
2836                {                {
2837                current_subject++;                current_subject++;
# Line 2792  for (;;) Line 2844  for (;;)
2844  #endif  #endif
2845            while (current_subject < end_subject && !WAS_NEWLINE(current_subject))            while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2846              current_subject++;              current_subject++;
2847    
2848            /* If we have just passed a CR and the newline option is ANY or            /* If we have just passed a CR and the newline option is ANY or
2849            ANYCRLF, and we are now at a LF, advance the match position by one            ANYCRLF, and we are now at a LF, advance the match position by one
2850            more character. */            more character. */
2851    
2852            if (current_subject[-1] == '\r' &&            if (current_subject[-1] == CHAR_CR &&
2853                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2854                 current_subject < end_subject &&                 current_subject < end_subject &&
2855                 *current_subject == '\n')                 *current_subject == CHAR_NL)
2856              current_subject++;              current_subject++;
2857            }            }
2858          }          }
2859    
2860        /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2861    
2862        else if (start_bits != NULL)        else if (start_bits != NULL)
2863          {          {
2864          while (current_subject < end_subject)          while (current_subject < end_subject)
# Line 2816  for (;;) Line 2868  for (;;)
2868              else break;              else break;
2869            }            }
2870          }          }
2871        }        }
2872    
2873      /* Restore fudged end_subject */      /* Restore fudged end_subject */
2874    
# Line 2836  for (;;) Line 2888  for (;;)
2888    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2889    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2890    
2891    ALSO: this processing is disabled when partial matching is requested, and can    ALSO: this processing is disabled when partial matching is requested, and can
2892    also be explicitly deactivated. */    also be explicitly deactivated. Furthermore, we have to disable when
2893      restarting after a partial match, because the required character may have
2894      already been matched. */
2895    
2896    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2897        req_byte >= 0 &&        req_byte >= 0 &&
2898        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2899        (options & PCRE_PARTIAL) == 0)        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2900      {      {
2901      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2902    
# Line 2916  for (;;) Line 2970  for (;;)
2970    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
2971    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2972    
2973    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
2974        current_subject < end_subject &&        current_subject < end_subject &&
2975        *current_subject == '\n' &&        *current_subject == CHAR_NL &&
2976        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2977          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
2978           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.389  
changed lines
  Added in v.428

  ViewVC Help
Powered by ViewVC 1.1.5