/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 427 by ph10, Fri Aug 28 09:55:54 2009 UTC revision 439 by ph10, Tue Sep 8 17:27:24 2009 UTC
# Line 45  FSM). This is NOT Perl- compatible, but Line 45  FSM). This is NOT Perl- compatible, but
45  applications. */  applications. */
46    
47    
48    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49    the performance of his patterns greatly. I could not use it as it stood, as it
50    was not thread safe, and made assumptions about pattern sizes. Also, it caused
51    test 7 to loop, and test 9 to crash with a segfault.
52    
53    The issue is the check for duplicate states, which is done by a simple linear
54    search up the state list. (Grep for "duplicate" below to find the code.) For
55    many patterns, there will never be many states active at one time, so a simple
56    linear search is fine. In patterns that have many active states, it might be a
57    bottleneck. The suggested code used an indexing scheme to remember which states
58    had previously been used for each character, and avoided the linear search when
59    it knew there was no chance of a duplicate. This was implemented when adding
60    states to the state lists.
61    
62    I wrote some thread-safe, not-limited code to try something similar at the time
63    of checking for duplicates (instead of when adding states), using index vectors
64    on the stack. It did give a 13% improvement with one specially constructed
65    pattern for certain subject strings, but on other strings and on many of the
66    simpler patterns in the test suite it did worse. The major problem, I think,
67    was the extra time to initialize the index. This had to be done for each call
68    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69    only once - I suspect this was the cause of the problems with the tests.)
70    
71    Overall, I concluded that the gains in some cases did not outweigh the losses
72    in others, so I abandoned this code. */
73    
74    
75    
76  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
77  #include "config.h"  #include "config.h"
78  #endif  #endif
# Line 389  if (*first_op == OP_REVERSE) Line 417  if (*first_op == OP_REVERSE)
417        current_subject - start_subject : max_back;        current_subject - start_subject : max_back;
418      current_subject -= gone_back;      current_subject -= gone_back;
419      }      }
420    
421      /* Save the earliest consulted character */
422    
423      if (current_subject < md->start_used_ptr)
424        md->start_used_ptr = current_subject;
425    
426    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
427    
# Line 454  for (;;) Line 487  for (;;)
487    int i, j;    int i, j;
488    int clen, dlen;    int clen, dlen;
489    unsigned int c, d;    unsigned int c, d;
490      int forced_fail = 0;
491      int reached_end = 0;
492    
493    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
494    new state list. */    new state list. */
# Line 543  for (;;) Line 578  for (;;)
578          }          }
579        }        }
580    
581      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
582        See the note at the head of this module about the possibility of improving
583        performance here. */
584    
585      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
586        {        {
# Line 624  for (;;) Line 661  for (;;)
661            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
662            }            }
663          }          }
664        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
665          {          {
666          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          reached_end++;    /* Count branches that reach the end */
667            else if (match_count > 0 && ++match_count * 2 >= offsetcount)          if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
668              match_count = 0;            {
669          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
670          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));              else if (match_count > 0 && ++match_count * 2 >= offsetcount)
671          if (offsetcount >= 2)                match_count = 0;
672            {            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
673            offsets[0] = current_subject - start_subject;            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
674            offsets[1] = ptr - start_subject;            if (offsetcount >= 2)
675            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              {
676              offsets[1] - offsets[0], current_subject));              offsets[0] = current_subject - start_subject;
677            }              offsets[1] = ptr - start_subject;
678          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
679            {                offsets[1] - offsets[0], current_subject));
680            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              }
681              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
682              match_count, rlevel*2-2, SP));              {
683            return match_count;              DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
684            }                "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
685                  match_count, rlevel*2-2, SP));
686                return match_count;
687                }
688              }
689          }          }
690        break;        break;
691    
# Line 794  for (;;) Line 835  for (;;)
835          if (ptr > start_subject)          if (ptr > start_subject)
836            {            {
837            const uschar *temp = ptr - 1;            const uschar *temp = ptr - 1;
838              if (temp < md->start_used_ptr) md->start_used_ptr = temp;
839  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
840            if (utf8) BACKCHAR(temp);            if (utf8) BACKCHAR(temp);
841  #endif  #endif
# Line 802  for (;;) Line 844  for (;;)
844            }            }
845          else left_word = 0;          else left_word = 0;
846    
847          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
848            else right_word = 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
849            else              /* This is a fudge to ensure that if this is the */
850              {               /* last item in the pattern, we don't count it as */
851              reached_end--;  /* reached, thus disabling a partial match. */
852              right_word = 0;
853              }
854    
855          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
856            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 2162  for (;;) Line 2209  for (;;)
2209        though the other "backtracking verbs" are not supported. */        though the other "backtracking verbs" are not supported. */
2210    
2211        case OP_FAIL:        case OP_FAIL:
2212          forced_fail++;    /* Count FAILs for multiple states */
2213        break;        break;
2214    
2215        case OP_ASSERT:        case OP_ASSERT:
# Line 2469  for (;;) Line 2517  for (;;)
2517    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2518    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2519    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2520    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions. The "forced_
2521      fail" variable counts the number of (*F) encountered for the character. If it
2522      is equal to the original active_count (saved in workspace[1]) it means that
2523      (*F) was found on every active state. In this case we don't want to give a
2524      partial match. */
2525    
2526    if (new_count <= 0)    if (new_count <= 0)
2527      {      {
2528      if (rlevel == 1 &&                               /* Top level, and */      if (rlevel == 1 &&                               /* Top level, and */
2529            reached_end != workspace[1] &&               /* Not all reached end */
2530            forced_fail != workspace[1] &&               /* Not all forced fail & */
2531          (                                            /* either... */          (                                            /* either... */
2532          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2533          ||                                           /* or... */          ||                                           /* or... */
# Line 2485  for (;;) Line 2539  for (;;)
2539        {        {
2540        if (offsetcount >= 2)        if (offsetcount >= 2)
2541          {          {
2542          offsets[0] = current_subject - start_subject;          offsets[0] = md->start_used_ptr - start_subject;
2543          offsets[1] = end_subject - start_subject;          offsets[1] = end_subject - start_subject;
2544          }          }
2545        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
# Line 2871  for (;;) Line 2925  for (;;)
2925    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2926    
2927    ALSO: this processing is disabled when partial matching is requested, and can    ALSO: this processing is disabled when partial matching is requested, and can
2928    also be explicitly deactivated. */    also be explicitly deactivated. Furthermore, we have to disable when
2929      restarting after a partial match, because the required character may have
2930      already been matched. */
2931    
2932    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2933        req_byte >= 0 &&        req_byte >= 0 &&
2934        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2935        (options & PCRE_PARTIAL) == 0)        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2936      {      {
2937      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2938    
# Line 2916  for (;;) Line 2972  for (;;)
2972    
2973    /* OK, now we can do the business */    /* OK, now we can do the business */
2974    
2975      md->start_used_ptr = current_subject;
2976    
2977    rc = internal_dfa_exec(    rc = internal_dfa_exec(
2978      md,                                /* fixed match data */      md,                                /* fixed match data */
2979      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */

Legend:
Removed from v.427  
changed lines
  Added in v.439

  ViewVC Help
Powered by ViewVC 1.1.5