/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 459 by ph10, Sun Oct 4 09:21:39 2009 UTC revision 461 by ph10, Mon Oct 5 10:59:35 2009 UTC
# Line 45  FSM). This is NOT Perl-compatible, but Line 45  FSM). This is NOT Perl-compatible, but
45  applications. */  applications. */
46    
47    
48  /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved  /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49  the performance of his patterns greatly. I could not use it as it stood, as it  the performance of his patterns greatly. I could not use it as it stood, as it
50  was not thread safe, and made assumptions about pattern sizes. Also, it caused  was not thread safe, and made assumptions about pattern sizes. Also, it caused
51  test 7 to loop, and test 9 to crash with a segfault.  test 7 to loop, and test 9 to crash with a segfault.
52    
53  The issue is the check for duplicate states, which is done by a simple linear  The issue is the check for duplicate states, which is done by a simple linear
# Line 68  was the extra time to initialize the ind Line 68  was the extra time to initialize the ind
68  of internal_dfa_exec(). (The supplied patch used a static vector, initialized  of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69  only once - I suspect this was the cause of the problems with the tests.)  only once - I suspect this was the cause of the problems with the tests.)
70    
71  Overall, I concluded that the gains in some cases did not outweigh the losses  Overall, I concluded that the gains in some cases did not outweigh the losses
72  in others, so I abandoned this code. */  in others, so I abandoned this code. */
73    
74    
# Line 417  if (*first_op == OP_REVERSE) Line 417  if (*first_op == OP_REVERSE)
417        current_subject - start_subject : max_back;        current_subject - start_subject : max_back;
418      current_subject -= gone_back;      current_subject -= gone_back;
419      }      }
420    
421    /* Save the earliest consulted character */    /* Save the earliest consulted character */
422    
423    if (current_subject < md->start_used_ptr)    if (current_subject < md->start_used_ptr)
424      md->start_used_ptr = current_subject;      md->start_used_ptr = current_subject;
425    
426    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
427    
# Line 488  for (;;) Line 488  for (;;)
488    int clen, dlen;    int clen, dlen;
489    unsigned int c, d;    unsigned int c, d;
490    int forced_fail = 0;    int forced_fail = 0;
491    int reached_end = 0;    int reached_end = 0;
492    
493    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
494    new state list. */    new state list. */
# Line 578  for (;;) Line 578  for (;;)
578          }          }
579        }        }
580    
581      /* Check for a duplicate state with the same count, and skip if found.      /* Check for a duplicate state with the same count, and skip if found.
582      See the note at the head of this module about the possibility of improving      See the note at the head of this module about the possibility of improving
583      performance here. */      performance here. */
584    
# Line 647  for (;;) Line 647  for (;;)
647  /* ========================================================================== */  /* ========================================================================== */
648        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
649        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. Otherwise, unless we have an empty string and
650        PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the        PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
651        start of the subject, save the match data, shifting up all previous        start of the subject, save the match data, shifting up all previous
652        matches so we always have the longest first. */        matches so we always have the longest first. */
653    
# Line 662  for (;;) Line 662  for (;;)
662            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
663            }            }
664          }          }
665        else        else
666          {          {
667          reached_end++;    /* Count branches that reach the end */          reached_end++;    /* Count branches that reach the end */
668          if (ptr > current_subject ||          if (ptr > current_subject ||
669              ((md->moptions & PCRE_NOTEMPTY) == 0 &&              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
670                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
671                  current_subject > start_subject + md->start_offset)))                  current_subject > start_subject + md->start_offset)))
# Line 689  for (;;) Line 689  for (;;)
689                match_count, rlevel*2-2, SP));                match_count, rlevel*2-2, SP));
690              return match_count;              return match_count;
691              }              }
692            }            }
693          }          }
694        break;        break;
695    
# Line 839  for (;;) Line 839  for (;;)
839          if (ptr > start_subject)          if (ptr > start_subject)
840            {            {
841            const uschar *temp = ptr - 1;            const uschar *temp = ptr - 1;
842            if (temp < md->start_used_ptr) md->start_used_ptr = temp;            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
843  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
844            if (utf8) BACKCHAR(temp);            if (utf8) BACKCHAR(temp);
845  #endif  #endif
# Line 848  for (;;) Line 848  for (;;)
848            }            }
849          else left_word = 0;          else left_word = 0;
850    
851          if (clen > 0)          if (clen > 0)
852            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
853          else              /* This is a fudge to ensure that if this is the */          else              /* This is a fudge to ensure that if this is the */
854            {               /* last item in the pattern, we don't count it as */            {               /* last item in the pattern, we don't count it as */
855            reached_end--;  /* reached, thus disabling a partial match. */            reached_end--;  /* reached, thus disabling a partial match. */
856            right_word = 0;            right_word = 0;
857            }            }
858    
859          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
860            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 2287  for (;;) Line 2287  for (;;)
2287    
2288          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2289    
2290          if (condcode == OP_CREF || condcode == OP_NCREF)          if (condcode == OP_CREF || condcode == OP_NCREF)
2291            return PCRE_ERROR_DFA_UCOND;            return PCRE_ERROR_DFA_UCOND;
2292    
2293          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
# Line 2531  for (;;) Line 2531  for (;;)
2531    if (new_count <= 0)    if (new_count <= 0)
2532      {      {
2533      if (rlevel == 1 &&                               /* Top level, and */      if (rlevel == 1 &&                               /* Top level, and */
2534          reached_end != workspace[1] &&               /* Not all reached end */          reached_end != workspace[1] &&               /* Not all reached end */
2535          forced_fail != workspace[1] &&               /* Not all forced fail & */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2536          (                                            /* either... */          (                                            /* either... */
2537          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
# Line 2652  if (extra_data != NULL) Line 2652  if (extra_data != NULL)
2652    if ((flags & PCRE_EXTRA_TABLES) != 0)    if ((flags & PCRE_EXTRA_TABLES) != 0)
2653      md->tables = extra_data->tables;      md->tables = extra_data->tables;
2654    }    }
2655    
2656  /* Check that the first field in the block is the magic number. If it is not,  /* Check that the first field in the block is the magic number. If it is not,
2657  test for a regex that was compiled on a host of opposite endianness. If this is  test for a regex that was compiled on a host of opposite endianness. If this is
2658  the case, flipped values are put in internal_re and internal_study if there was  the case, flipped values are put in internal_re and internal_study if there was
# Line 2914  for (;;) Line 2914  for (;;)
2914    
2915      end_subject = save_end_subject;      end_subject = save_end_subject;
2916    
2917      /* The following two optimizations are disabled for partial matching or if      /* The following two optimizations are disabled for partial matching or if
2918      disabling is explicitly requested (and of course, by the test above, this      disabling is explicitly requested (and of course, by the test above, this
2919      code is not obeyed when restarting after a partial match). */      code is not obeyed when restarting after a partial match). */
2920    
2921      if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&      if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2922          (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)          (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
2923        {        {
2924        /* If the pattern was studied, a minimum subject length may be set. This        /* If the pattern was studied, a minimum subject length may be set. This
2925        is a lower bound; it is possible that no string of that length matches the        is a lower bound; it is possible that no string of that length matches the
2926        pattern. Although the value is, strictly, in characters, we treat it as        pattern. Although the value is, strictly, in characters, we treat it as
# Line 2929  for (;;) Line 2929  for (;;)
2929        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
2930            end_subject - current_subject < study->minlength)            end_subject - current_subject < study->minlength)
2931          return PCRE_ERROR_NOMATCH;          return PCRE_ERROR_NOMATCH;
2932    
2933        /* If req_byte is set, we know that that character must appear in the        /* If req_byte is set, we know that that character must appear in the
2934        subject for the match to succeed. If the first character is set, req_byte        subject for the match to succeed. If the first character is set, req_byte
2935        must be later in the subject; otherwise the test starts at the match        must be later in the subject; otherwise the test starts at the match
# Line 2937  for (;;) Line 2937  for (;;)
2937        nested unlimited repeats that aren't going to match. Writing separate        nested unlimited repeats that aren't going to match. Writing separate
2938        code for cased/caseless versions makes it go faster, as does using an        code for cased/caseless versions makes it go faster, as does using an
2939        autoincrement and backing off on a match.        autoincrement and backing off on a match.
2940    
2941        HOWEVER: when the subject string is very, very long, searching to its end        HOWEVER: when the subject string is very, very long, searching to its end
2942        can take a long time, and give bad performance on quite ordinary        can take a long time, and give bad performance on quite ordinary
2943        patterns. This showed up when somebody was matching /^C/ on a 32-megabyte        patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
2944        string... so we don't do this when the string is sufficiently long. */        string... so we don't do this when the string is sufficiently long. */
2945    
2946        if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)        if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
2947          {          {
2948          register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);          register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2949    
2950          /* We don't need to repeat the search if we haven't yet reached the          /* We don't need to repeat the search if we haven't yet reached the
2951          place we found it at last time. */          place we found it at last time. */
2952    
2953          if (p > req_byte_ptr)          if (p > req_byte_ptr)
2954            {            {
2955            if (req_byte_caseless)            if (req_byte_caseless)
# Line 2967  for (;;) Line 2967  for (;;)
2967                if (*p++ == req_byte) { p--; break; }                if (*p++ == req_byte) { p--; break; }
2968                }                }
2969              }              }
2970    
2971            /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
2972            which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
2973    
2974            if (p >= end_subject) break;            if (p >= end_subject) break;
2975    
2976            /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
2977            found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
2978            the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
2979    
2980            req_byte_ptr = p;            req_byte_ptr = p;
2981            }            }
2982          }          }
2983        }        }
2984      }   /* End of optimizations that are done when not restarting */      }   /* End of optimizations that are done when not restarting */
2985    
2986    /* OK, now we can do the business */    /* OK, now we can do the business */
2987    
2988    md->start_used_ptr = current_subject;    md->start_used_ptr = current_subject;
2989    
2990    rc = internal_dfa_exec(    rc = internal_dfa_exec(
2991      md,                                /* fixed match data */      md,                                /* fixed match data */
2992      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */

Legend:
Removed from v.459  
changed lines
  Added in v.461

  ViewVC Help
Powered by ViewVC 1.1.5