/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 1364 by ph10, Sat Oct 5 15:45:11 2013 UTC | revision 1430 by ph10, Wed Jan 1 17:11:54 2014 UTC
# Line 151  static const pcre_uint8 coptable[] = { Line 151  static const pcre_uint8 coptable[] = {
151    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
152    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
154      0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
155    0,                             /* CLASS                                  */    0,                             /* CLASS                                  */
156    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
157    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
# Line 173  static const pcre_uint8 coptable[] = { Line 174  static const pcre_uint8 coptable[] = {
174    0, 0,                          /* ONCE, ONCE_NC                          */    0, 0,                          /* ONCE, ONCE_NC                          */
175    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
176    0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */    0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
177    0, 0,                          /* CREF, NCREF                            */    0, 0,                          /* CREF, DNCREF                           */
178    0, 0,                          /* RREF, NRREF                            */    0, 0,                          /* RREF, DNRREF                           */
179    0,                             /* DEF                                    */    0,                             /* DEF                                    */
180    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
181    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
# Line 222  static const pcre_uint8 poptable[] = { Line 223  static const pcre_uint8 poptable[] = {
223    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
224    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
225    1, 1,                          /* CRRANGE, CRMINRANGE                    */    1, 1,                          /* CRRANGE, CRMINRANGE                    */
226      1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
227    1,                             /* CLASS                                  */    1,                             /* CLASS                                  */
228    1,                             /* NCLASS                                 */    1,                             /* NCLASS                                 */
229    1,                             /* XCLASS - variable length               */    1,                             /* XCLASS - variable length               */
# Line 244  static const pcre_uint8 poptable[] = { Line 246  static const pcre_uint8 poptable[] = {
246    0, 0,                          /* ONCE, ONCE_NC                          */    0, 0,                          /* ONCE, ONCE_NC                          */
247    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
248    0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */    0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
249    0, 0,                          /* CREF, NCREF                            */    0, 0,                          /* CREF, DNCREF                           */
250    0, 0,                          /* RREF, NRREF                            */    0, 0,                          /* RREF, DNRREF                           */
251    0,                             /* DEF                                    */    0,                             /* DEF                                    */
252    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
253    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
# Line 1101  for (;;) Line 1103  for (;;)
1103            /* Perl space used to exclude VT, but from Perl 5.18 it is included,            /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1104            which means that Perl space and POSIX space are now identical. PCRE            which means that Perl space and POSIX space are now identical. PCRE
1105            was changed at release 8.34. */            was changed at release 8.34. */
1106    
1107            case PT_SPACE:    /* Perl space */            case PT_SPACE:    /* Perl space */
1108            case PT_PXSPACE:  /* POSIX space */            case PT_PXSPACE:  /* POSIX space */
1109            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||            switch(c)
1110                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||              {
1111                 c == CHAR_FF || c == CHAR_CR;              HSPACE_CASES:
1112                VSPACE_CASES:
1113                OK = TRUE;
1114                break;
1115    
1116                default:
1117                OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1118                break;
1119                }
1120            break;            break;
1121    
1122            case PT_WORD:            case PT_WORD:
# Line 1351  for (;;) Line 1361  for (;;)
1361            /* Perl space used to exclude VT, but from Perl 5.18 it is included,            /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1362            which means that Perl space and POSIX space are now identical. PCRE            which means that Perl space and POSIX space are now identical. PCRE
1363            was changed at release 8.34. */            was changed at release 8.34. */
1364    
1365            case PT_SPACE:    /* Perl space */            case PT_SPACE:    /* Perl space */
1366            case PT_PXSPACE:  /* POSIX space */            case PT_PXSPACE:  /* POSIX space */
1367            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||            switch(c)
1368                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||              {
1369                 c == CHAR_FF || c == CHAR_CR;              HSPACE_CASES:
1370                VSPACE_CASES:
1371                OK = TRUE;
1372                break;
1373    
1374                default:
1375                OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1376                break;
1377                }
1378            break;            break;
1379    
1380            case PT_WORD:            case PT_WORD:
# Line 1455  for (;;) Line 1473  for (;;)
1473            goto ANYNL01;            goto ANYNL01;
1474    
1475            case CHAR_CR:            case CHAR_CR:
1476            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
1477            /* Fall through */            /* Fall through */
1478    
1479            ANYNL01:            ANYNL01:
# Line 1595  for (;;) Line 1613  for (;;)
1613            /* Perl space used to exclude VT, but from Perl 5.18 it is included,            /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1614            which means that Perl space and POSIX space are now identical. PCRE            which means that Perl space and POSIX space are now identical. PCRE
1615            was changed at release 8.34. */            was changed at release 8.34. */
1616    
1617            case PT_SPACE:    /* Perl space */            case PT_SPACE:    /* Perl space */
1618            case PT_PXSPACE:  /* POSIX space */            case PT_PXSPACE:  /* POSIX space */
1619            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||            switch(c)
1620                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||              {
1621                 c == CHAR_FF || c == CHAR_CR;              HSPACE_CASES:
1622                VSPACE_CASES:
1623                OK = TRUE;
1624                break;
1625    
1626                default:
1627                OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1628                break;
1629                }
1630            break;            break;
1631    
1632            case PT_WORD:            case PT_WORD:
# Line 1716  for (;;) Line 1742  for (;;)
1742            goto ANYNL02;            goto ANYNL02;
1743    
1744            case CHAR_CR:            case CHAR_CR:
1745            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
1746            /* Fall through */            /* Fall through */
1747    
1748            ANYNL02:            ANYNL02:
# Line 1864  for (;;) Line 1890  for (;;)
1890            /* Perl space used to exclude VT, but from Perl 5.18 it is included,            /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1891            which means that Perl space and POSIX space are now identical. PCRE            which means that Perl space and POSIX space are now identical. PCRE
1892            was changed at release 8.34. */            was changed at release 8.34. */
1893    
1894            case PT_SPACE:    /* Perl space */            case PT_SPACE:    /* Perl space */
1895            case PT_PXSPACE:  /* POSIX space */            case PT_PXSPACE:  /* POSIX space */
1896            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||            switch(c)
1897                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||              {
1898                 c == CHAR_FF || c == CHAR_CR;              HSPACE_CASES:
1899                VSPACE_CASES:
1900                OK = TRUE;
1901                break;
1902    
1903                default:
1904                OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1905                break;
1906                }
1907            break;            break;
1908    
1909            case PT_WORD:            case PT_WORD:
# Line 1978  for (;;) Line 2012  for (;;)
2012            goto ANYNL03;            goto ANYNL03;
2013    
2014            case CHAR_CR:            case CHAR_CR:
2015            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
2016            /* Fall through */            /* Fall through */
2017    
2018            ANYNL03:            ANYNL03:
# Line 2176  for (;;) Line 2210  for (;;)
2210            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2211              reset_could_continue = TRUE;              reset_could_continue = TRUE;
2212            }            }
2213          else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)          else if (ptr[1] == CHAR_LF)
2214            {            {
2215            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2216            }            }
# Line 2537  for (;;) Line 2571  for (;;)
2571            {            {
2572            case OP_CRSTAR:            case OP_CRSTAR:
2573            case OP_CRMINSTAR:            case OP_CRMINSTAR:
2574              case OP_CRPOSSTAR:
2575            ADD_ACTIVE(next_state_offset + 1, 0);            ADD_ACTIVE(next_state_offset + 1, 0);
2576            if (isinclass) { ADD_NEW(state_offset, 0); }            if (isinclass)
2577                {
2578                if (*ecode == OP_CRPOSSTAR)
2579                  {
2580                  active_count--;           /* Remove non-match possibility */
2581                  next_active_state--;
2582                  }
2583                ADD_NEW(state_offset, 0);
2584                }
2585            break;            break;
2586    
2587            case OP_CRPLUS:            case OP_CRPLUS:
2588            case OP_CRMINPLUS:            case OP_CRMINPLUS:
2589              case OP_CRPOSPLUS:
2590            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2591            if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }            if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2592            if (isinclass) { count++; ADD_NEW(state_offset, count); }            if (isinclass)
2593                {
2594                if (count > 0 && *ecode == OP_CRPOSPLUS)
2595                  {
2596                  active_count--;           /* Remove non-match possibility */
2597                  next_active_state--;
2598                  }
2599                count++;
2600                ADD_NEW(state_offset, count);
2601                }
2602            break;            break;
2603    
2604            case OP_CRQUERY:            case OP_CRQUERY:
2605            case OP_CRMINQUERY:            case OP_CRMINQUERY:
2606              case OP_CRPOSQUERY:
2607            ADD_ACTIVE(next_state_offset + 1, 0);            ADD_ACTIVE(next_state_offset + 1, 0);
2608            if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }            if (isinclass)
2609                {
2610                if (*ecode == OP_CRPOSQUERY)
2611                  {
2612                  active_count--;           /* Remove non-match possibility */
2613                  next_active_state--;
2614                  }
2615                ADD_NEW(next_state_offset + 1, 0);
2616                }
2617            break;            break;
2618    
2619            case OP_CRRANGE:            case OP_CRRANGE:
2620            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2621              case OP_CRPOSRANGE:
2622            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2623            if (count >= (int)GET2(ecode, 1))            if (count >= (int)GET2(ecode, 1))
2624              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2625            if (isinclass)            if (isinclass)
2626              {              {
2627              int max = (int)GET2(ecode, 1 + IMM2_SIZE);              int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2628                if (*ecode == OP_CRPOSRANGE)
2629                  {
2630                  active_count--;           /* Remove non-match possibility */
2631                  next_active_state--;
2632                  }
2633              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2634                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2635              else              else
# Line 2661  for (;;) Line 2729  for (;;)
2729    
2730          condcode = code[LINK_SIZE+1];          condcode = code[LINK_SIZE+1];
2731    
2732          /* Back reference conditions are not supported */          /* Back reference conditions and duplicate named recursion conditions
2733            are not supported */
2734    
2735          if (condcode == OP_CREF || condcode == OP_NCREF)          if (condcode == OP_CREF || condcode == OP_DNCREF ||
2736                condcode == OP_DNRREF)
2737            return PCRE_ERROR_DFA_UCOND;            return PCRE_ERROR_DFA_UCOND;
2738    
2739          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
# Line 2675  for (;;) Line 2745  for (;;)
2745          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2746          recursed groups. */          recursed groups. */
2747    
2748          else if (condcode == OP_RREF || condcode == OP_NRREF)          else if (condcode == OP_RREF)
2749            {            {
2750            int value = GET2(code, LINK_SIZE + 2);            int value = GET2(code, LINK_SIZE + 2);
2751            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
# Line 3396  for (;;) Line 3466  for (;;)
3466    
3467      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3468        {        {
3469        /* Advance to a known first char. */        /* Advance to a known first pcre_uchar (i.e. data item) */
3470    
3471        if (has_first_char)        if (has_first_char)
3472          {          {
# Line 3404  for (;;) Line 3474  for (;;)
3474            {            {
3475            pcre_uchar csc;            pcre_uchar csc;
3476            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3477                   (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)                   (csc = *current_subject) != first_char && csc != first_char2)
3478              current_subject++;              current_subject++;
3479            }            }
3480          else          else
3481            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3482                   RAWUCHARTEST(current_subject) != first_char)                   *current_subject != first_char)
3483              current_subject++;              current_subject++;
3484          }          }
3485    
# Line 3439  for (;;) Line 3509  for (;;)
3509            ANYCRLF, and we are now at a LF, advance the match position by one            ANYCRLF, and we are now at a LF, advance the match position by one
3510            more character. */            more character. */
3511    
3512            if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&            if (current_subject[-1] == CHAR_CR &&
3513                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3514                 current_subject < end_subject &&                 current_subject < end_subject && *current_subject == CHAR_NL)
                RAWUCHARTEST(current_subject) == CHAR_NL)  
3515              current_subject++;              current_subject++;
3516            }            }
3517          }          }
3518    
3519        /* Or to a non-unique first char after study */        /* Advance to a non-unique first pcre_uchar after study */
3520    
3521        else if (start_bits != NULL)        else if (start_bits != NULL)
3522          {          {
3523          while (current_subject < end_subject)          while (current_subject < end_subject)
3524            {            {
3525            register pcre_uint32 c = RAWUCHARTEST(current_subject);            register pcre_uint32 c = *current_subject;
3526  #ifndef COMPILE_PCRE8  #ifndef COMPILE_PCRE8
3527            if (c > 255) c = 255;            if (c > 255) c = 255;
3528  #endif  #endif
3529            if ((start_bits[c/8] & (1 << (c&7))) == 0)            if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3530              {            current_subject++;
             current_subject++;  
 #if defined SUPPORT_UTF && defined COMPILE_PCRE8  
             /* In non 8-bit mode, the iteration will stop for  
             characters > 255 at the beginning or not stop at all. */  
             if (utf)  
               ACROSSCHAR(current_subject < end_subject, *current_subject,  
                 current_subject++);  
 #endif  
             }  
           else break;  
3531            }            }
3532          }          }
3533        }        }
# Line 3487  for (;;) Line 3546  for (;;)
3546        /* If the pattern was studied, a minimum subject length may be set. This        /* If the pattern was studied, a minimum subject length may be set. This
3547        is a lower bound; no actual string of that length may actually match the        is a lower bound; no actual string of that length may actually match the
3548        pattern. Although the value is, strictly, in characters, we treat it as        pattern. Although the value is, strictly, in characters, we treat it as
3549        bytes to avoid spending too much time in this optimization. */        in pcre_uchar units to avoid spending too much time in this optimization.
3550          */
3551    
3552        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3553            (pcre_uint32)(end_subject - current_subject) < study->minlength)            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3554          return PCRE_ERROR_NOMATCH;          return PCRE_ERROR_NOMATCH;
3555    
3556        /* If req_char is set, we know that that character must appear in the        /* If req_char is set, we know that that pcre_uchar must appear in the
3557        subject for the match to succeed. If the first character is set, req_char        subject for the match to succeed. If the first pcre_uchar is set,
3558        must be later in the subject; otherwise the test starts at the match        req_char must be later in the subject; otherwise the test starts at the
3559        point. This optimization can save a huge amount of work in patterns with        match point. This optimization can save a huge amount of work in patterns
3560        nested unlimited repeats that aren't going to match. Writing separate        with nested unlimited repeats that aren't going to match. Writing
3561        code for cased/caseless versions makes it go faster, as does using an        separate code for cased/caseless versions makes it go faster, as does
3562        autoincrement and backing off on a match.        using an autoincrement and backing off on a match.
3563    
3564        HOWEVER: when the subject string is very, very long, searching to its end        HOWEVER: when the subject string is very, very long, searching to its end
3565        can take a long time, and give bad performance on quite ordinary        can take a long time, and give bad performance on quite ordinary
# Line 3519  for (;;) Line 3579  for (;;)
3579              {              {
3580              while (p < end_subject)              while (p < end_subject)
3581                {                {
3582                register pcre_uint32 pp = RAWUCHARINCTEST(p);                register pcre_uint32 pp = *p++;
3583                if (pp == req_char || pp == req_char2) { p--; break; }                if (pp == req_char || pp == req_char2) { p--; break; }
3584                }                }
3585              }              }
# Line 3527  for (;;) Line 3587  for (;;)
3587              {              {
3588              while (p < end_subject)              while (p < end_subject)
3589                {                {
3590                if (RAWUCHARINCTEST(p) == req_char) { p--; break; }                if (*p++ == req_char) { p--; break; }
3591                }                }
3592              }              }
3593    
3594            /* If we can't find the required character, break the matching loop,            /* If we can't find the required pcre_uchar, break the matching loop,
3595            which will cause a return or PCRE_ERROR_NOMATCH. */            which will cause a return or PCRE_ERROR_NOMATCH. */
3596    
3597            if (p >= end_subject) break;            if (p >= end_subject) break;
3598    
3599            /* If we have found the required character, save the point where we            /* If we have found the required pcre_uchar, save the point where we
3600            found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3601            the start hasn't passed this character yet. */            the start hasn't passed this point yet. */
3602    
3603            req_char_ptr = p;            req_char_ptr = p;
3604            }            }
# Line 3595  for (;;) Line 3655  for (;;)
3655    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
3656    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
3657    
3658    if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&    if (current_subject[-1] == CHAR_CR &&
3659        current_subject < end_subject &&        current_subject < end_subject &&
3660        RAWUCHARTEST(current_subject) == CHAR_NL &&        *current_subject == CHAR_NL &&
3661        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
3662          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
3663           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.1364  
changed lines
  Added in v.1430

  ViewVC Help
Powered by ViewVC 1.1.5