/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 925 by ph10, Wed Feb 22 14:24:56 2012 UTC revision 1041 by ph10, Sun Sep 16 10:16:27 2012 UTC
# Line 38  POSSIBILITY OF SUCH DAMAGE. Line 38  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
39  */  */
40    
   
41  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
42  alternative matching function that uses a sort of DFA algorithm (not a true  alternative matching function that uses a sort of DFA algorithm (not a true
43  FSM). This is NOT Perl- compatible, but it has advantages in certain  FSM). This is NOT Perl-compatible, but it has advantages in certain
44  applications. */  applications. */
45    
46    
# Line 282  typedef struct stateblock { Line 281  typedef struct stateblock {
281    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
282  } stateblock;  } stateblock;
283    
284  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
285    
286    
287  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 382  for the current character, one for the f Line 381  for the current character, one for the f
381      next_new_state->count  = (y); \      next_new_state->count  = (y); \
382      next_new_state->data   = (z); \      next_new_state->data   = (z); \
383      next_new_state++; \      next_new_state++; \
384      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
385          (x), (y), (z), __LINE__)); \
386      } \      } \
387    else return PCRE_ERROR_DFA_WSSIZE    else return PCRE_ERROR_DFA_WSSIZE
388    
# Line 573  for (;;) Line 573  for (;;)
573    int clen, dlen;    int clen, dlen;
574    unsigned int c, d;    unsigned int c, d;
575    int forced_fail = 0;    int forced_fail = 0;
576    BOOL partial_newline = FALSE;    BOOL partial_newline = FALSE;
577    BOOL could_continue = reset_could_continue;    BOOL could_continue = reset_could_continue;
578    reset_could_continue = FALSE;    reset_could_continue = FALSE;
579    
580    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
581    new state list. */    new state list. */
582    
# Line 611  for (;;) Line 611  for (;;)
611    
612    if (ptr < end_subject)    if (ptr < end_subject)
613      {      {
614      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of data items in the character */
615  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
616      if (utf) { GETCHARLEN(c, ptr, clen); } else      if (utf) { GETCHARLEN(c, ptr, clen); } else
617  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
# Line 645  for (;;) Line 645  for (;;)
645    
646      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
647      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
648      been skipped". If the could_continue flag was passed over from a previous      been skipped". If the could_continue flag was passed over from a previous
649      state, arrange for it to be passed on. */      state, arrange for it to be passed on. */
650    
651      if (state_offset < 0)      if (state_offset < 0)
# Line 695  for (;;) Line 695  for (;;)
695      permitted.      permitted.
696    
697      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
698      argument that is not a data character - but is always one byte long because      argument that is not a data character - but is always one byte long because
699      the values are small. We have to take special action to deal with  \P, \p,      the values are small. We have to take special action to deal with  \P, \p,
700      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
701      these ones to new opcodes. */      these ones to new opcodes. */
# Line 789  for (;;) Line 789  for (;;)
789              offsets[0] = (int)(current_subject - start_subject);              offsets[0] = (int)(current_subject - start_subject);
790              offsets[1] = (int)(ptr - start_subject);              offsets[1] = (int)(ptr - start_subject);
791              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
792                offsets[1] - offsets[0], current_subject));                offsets[1] - offsets[0], (char *)current_subject));
793              }              }
794            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
795              {              {
# Line 894  for (;;) Line 894  for (;;)
894        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
895        case OP_ANY:        case OP_ANY:
896        if (clen > 0 && !IS_NEWLINE(ptr))        if (clen > 0 && !IS_NEWLINE(ptr))
897          {          {
898          if (ptr + 1 >= md->end_subject &&          if (ptr + 1 >= md->end_subject &&
899              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
900              NLBLOCK->nltype == NLTYPE_FIXED &&              NLBLOCK->nltype == NLTYPE_FIXED &&
901              NLBLOCK->nllen == 2 &&              NLBLOCK->nllen == 2 &&
902              c == NLBLOCK->nl[0])              c == NLBLOCK->nl[0])
903            {            {
904            could_continue = partial_newline = TRUE;            could_continue = partial_newline = TRUE;
905            }            }
906          else          else
907            {            {
908            ADD_NEW(state_offset + 1, 0);            ADD_NEW(state_offset + 1, 0);
909            }            }
910          }          }
911        break;        break;
912    
# Line 938  for (;;) Line 938  for (;;)
938          else if (ptr + 1 >= md->end_subject &&          else if (ptr + 1 >= md->end_subject &&
939                   (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&                   (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
940                   NLBLOCK->nltype == NLTYPE_FIXED &&                   NLBLOCK->nltype == NLTYPE_FIXED &&
941                   NLBLOCK->nllen == 2 &&                   NLBLOCK->nllen == 2 &&
942                   c == NLBLOCK->nl[0])                   c == NLBLOCK->nl[0])
943            {            {
944            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
945              {              {
946              reset_could_continue = TRUE;              reset_could_continue = TRUE;
947              ADD_NEW_DATA(-(state_offset + 1), 0, 1);              ADD_NEW_DATA(-(state_offset + 1), 0, 1);
948              }              }
949            else could_continue = partial_newline = TRUE;            else could_continue = partial_newline = TRUE;
950            }            }
951          }          }
952        break;        break;
953    
# Line 963  for (;;) Line 963  for (;;)
963          else if (ptr + 1 >= md->end_subject &&          else if (ptr + 1 >= md->end_subject &&
964                   (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&                   (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
965                   NLBLOCK->nltype == NLTYPE_FIXED &&                   NLBLOCK->nltype == NLTYPE_FIXED &&
966                   NLBLOCK->nllen == 2 &&                   NLBLOCK->nllen == 2 &&
967                   c == NLBLOCK->nl[0])                   c == NLBLOCK->nl[0])
968            {            {
969            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
970              {              {
971              reset_could_continue = TRUE;              reset_could_continue = TRUE;
972              ADD_NEW_DATA(-(state_offset + 1), 0, 1);              ADD_NEW_DATA(-(state_offset + 1), 0, 1);
973              }              }
974            else could_continue = partial_newline = TRUE;            else could_continue = partial_newline = TRUE;
975            }            }
976          }          }
977        else if (IS_NEWLINE(ptr))        else if (IS_NEWLINE(ptr))
978          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
# Line 1138  for (;;) Line 1138  for (;;)
1138          if (d == OP_ANY && ptr + 1 >= md->end_subject &&          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1139              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1140              NLBLOCK->nltype == NLTYPE_FIXED &&              NLBLOCK->nltype == NLTYPE_FIXED &&
1141              NLBLOCK->nllen == 2 &&              NLBLOCK->nllen == 2 &&
1142              c == NLBLOCK->nl[0])              c == NLBLOCK->nl[0])
1143            {            {
1144            could_continue = partial_newline = TRUE;            could_continue = partial_newline = TRUE;
1145            }            }
1146          else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1147              (c < 256 &&              (c < 256 &&
1148                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
# Line 1169  for (;;) Line 1169  for (;;)
1169          if (d == OP_ANY && ptr + 1 >= md->end_subject &&          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1170              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1171              NLBLOCK->nltype == NLTYPE_FIXED &&              NLBLOCK->nltype == NLTYPE_FIXED &&
1172              NLBLOCK->nllen == 2 &&              NLBLOCK->nllen == 2 &&
1173              c == NLBLOCK->nl[0])              c == NLBLOCK->nl[0])
1174            {            {
1175            could_continue = partial_newline = TRUE;            could_continue = partial_newline = TRUE;
1176            }            }
1177          else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1178              (c < 256 &&              (c < 256 &&
1179                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
# Line 1199  for (;;) Line 1199  for (;;)
1199          if (d == OP_ANY && ptr + 1 >= md->end_subject &&          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1200              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1201              NLBLOCK->nltype == NLTYPE_FIXED &&              NLBLOCK->nltype == NLTYPE_FIXED &&
1202              NLBLOCK->nllen == 2 &&              NLBLOCK->nllen == 2 &&
1203              c == NLBLOCK->nl[0])              c == NLBLOCK->nl[0])
1204            {            {
1205            could_continue = partial_newline = TRUE;            could_continue = partial_newline = TRUE;
1206            }            }
1207          else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208              (c < 256 &&              (c < 256 &&
1209                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
# Line 1227  for (;;) Line 1227  for (;;)
1227          if (d == OP_ANY && ptr + 1 >= md->end_subject &&          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1228              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1229              NLBLOCK->nltype == NLTYPE_FIXED &&              NLBLOCK->nltype == NLTYPE_FIXED &&
1230              NLBLOCK->nllen == 2 &&              NLBLOCK->nllen == 2 &&
1231              c == NLBLOCK->nl[0])              c == NLBLOCK->nl[0])
1232            {            {
1233            could_continue = partial_newline = TRUE;            could_continue = partial_newline = TRUE;
1234            }            }
1235          else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1236              (c < 256 &&              (c < 256 &&
1237                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
# Line 1256  for (;;) Line 1256  for (;;)
1256          if (d == OP_ANY && ptr + 1 >= md->end_subject &&          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1257              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&              (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1258              NLBLOCK->nltype == NLTYPE_FIXED &&              NLBLOCK->nltype == NLTYPE_FIXED &&
1259              NLBLOCK->nllen == 2 &&              NLBLOCK->nllen == 2 &&
1260              c == NLBLOCK->nl[0])              c == NLBLOCK->nl[0])
1261            {            {
1262            could_continue = partial_newline = TRUE;            could_continue = partial_newline = TRUE;
1263            }            }
1264          else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265              (c < 256 &&              (c < 256 &&
1266                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
# Line 1368  for (;;) Line 1368  for (;;)
1368        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1369        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1370        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1371        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1372          {          {
1373            int lgb, rgb;
1374          const pcre_uchar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1375          int ncount = 0;          int ncount = 0;
1376          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
# Line 1377  for (;;) Line 1378  for (;;)
1378            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1379            next_active_state--;            next_active_state--;
1380            }            }
1381            lgb = UCD_GRAPHBREAK(c);
1382          while (nptr < end_subject)          while (nptr < end_subject)
1383            {            {
1384            int nd;            dlen = 1;
1385            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1386            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1387            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1388            ncount++;            ncount++;
1389            nptr += ndlen;            lgb = rgb;
1390              nptr += dlen;
1391            }            }
1392          count++;          count++;
1393          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
# Line 1403  for (;;) Line 1406  for (;;)
1406          int ncount = 0;          int ncount = 0;
1407          switch (c)          switch (c)
1408            {            {
1409            case 0x000b:            case CHAR_VT:
1410            case 0x000c:            case CHAR_FF:
1411            case 0x0085:            case CHAR_NEL:
1412    #ifndef EBCDIC
1413            case 0x2028:            case 0x2028:
1414            case 0x2029:            case 0x2029:
1415    #endif  /* Not EBCDIC */
1416            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1417            goto ANYNL01;            goto ANYNL01;
1418    
1419            case 0x000d:            case CHAR_CR:
1420            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
1421            /* Fall through */            /* Fall through */
1422    
1423            ANYNL01:            ANYNL01:
1424            case 0x000a:            case CHAR_LF:
1425            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1426              {              {
1427              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1443  for (;;) Line 1448  for (;;)
1448          BOOL OK;          BOOL OK;
1449          switch (c)          switch (c)
1450            {            {
1451            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
1452            OK = TRUE;            OK = TRUE;
1453            break;            break;
1454    
# Line 1482  for (;;) Line 1481  for (;;)
1481          BOOL OK;          BOOL OK;
1482          switch (c)          switch (c)
1483            {            {
1484            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
1485            OK = TRUE;            OK = TRUE;
1486            break;            break;
1487    
# Line 1624  for (;;) Line 1605  for (;;)
1605        QS2:        QS2:
1606    
1607        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1608        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1609          {          {
1610            int lgb, rgb;
1611          const pcre_uchar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1612          int ncount = 0;          int ncount = 0;
1613          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
# Line 1634  for (;;) Line 1616  for (;;)
1616            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1617            next_active_state--;            next_active_state--;
1618            }            }
1619            lgb = UCD_GRAPHBREAK(c);
1620          while (nptr < end_subject)          while (nptr < end_subject)
1621            {            {
1622            int nd;            dlen = 1;
1623            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1624            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1625            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1626            ncount++;            ncount++;
1627            nptr += ndlen;            lgb = rgb;
1628              nptr += dlen;
1629            }            }
1630          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1631          }          }
# Line 1667  for (;;) Line 1651  for (;;)
1651          int ncount = 0;          int ncount = 0;
1652          switch (c)          switch (c)
1653            {            {
1654            case 0x000b:            case CHAR_VT:
1655            case 0x000c:            case CHAR_FF:
1656            case 0x0085:            case CHAR_NEL:
1657    #ifndef EBCDIC
1658            case 0x2028:            case 0x2028:
1659            case 0x2029:            case 0x2029:
1660    #endif  /* Not EBCDIC */
1661            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1662            goto ANYNL02;            goto ANYNL02;
1663    
1664            case 0x000d:            case CHAR_CR:
1665            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
1666            /* Fall through */            /* Fall through */
1667    
1668            ANYNL02:            ANYNL02:
1669            case 0x000a:            case CHAR_LF:
1670            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1671                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1672              {              {
# Line 1715  for (;;) Line 1701  for (;;)
1701          BOOL OK;          BOOL OK;
1702          switch (c)          switch (c)
1703            {            {
1704            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
1705            OK = TRUE;            OK = TRUE;
1706            break;            break;
1707    
# Line 1761  for (;;) Line 1741  for (;;)
1741          BOOL OK;          BOOL OK;
1742          switch (c)          switch (c)
1743            {            {
1744            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
1745            OK = TRUE;            OK = TRUE;
1746            break;            break;
1747    
# Line 1891  for (;;) Line 1853  for (;;)
1853        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1854          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1855        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1856        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1857          {          {
1858            int lgb, rgb;
1859          const pcre_uchar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1860          int ncount = 0;          int ncount = 0;
1861          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
# Line 1900  for (;;) Line 1863  for (;;)
1863            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1864            next_active_state--;            next_active_state--;
1865            }            }
1866            lgb = UCD_GRAPHBREAK(c);
1867          while (nptr < end_subject)          while (nptr < end_subject)
1868            {            {
1869            int nd;            dlen = 1;
1870            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1871            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1872            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1873            ncount++;            ncount++;
1874            nptr += ndlen;            lgb = rgb;
1875              nptr += dlen;
1876            }            }
1877          if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)          if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1878              reset_could_continue = TRUE;              reset_could_continue = TRUE;
1879          if (++count >= GET2(code, 1))          if (++count >= GET2(code, 1))
1880            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1881          else          else
# Line 1932  for (;;) Line 1897  for (;;)
1897          int ncount = 0;          int ncount = 0;
1898          switch (c)          switch (c)
1899            {            {
1900            case 0x000b:            case CHAR_VT:
1901            case 0x000c:            case CHAR_FF:
1902            case 0x0085:            case CHAR_NEL:
1903    #ifndef EBCDIC
1904            case 0x2028:            case 0x2028:
1905            case 0x2029:            case 0x2029:
1906    #endif  /* Not EBCDIC */
1907            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1908            goto ANYNL03;            goto ANYNL03;
1909    
1910            case 0x000d:            case CHAR_CR:
1911            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
1912            /* Fall through */            /* Fall through */
1913    
1914            ANYNL03:            ANYNL03:
1915            case 0x000a:            case CHAR_LF:
1916            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1917              {              {
1918              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1976  for (;;) Line 1943  for (;;)
1943          BOOL OK;          BOOL OK;
1944          switch (c)          switch (c)
1945            {            {
1946            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
1947            OK = TRUE;            OK = TRUE;
1948            break;            break;
1949    
# Line 2018  for (;;) Line 1979  for (;;)
1979          BOOL OK;          BOOL OK;
1980          switch (c)          switch (c)
1981            {            {
1982            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
1983            OK = TRUE;            OK = TRUE;
1984            break;            break;
1985    
# Line 2112  for (;;) Line 2055  for (;;)
2055        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2056    
2057        case OP_EXTUNI:        case OP_EXTUNI:
2058        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
2059          {          {
2060            int lgb, rgb;
2061          const pcre_uchar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
2062          int ncount = 0;          int ncount = 0;
2063            lgb = UCD_GRAPHBREAK(c);
2064          while (nptr < end_subject)          while (nptr < end_subject)
2065            {            {
2066            int nclen = 1;            dlen = 1;
2067            GETCHARLEN(c, nptr, nclen);            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2068            if (UCD_CATEGORY(c) != ucp_M) break;            rgb = UCD_GRAPHBREAK(d);
2069              if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2070            ncount++;            ncount++;
2071            nptr += nclen;            lgb = rgb;
2072              nptr += dlen;
2073            }            }
2074          if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)          if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2075              reset_could_continue = TRUE;              reset_could_continue = TRUE;
2076          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2077          }          }
2078        break;        break;
# Line 2139  for (;;) Line 2086  for (;;)
2086        case OP_ANYNL:        case OP_ANYNL:
2087        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2088          {          {
2089          case 0x000b:          case CHAR_VT:
2090          case 0x000c:          case CHAR_FF:
2091          case 0x0085:          case CHAR_NEL:
2092    #ifndef EBCDIC
2093          case 0x2028:          case 0x2028:
2094          case 0x2029:          case 0x2029:
2095    #endif  /* Not EBCDIC */
2096          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2097    
2098          case 0x000a:          case CHAR_LF:
2099          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2100          break;          break;
2101    
2102          case 0x000d:          case CHAR_CR:
2103          if (ptr + 1 >= end_subject)          if (ptr + 1 >= end_subject)
2104            {            {
2105            ADD_NEW(state_offset + 1, 0);            ADD_NEW(state_offset + 1, 0);
2106            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2107              reset_could_continue = TRUE;              reset_could_continue = TRUE;
2108            }            }
2109          else if (ptr[1] == 0x0a)          else if (ptr[1] == CHAR_LF)
2110            {            {
2111            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2112            }            }
2113          else          else
2114            {            {
2115            ADD_NEW(state_offset + 1, 0);            ADD_NEW(state_offset + 1, 0);
2116            }            }
2117          break;          break;
2118          }          }
2119        break;        break;
# Line 2173  for (;;) Line 2122  for (;;)
2122        case OP_NOT_VSPACE:        case OP_NOT_VSPACE:
2123        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2124          {          {
2125          case 0x000a:          VSPACE_CASES:
         case 0x000b:  
         case 0x000c:  
         case 0x000d:  
         case 0x0085:  
         case 0x2028:  
         case 0x2029:  
2126          break;          break;
2127    
2128          default:          default:
# Line 2192  for (;;) Line 2135  for (;;)
2135        case OP_VSPACE:        case OP_VSPACE:
2136        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2137          {          {
2138          case 0x000a:          VSPACE_CASES:
         case 0x000b:  
         case 0x000c:  
         case 0x000d:  
         case 0x0085:  
         case 0x2028:  
         case 0x2029:  
2139          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2140          break;          break;
2141    
2142          default: break;          default:
2143            break;
2144          }          }
2145        break;        break;
2146    
# Line 2210  for (;;) Line 2148  for (;;)
2148        case OP_NOT_HSPACE:        case OP_NOT_HSPACE:
2149        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2150          {          {
2151          case 0x09:      /* HT */          HSPACE_CASES:
         case 0x20:      /* SPACE */  
         case 0xa0:      /* NBSP */  
         case 0x1680:    /* OGHAM SPACE MARK */  
         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
         case 0x2000:    /* EN QUAD */  
         case 0x2001:    /* EM QUAD */  
         case 0x2002:    /* EN SPACE */  
         case 0x2003:    /* EM SPACE */  
         case 0x2004:    /* THREE-PER-EM SPACE */  
         case 0x2005:    /* FOUR-PER-EM SPACE */  
         case 0x2006:    /* SIX-PER-EM SPACE */  
         case 0x2007:    /* FIGURE SPACE */  
         case 0x2008:    /* PUNCTUATION SPACE */  
         case 0x2009:    /* THIN SPACE */  
         case 0x200A:    /* HAIR SPACE */  
         case 0x202f:    /* NARROW NO-BREAK SPACE */  
         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
         case 0x3000:    /* IDEOGRAPHIC SPACE */  
2152          break;          break;
2153    
2154          default:          default:
# Line 2241  for (;;) Line 2161  for (;;)
2161        case OP_HSPACE:        case OP_HSPACE:
2162        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2163          {          {
2164          case 0x09:      /* HT */          HSPACE_CASES:
         case 0x20:      /* SPACE */  
         case 0xa0:      /* NBSP */  
         case 0x1680:    /* OGHAM SPACE MARK */  
         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
         case 0x2000:    /* EN QUAD */  
         case 0x2001:    /* EM QUAD */  
         case 0x2002:    /* EN SPACE */  
         case 0x2003:    /* EM SPACE */  
         case 0x2004:    /* THREE-PER-EM SPACE */  
         case 0x2005:    /* FOUR-PER-EM SPACE */  
         case 0x2006:    /* SIX-PER-EM SPACE */  
         case 0x2007:    /* FIGURE SPACE */  
         case 0x2008:    /* PUNCTUATION SPACE */  
         case 0x2009:    /* THIN SPACE */  
         case 0x200A:    /* HAIR SPACE */  
         case 0x202f:    /* NARROW NO-BREAK SPACE */  
         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
         case 0x3000:    /* IDEOGRAPHIC SPACE */  
2165          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2166          break;          break;
2167    
2168            default:
2169            break;
2170          }          }
2171        break;        break;
2172    
# Line 2277  for (;;) Line 2182  for (;;)
2182    
2183        case OP_NOTI:        case OP_NOTI:
2184        if (clen > 0)        if (clen > 0)
2185          {          {
2186          unsigned int otherd;          unsigned int otherd;
2187  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
2188          if (utf && d >= 128)          if (utf && d >= 128)
# Line 2291  for (;;) Line 2196  for (;;)
2196          otherd = TABLE_GET(d, fcc, d);          otherd = TABLE_GET(d, fcc, d);
2197          if (c != d && c != otherd)          if (c != d && c != otherd)
2198            { ADD_NEW(state_offset + dlen + 1, 0); }            { ADD_NEW(state_offset + dlen + 1, 0); }
2199          }          }
2200        break;        break;
2201    
2202        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 2797  for (;;) Line 2702  for (;;)
2702              {              {
2703              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2704  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
2705              const pcre_uchar *p = start_subject + local_offsets[rc];              if (utf)
2706              const pcre_uchar *pp = start_subject + local_offsets[rc+1];                {
2707              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;                const pcre_uchar *p = start_subject + local_offsets[rc];
2708                  const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2709                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2710                  }
2711  #endif  #endif
2712              if (charcount > 0)              if (charcount > 0)
2713                {                {
# Line 2898  for (;;) Line 2806  for (;;)
2806              const pcre_uchar *pp = local_ptr;              const pcre_uchar *pp = local_ptr;
2807              charcount = (int)(pp - p);              charcount = (int)(pp - p);
2808  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
2809              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;              if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2810  #endif  #endif
2811              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2812              }              }
# Line 2980  for (;;) Line 2888  for (;;)
2888            else            else
2889              {              {
2890  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
2891              const pcre_uchar *p = start_subject + local_offsets[0];              if (utf)
2892              const pcre_uchar *pp = start_subject + local_offsets[1];                {
2893              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;                const pcre_uchar *p = start_subject + local_offsets[0];
2894                  const pcre_uchar *pp = start_subject + local_offsets[1];
2895                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2896                  }
2897  #endif  #endif
2898              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2899              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
# Line 3047  for (;;) Line 2958  for (;;)
2958    
2959    The "could_continue" variable is true if a state could have continued but    The "could_continue" variable is true if a state could have continued but
2960    for the fact that the end of the subject was reached. */    for the fact that the end of the subject was reached. */
2961    
2962    if (new_count <= 0)    if (new_count <= 0)
2963      {      {
2964      if (rlevel == 1 &&                               /* Top level, and */      if (rlevel == 1 &&                               /* Top level, and */
# Line 3064  for (;;) Line 2975  for (;;)
2975            (                                          /* or ... */            (                                          /* or ... */
2976            ptr >= end_subject &&                /* End of subject and */            ptr >= end_subject &&                /* End of subject and */
2977            ptr > md->start_used_ptr)            /* Inspected non-empty string */            ptr > md->start_used_ptr)            /* Inspected non-empty string */
2978            )            )
2979          )          )
2980        {        {
2981        if (offsetcount >= 2)        if (offsetcount >= 2)
2982          {          {
# Line 3162  if (offsetcount < 0) return PCRE_ERROR_B Line 3073  if (offsetcount < 0) return PCRE_ERROR_B
3073  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3074  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3075    
3076  /* We need to find the pointer to any study data before we test for byte  /* Check that the first field in the block is the magic number. If it is not,
3077  flipping, so we scan the extra_data block first. This may set two fields in the  return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3078  match block, so we must initialize them beforehand. However, the other fields  REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3079  in the match block must not be set until after the byte flipping. */  means that the pattern is likely compiled with different endianness. */
3080    
3081    if (re->magic_number != MAGIC_NUMBER)
3082      return re->magic_number == REVERSED_MAGIC_NUMBER?
3083        PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3084    if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3085    
3086    /* If restarting after a partial match, do some sanity checks on the contents
3087    of the workspace. */
3088    
3089    if ((options & PCRE_DFA_RESTART) != 0)
3090      {
3091      if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3092        workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3093          return PCRE_ERROR_DFA_BADRESTART;
3094      }
3095    
3096    /* Set up study, callout, and table data */
3097    
3098  md->tables = re->tables;  md->tables = re->tables;
3099  md->callout_data = NULL;  md->callout_data = NULL;
# Line 3182  if (extra_data != NULL) Line 3110  if (extra_data != NULL)
3110      md->callout_data = extra_data->callout_data;      md->callout_data = extra_data->callout_data;
3111    if ((flags & PCRE_EXTRA_TABLES) != 0)    if ((flags & PCRE_EXTRA_TABLES) != 0)
3112      md->tables = extra_data->tables;      md->tables = extra_data->tables;
   ((pcre_extra *)extra_data)->flags &= ~PCRE_EXTRA_USED_JIT;  /* No JIT support here */  
3113    }    }
3114    
 /* Check that the first field in the block is the magic number. If it is not,  
 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to  
 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which  
 means that the pattern is likely compiled with different endianness. */  
   
 if (re->magic_number != MAGIC_NUMBER)  
   return re->magic_number == REVERSED_MAGIC_NUMBER?  
     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;  
 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;  
   
3115  /* Set some local values */  /* Set some local values */
3116    
3117  current_subject = (const pcre_uchar *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;

Legend:
Removed from v.925  
changed lines
  Added in v.1041

  ViewVC Help
Powered by ViewVC 1.1.5