/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 881 by ph10, Sun Jan 15 18:07:05 2012 UTC revision 1100 by chpe, Tue Oct 16 15:56:26 2012 UTC
# Line 38  POSSIBILITY OF SUCH DAMAGE. Line 38  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
39  */  */
40    
   
41  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
42  alternative matching function that uses a sort of DFA algorithm (not a true  alternative matching function that uses a sort of DFA algorithm (not a true
43  FSM). This is NOT Perl- compatible, but it has advantages in certain  FSM). This is NOT Perl-compatible, but it has advantages in certain
44  applications. */  applications. */
45    
46    
# Line 282  typedef struct stateblock { Line 281  typedef struct stateblock {
281    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
282  } stateblock;  } stateblock;
283    
284  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
285    
286    
287  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 303  Returns:       nothing Line 302  Returns:       nothing
302  static void  static void
303  pchars(const pcre_uchar *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
304  {  {
305  int c;  pcre_uint32 c;
306  while (length-- > 0)  while (length-- > 0)
307    {    {
308    if (isprint(c = *(p++)))    if (isprint(c = *(p++)))
309      fprintf(f, "%c", c);      fprintf(f, "%c", c);
310    else    else
311      fprintf(f, "\\x%02x", c);      fprintf(f, "\\x{%02x}", c);
312    }    }
313  }  }
314  #endif  #endif
# Line 382  for the current character, one for the f Line 381  for the current character, one for the f
381      next_new_state->count  = (y); \      next_new_state->count  = (y); \
382      next_new_state->data   = (z); \      next_new_state->data   = (z); \
383      next_new_state++; \      next_new_state++; \
384      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
385          (x), (y), (z), __LINE__)); \
386      } \      } \
387    else return PCRE_ERROR_DFA_WSSIZE    else return PCRE_ERROR_DFA_WSSIZE
388    
# Line 424  BOOL utf = (md->poptions & PCRE_UTF8) != Line 424  BOOL utf = (md->poptions & PCRE_UTF8) !=
424  BOOL utf = FALSE;  BOOL utf = FALSE;
425  #endif  #endif
426    
427    BOOL reset_could_continue = FALSE;
428    
429  rlevel++;  rlevel++;
430  offsetcount &= (-2);  offsetcount &= (-2);
431    
# Line 569  for (;;) Line 571  for (;;)
571    {    {
572    int i, j;    int i, j;
573    int clen, dlen;    int clen, dlen;
574    unsigned int c, d;    pcre_uint32 c, d;
575    int forced_fail = 0;    int forced_fail = 0;
576    BOOL could_continue = FALSE;    BOOL partial_newline = FALSE;
577      BOOL could_continue = reset_could_continue;
578      reset_could_continue = FALSE;
579    
580    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
581    new state list. */    new state list. */
# Line 607  for (;;) Line 611  for (;;)
611    
612    if (ptr < end_subject)    if (ptr < end_subject)
613      {      {
614      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of data items in the character */
615  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
616      if (utf) { GETCHARLEN(c, ptr, clen); } else      GETCHARLENTEST(c, ptr, clen);
617  #endif  /* SUPPORT_UTF */  #else
618      c = *ptr;      c = *ptr;
619    #endif  /* SUPPORT_UTF */
620      }      }
621    else    else
622      {      {
# Line 641  for (;;) Line 646  for (;;)
646    
647      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
648      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
649      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
650        state, arrange for it to be passed on. */
651    
652      if (state_offset < 0)      if (state_offset < 0)
653        {        {
# Line 650  for (;;) Line 656  for (;;)
656          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
657          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
658            current_state->data - 1);            current_state->data - 1);
659            if (could_continue) reset_could_continue = TRUE;
660          continue;          continue;
661          }          }
662        else        else
# Line 689  for (;;) Line 696  for (;;)
696      permitted.      permitted.
697    
698      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
699      argument that is not a data character - but is always one byte long. We      argument that is not a data character - but is always one byte long because
700      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in      the values are small. We have to take special action to deal with  \P, \p,
701      this case. To keep the other cases fast, convert these ones to new opcodes.      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
702      */      these ones to new opcodes. */
703    
704      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
705        {        {
# Line 783  for (;;) Line 790  for (;;)
790              offsets[0] = (int)(current_subject - start_subject);              offsets[0] = (int)(current_subject - start_subject);
791              offsets[1] = (int)(ptr - start_subject);              offsets[1] = (int)(ptr - start_subject);
792              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
793                offsets[1] - offsets[0], current_subject));                offsets[1] - offsets[0], (char *)current_subject));
794              }              }
795            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
796              {              {
# Line 888  for (;;) Line 895  for (;;)
895        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
896        case OP_ANY:        case OP_ANY:
897        if (clen > 0 && !IS_NEWLINE(ptr))        if (clen > 0 && !IS_NEWLINE(ptr))
898          { ADD_NEW(state_offset + 1, 0); }          {
899            if (ptr + 1 >= md->end_subject &&
900                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
901                NLBLOCK->nltype == NLTYPE_FIXED &&
902                NLBLOCK->nllen == 2 &&
903                c == NLBLOCK->nl[0])
904              {
905              could_continue = partial_newline = TRUE;
906              }
907            else
908              {
909              ADD_NEW(state_offset + 1, 0);
910              }
911            }
912        break;        break;
913    
914        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 916  for (;;) Line 936  for (;;)
936                 (ptr == end_subject - md->nllen)                 (ptr == end_subject - md->nllen)
937              ))              ))
938            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
939            else if (ptr + 1 >= md->end_subject &&
940                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
941                     NLBLOCK->nltype == NLTYPE_FIXED &&
942                     NLBLOCK->nllen == 2 &&
943                     c == NLBLOCK->nl[0])
944              {
945              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
946                {
947                reset_could_continue = TRUE;
948                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
949                }
950              else could_continue = partial_newline = TRUE;
951              }
952          }          }
953        break;        break;
954    
# Line 928  for (;;) Line 961  for (;;)
961          else if (clen == 0 ||          else if (clen == 0 ||
962              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
963            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
964            else if (ptr + 1 >= md->end_subject &&
965                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
966                     NLBLOCK->nltype == NLTYPE_FIXED &&
967                     NLBLOCK->nllen == 2 &&
968                     c == NLBLOCK->nl[0])
969              {
970              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
971                {
972                reset_could_continue = TRUE;
973                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
974                }
975              else could_continue = partial_newline = TRUE;
976              }
977          }          }
978        else if (IS_NEWLINE(ptr))        else if (IS_NEWLINE(ptr))
979          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
# Line 962  for (;;) Line 1008  for (;;)
1008            {            {
1009            const pcre_uchar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
1010            if (temp < md->start_used_ptr) md->start_used_ptr = temp;            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1011  #ifdef SUPPORT_UTF  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1012            if (utf) { BACKCHAR(temp); }            if (utf) { BACKCHAR(temp); }
1013  #endif  #endif
1014            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
# Line 1015  for (;;) Line 1061  for (;;)
1061        if (clen > 0)        if (clen > 0)
1062          {          {
1063          BOOL OK;          BOOL OK;
1064            const pcre_uint32 *cp;
1065          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1066          switch(code[1])          switch(code[1])
1067            {            {
# Line 1062  for (;;) Line 1109  for (;;)
1109                 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||                 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1110                 c == CHAR_UNDERSCORE;                 c == CHAR_UNDERSCORE;
1111            break;            break;
1112    
1113              case PT_CLIST:
1114              cp = PRIV(ucd_caseless_sets) + prop->caseset;
1115              for (;;)
1116                {
1117                if (c < *cp) { OK = FALSE; break; }
1118                if (c == *cp++) { OK = TRUE; break; }
1119                }
1120              break;
1121    
1122            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1123    
# Line 1090  for (;;) Line 1146  for (;;)
1146        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1147        if (clen > 0)        if (clen > 0)
1148          {          {
1149          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1150                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1151                NLBLOCK->nltype == NLTYPE_FIXED &&
1152                NLBLOCK->nllen == 2 &&
1153                c == NLBLOCK->nl[0])
1154              {
1155              could_continue = partial_newline = TRUE;
1156              }
1157            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1158              (c < 256 &&              (c < 256 &&
1159                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1160                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1113  for (;;) Line 1177  for (;;)
1177        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1178        if (clen > 0)        if (clen > 0)
1179          {          {
1180          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1181                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1182                NLBLOCK->nltype == NLTYPE_FIXED &&
1183                NLBLOCK->nllen == 2 &&
1184                c == NLBLOCK->nl[0])
1185              {
1186              could_continue = partial_newline = TRUE;
1187              }
1188            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1189              (c < 256 &&              (c < 256 &&
1190                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1191                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1135  for (;;) Line 1207  for (;;)
1207        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1208        if (clen > 0)        if (clen > 0)
1209          {          {
1210          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1211                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1212                NLBLOCK->nltype == NLTYPE_FIXED &&
1213                NLBLOCK->nllen == 2 &&
1214                c == NLBLOCK->nl[0])
1215              {
1216              could_continue = partial_newline = TRUE;
1217              }
1218            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1219              (c < 256 &&              (c < 256 &&
1220                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1221                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1155  for (;;) Line 1235  for (;;)
1235        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1236        if (clen > 0)        if (clen > 0)
1237          {          {
1238          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1239                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1240                NLBLOCK->nltype == NLTYPE_FIXED &&
1241                NLBLOCK->nllen == 2 &&
1242                c == NLBLOCK->nl[0])
1243              {
1244              could_continue = partial_newline = TRUE;
1245              }
1246            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1247              (c < 256 &&              (c < 256 &&
1248                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1249                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1176  for (;;) Line 1264  for (;;)
1264        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1265        if (clen > 0)        if (clen > 0)
1266          {          {
1267          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1268                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1269                NLBLOCK->nltype == NLTYPE_FIXED &&
1270                NLBLOCK->nllen == 2 &&
1271                c == NLBLOCK->nl[0])
1272              {
1273              could_continue = partial_newline = TRUE;
1274              }
1275            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1276              (c < 256 &&              (c < 256 &&
1277                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1278                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1209  for (;;) Line 1305  for (;;)
1305        if (clen > 0)        if (clen > 0)
1306          {          {
1307          BOOL OK;          BOOL OK;
1308            const pcre_uint32 *cp;
1309          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1310          switch(code[2])          switch(code[2])
1311            {            {
# Line 1257  for (;;) Line 1354  for (;;)
1354                 c == CHAR_UNDERSCORE;                 c == CHAR_UNDERSCORE;
1355            break;            break;
1356    
1357              case PT_CLIST:
1358              cp = PRIV(ucd_caseless_sets) + prop->caseset;
1359              for (;;)
1360                {
1361                if (c < *cp) { OK = FALSE; break; }
1362                if (c == *cp++) { OK = TRUE; break; }
1363                }
1364              break;
1365    
1366            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1367    
1368            default:            default:
# Line 1283  for (;;) Line 1389  for (;;)
1389        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1390        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1391        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1392        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1393          {          {
1394            int lgb, rgb;
1395          const pcre_uchar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1396          int ncount = 0;          int ncount = 0;
1397          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
# Line 1292  for (;;) Line 1399  for (;;)
1399            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1400            next_active_state--;            next_active_state--;
1401            }            }
1402            lgb = UCD_GRAPHBREAK(c);
1403          while (nptr < end_subject)          while (nptr < end_subject)
1404            {            {
1405            int nd;            dlen = 1;
1406            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1407            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1408            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1409            ncount++;            ncount++;
1410            nptr += ndlen;            lgb = rgb;
1411              nptr += dlen;
1412            }            }
1413          count++;          count++;
1414          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
# Line 1318  for (;;) Line 1427  for (;;)
1427          int ncount = 0;          int ncount = 0;
1428          switch (c)          switch (c)
1429            {            {
1430            case 0x000b:            case CHAR_VT:
1431            case 0x000c:            case CHAR_FF:
1432            case 0x0085:            case CHAR_NEL:
1433    #ifndef EBCDIC
1434            case 0x2028:            case 0x2028:
1435            case 0x2029:            case 0x2029:
1436    #endif  /* Not EBCDIC */
1437            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1438            goto ANYNL01;            goto ANYNL01;
1439    
1440            case 0x000d:            case CHAR_CR:
1441            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1442            /* Fall through */            /* Fall through */
1443    
1444            ANYNL01:            ANYNL01:
1445            case 0x000a:            case CHAR_LF:
1446            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1447              {              {
1448              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1358  for (;;) Line 1469  for (;;)
1469          BOOL OK;          BOOL OK;
1470          switch (c)          switch (c)
1471            {            {
1472            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
1473            OK = TRUE;            OK = TRUE;
1474            break;            break;
1475    
# Line 1397  for (;;) Line 1502  for (;;)
1502          BOOL OK;          BOOL OK;
1503          switch (c)          switch (c)
1504            {            {
1505            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
1506            OK = TRUE;            OK = TRUE;
1507            break;            break;
1508    
# Line 1456  for (;;) Line 1543  for (;;)
1543        if (clen > 0)        if (clen > 0)
1544          {          {
1545          BOOL OK;          BOOL OK;
1546            const pcre_uint32 *cp;
1547          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1548          switch(code[2])          switch(code[2])
1549            {            {
# Line 1504  for (;;) Line 1592  for (;;)
1592                 c == CHAR_UNDERSCORE;                 c == CHAR_UNDERSCORE;
1593            break;            break;
1594    
1595              case PT_CLIST:
1596              cp = PRIV(ucd_caseless_sets) + prop->caseset;
1597              for (;;)
1598                {
1599                if (c < *cp) { OK = FALSE; break; }
1600                if (c == *cp++) { OK = TRUE; break; }
1601                }
1602              break;
1603    
1604            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1605    
1606            default:            default:
# Line 1539  for (;;) Line 1636  for (;;)
1636        QS2:        QS2:
1637    
1638        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1639        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1640          {          {
1641            int lgb, rgb;
1642          const pcre_uchar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1643          int ncount = 0;          int ncount = 0;
1644          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
# Line 1549  for (;;) Line 1647  for (;;)
1647            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1648            next_active_state--;            next_active_state--;
1649            }            }
1650            lgb = UCD_GRAPHBREAK(c);
1651          while (nptr < end_subject)          while (nptr < end_subject)
1652            {            {
1653            int nd;            dlen = 1;
1654            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1655            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1656            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1657            ncount++;            ncount++;
1658            nptr += ndlen;            lgb = rgb;
1659              nptr += dlen;
1660            }            }
1661          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1662          }          }
# Line 1582  for (;;) Line 1682  for (;;)
1682          int ncount = 0;          int ncount = 0;
1683          switch (c)          switch (c)
1684            {            {
1685            case 0x000b:            case CHAR_VT:
1686            case 0x000c:            case CHAR_FF:
1687            case 0x0085:            case CHAR_NEL:
1688    #ifndef EBCDIC
1689            case 0x2028:            case 0x2028:
1690            case 0x2029:            case 0x2029:
1691    #endif  /* Not EBCDIC */
1692            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1693            goto ANYNL02;            goto ANYNL02;
1694    
1695            case 0x000d:            case CHAR_CR:
1696            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1697            /* Fall through */            /* Fall through */
1698    
1699            ANYNL02:            ANYNL02:
1700            case 0x000a:            case CHAR_LF:
1701            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1702                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1703              {              {
# Line 1630  for (;;) Line 1732  for (;;)
1732          BOOL OK;          BOOL OK;
1733          switch (c)          switch (c)
1734            {            {
1735            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
1736            OK = TRUE;            OK = TRUE;
1737            break;            break;
1738    
# Line 1676  for (;;) Line 1772  for (;;)
1772          BOOL OK;          BOOL OK;
1773          switch (c)          switch (c)
1774            {            {
1775            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
1776            OK = TRUE;            OK = TRUE;
1777            break;            break;
1778    
# Line 1728  for (;;) Line 1806  for (;;)
1806        if (clen > 0)        if (clen > 0)
1807          {          {
1808          BOOL OK;          BOOL OK;
1809            const pcre_uint32 *cp;
1810          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1811          switch(code[1 + IMM2_SIZE + 1])          switch(code[1 + IMM2_SIZE + 1])
1812            {            {
# Line 1776  for (;;) Line 1855  for (;;)
1855                 c == CHAR_UNDERSCORE;                 c == CHAR_UNDERSCORE;
1856            break;            break;
1857    
1858              case PT_CLIST:
1859              cp = PRIV(ucd_caseless_sets) + prop->caseset;
1860              for (;;)
1861                {
1862                if (c < *cp) { OK = FALSE; break; }
1863                if (c == *cp++) { OK = TRUE; break; }
1864                }
1865              break;
1866    
1867            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1868    
1869            default:            default:
# Line 1806  for (;;) Line 1894  for (;;)
1894        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1895          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1896        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1897        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1898          {          {
1899            int lgb, rgb;
1900          const pcre_uchar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1901          int ncount = 0;          int ncount = 0;
1902          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
# Line 1815  for (;;) Line 1904  for (;;)
1904            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1905            next_active_state--;            next_active_state--;
1906            }            }
1907            lgb = UCD_GRAPHBREAK(c);
1908          while (nptr < end_subject)          while (nptr < end_subject)
1909            {            {
1910            int nd;            dlen = 1;
1911            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1912            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1913            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1914            ncount++;            ncount++;
1915            nptr += ndlen;            lgb = rgb;
1916              nptr += dlen;
1917            }            }
1918            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1919                reset_could_continue = TRUE;
1920          if (++count >= GET2(code, 1))          if (++count >= GET2(code, 1))
1921            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1922          else          else
# Line 1845  for (;;) Line 1938  for (;;)
1938          int ncount = 0;          int ncount = 0;
1939          switch (c)          switch (c)
1940            {            {
1941            case 0x000b:            case CHAR_VT:
1942            case 0x000c:            case CHAR_FF:
1943            case 0x0085:            case CHAR_NEL:
1944    #ifndef EBCDIC
1945            case 0x2028:            case 0x2028:
1946            case 0x2029:            case 0x2029:
1947    #endif  /* Not EBCDIC */
1948            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1949            goto ANYNL03;            goto ANYNL03;
1950    
1951            case 0x000d:            case CHAR_CR:
1952            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1953            /* Fall through */            /* Fall through */
1954    
1955            ANYNL03:            ANYNL03:
1956            case 0x000a:            case CHAR_LF:
1957            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1958              {              {
1959              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1889  for (;;) Line 1984  for (;;)
1984          BOOL OK;          BOOL OK;
1985          switch (c)          switch (c)
1986            {            {
1987            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
1988            OK = TRUE;            OK = TRUE;
1989            break;            break;
1990    
# Line 1931  for (;;) Line 2020  for (;;)
2020          BOOL OK;          BOOL OK;
2021          switch (c)          switch (c)
2022            {            {
2023            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
2024            OK = TRUE;            OK = TRUE;
2025            break;            break;
2026    
# Line 2025  for (;;) Line 2096  for (;;)
2096        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2097    
2098        case OP_EXTUNI:        case OP_EXTUNI:
2099        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
2100          {          {
2101            int lgb, rgb;
2102          const pcre_uchar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
2103          int ncount = 0;          int ncount = 0;
2104            lgb = UCD_GRAPHBREAK(c);
2105          while (nptr < end_subject)          while (nptr < end_subject)
2106            {            {
2107            int nclen = 1;            dlen = 1;
2108            GETCHARLEN(c, nptr, nclen);            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2109            if (UCD_CATEGORY(c) != ucp_M) break;            rgb = UCD_GRAPHBREAK(d);
2110              if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2111            ncount++;            ncount++;
2112            nptr += nclen;            lgb = rgb;
2113              nptr += dlen;
2114            }            }
2115            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2116                reset_could_continue = TRUE;
2117          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2118          }          }
2119        break;        break;
# Line 2050  for (;;) Line 2127  for (;;)
2127        case OP_ANYNL:        case OP_ANYNL:
2128        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2129          {          {
2130          case 0x000b:          case CHAR_VT:
2131          case 0x000c:          case CHAR_FF:
2132          case 0x0085:          case CHAR_NEL:
2133    #ifndef EBCDIC
2134          case 0x2028:          case 0x2028:
2135          case 0x2029:          case 0x2029:
2136    #endif  /* Not EBCDIC */
2137          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2138    
2139          case 0x000a:          case CHAR_LF:
2140          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2141          break;          break;
2142    
2143          case 0x000d:          case CHAR_CR:
2144          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 >= end_subject)
2145              {
2146              ADD_NEW(state_offset + 1, 0);
2147              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2148                reset_could_continue = TRUE;
2149              }
2150            else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2151            {            {
2152            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2153            }            }
# Line 2078  for (;;) Line 2163  for (;;)
2163        case OP_NOT_VSPACE:        case OP_NOT_VSPACE:
2164        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2165          {          {
2166          case 0x000a:          VSPACE_CASES:
         case 0x000b:  
         case 0x000c:  
         case 0x000d:  
         case 0x0085:  
         case 0x2028:  
         case 0x2029:  
2167          break;          break;
2168    
2169          default:          default:
# Line 2097  for (;;) Line 2176  for (;;)
2176        case OP_VSPACE:        case OP_VSPACE:
2177        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2178          {          {
2179          case 0x000a:          VSPACE_CASES:
         case 0x000b:  
         case 0x000c:  
         case 0x000d:  
         case 0x0085:  
         case 0x2028:  
         case 0x2029:  
2180          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2181          break;          break;
2182    
2183          default: break;          default:
2184            break;
2185          }          }
2186        break;        break;
2187    
# Line 2115  for (;;) Line 2189  for (;;)
2189        case OP_NOT_HSPACE:        case OP_NOT_HSPACE:
2190        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2191          {          {
2192          case 0x09:      /* HT */          HSPACE_CASES:
         case 0x20:      /* SPACE */  
         case 0xa0:      /* NBSP */  
         case 0x1680:    /* OGHAM SPACE MARK */  
         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
         case 0x2000:    /* EN QUAD */  
         case 0x2001:    /* EM QUAD */  
         case 0x2002:    /* EN SPACE */  
         case 0x2003:    /* EM SPACE */  
         case 0x2004:    /* THREE-PER-EM SPACE */  
         case 0x2005:    /* FOUR-PER-EM SPACE */  
         case 0x2006:    /* SIX-PER-EM SPACE */  
         case 0x2007:    /* FIGURE SPACE */  
         case 0x2008:    /* PUNCTUATION SPACE */  
         case 0x2009:    /* THIN SPACE */  
         case 0x200A:    /* HAIR SPACE */  
         case 0x202f:    /* NARROW NO-BREAK SPACE */  
         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
         case 0x3000:    /* IDEOGRAPHIC SPACE */  
2193          break;          break;
2194    
2195          default:          default:
# Line 2146  for (;;) Line 2202  for (;;)
2202        case OP_HSPACE:        case OP_HSPACE:
2203        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2204          {          {
2205          case 0x09:      /* HT */          HSPACE_CASES:
         case 0x20:      /* SPACE */  
         case 0xa0:      /* NBSP */  
         case 0x1680:    /* OGHAM SPACE MARK */  
         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
         case 0x2000:    /* EN QUAD */  
         case 0x2001:    /* EM QUAD */  
         case 0x2002:    /* EN SPACE */  
         case 0x2003:    /* EM SPACE */  
         case 0x2004:    /* THREE-PER-EM SPACE */  
         case 0x2005:    /* FOUR-PER-EM SPACE */  
         case 0x2006:    /* SIX-PER-EM SPACE */  
         case 0x2007:    /* FIGURE SPACE */  
         case 0x2008:    /* PUNCTUATION SPACE */  
         case 0x2009:    /* THIN SPACE */  
         case 0x200A:    /* HAIR SPACE */  
         case 0x202f:    /* NARROW NO-BREAK SPACE */  
         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
         case 0x3000:    /* IDEOGRAPHIC SPACE */  
2206          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2207          break;          break;
2208    
2209            default:
2210            break;
2211          }          }
2212        break;        break;
2213    
2214        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2215        /* Match a negated single character casefully. This is only used for        /* Match a negated single character casefully. */
       one-byte characters, that is, we know that d < 256. The character we are  
       checking (c) can be multibyte. */  
2216    
2217        case OP_NOT:        case OP_NOT:
2218        if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }        if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2219        break;        break;
2220    
2221        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2222        /* Match a negated single character caselessly. This is only used for        /* Match a negated single character caselessly. */
       one-byte characters, that is, we know that d < 256. The character we are  
       checking (c) can be multibyte. */  
2223    
2224        case OP_NOTI:        case OP_NOTI:
2225        if (clen > 0 && c != d && c != fcc[d])        if (clen > 0)
2226          { ADD_NEW(state_offset + dlen + 1, 0); }          {
2227            unsigned int otherd;
2228    #ifdef SUPPORT_UTF
2229            if (utf && d >= 128)
2230              {
2231    #ifdef SUPPORT_UCP
2232              otherd = UCD_OTHERCASE(d);
2233    #endif  /* SUPPORT_UCP */
2234              }
2235            else
2236    #endif  /* SUPPORT_UTF */
2237            otherd = TABLE_GET(d, fcc, d);
2238            if (c != d && c != otherd)
2239              { ADD_NEW(state_offset + dlen + 1, 0); }
2240            }
2241        break;        break;
2242    
2243        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 2210  for (;;) Line 2261  for (;;)
2261        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2262        if (clen > 0)        if (clen > 0)
2263          {          {
2264          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2265          if (caseless)          if (caseless)
2266            {            {
2267  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
# Line 2257  for (;;) Line 2308  for (;;)
2308        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2309        if (clen > 0)        if (clen > 0)
2310          {          {
2311          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2312          if (caseless)          if (caseless)
2313            {            {
2314  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
# Line 2302  for (;;) Line 2353  for (;;)
2353        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2354        if (clen > 0)        if (clen > 0)
2355          {          {
2356          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2357          if (caseless)          if (caseless)
2358            {            {
2359  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
# Line 2339  for (;;) Line 2390  for (;;)
2390        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2391        if (clen > 0)        if (clen > 0)
2392          {          {
2393          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2394          if (caseless)          if (caseless)
2395            {            {
2396  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
# Line 2383  for (;;) Line 2434  for (;;)
2434        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2435        if (clen > 0)        if (clen > 0)
2436          {          {
2437          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2438          if (caseless)          if (caseless)
2439            {            {
2440  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
# Line 2557  for (;;) Line 2608  for (;;)
2608              cb.version          = 1;   /* Version 1 of the callout block */              cb.version          = 1;   /* Version 1 of the callout block */
2609              cb.callout_number   = code[LINK_SIZE+2];              cb.callout_number   = code[LINK_SIZE+2];
2610              cb.offset_vector    = offsets;              cb.offset_vector    = offsets;
2611  #ifdef COMPILE_PCRE8  #if defined COMPILE_PCRE8
2612              cb.subject          = (PCRE_SPTR)start_subject;              cb.subject          = (PCRE_SPTR)start_subject;
2613  #else  #elif defined COMPILE_PCRE16
2614              cb.subject          = (PCRE_SPTR16)start_subject;              cb.subject          = (PCRE_SPTR16)start_subject;
2615    #elif defined COMPILE_PCRE32
2616                cb.subject          = (PCRE_SPTR32)start_subject;
2617  #endif  #endif
2618              cb.subject_length   = (int)(end_subject - start_subject);              cb.subject_length   = (int)(end_subject - start_subject);
2619              cb.start_match      = (int)(current_subject - start_subject);              cb.start_match      = (int)(current_subject - start_subject);
# Line 2690  for (;;) Line 2743  for (;;)
2743            {            {
2744            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2745              {              {
             const pcre_uchar *p = start_subject + local_offsets[rc];  
             const pcre_uchar *pp = start_subject + local_offsets[rc+1];  
2746              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2747  #ifdef SUPPORT_UTF  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2748              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;              if (utf)
2749                  {
2750                  const pcre_uchar *p = start_subject + local_offsets[rc];
2751                  const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2752                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2753                  }
2754  #endif  #endif
2755              if (charcount > 0)              if (charcount > 0)
2756                {                {
# Line 2792  for (;;) Line 2848  for (;;)
2848              const pcre_uchar *p = ptr;              const pcre_uchar *p = ptr;
2849              const pcre_uchar *pp = local_ptr;              const pcre_uchar *pp = local_ptr;
2850              charcount = (int)(pp - p);              charcount = (int)(pp - p);
2851  #ifdef SUPPORT_UTF  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2852              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;              if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2853  #endif  #endif
2854              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2855              }              }
# Line 2874  for (;;) Line 2930  for (;;)
2930              }              }
2931            else            else
2932              {              {
2933  #ifdef SUPPORT_UTF  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2934              const pcre_uchar *p = start_subject + local_offsets[0];              if (utf)
2935              const pcre_uchar *pp = start_subject + local_offsets[1];                {
2936              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;                const pcre_uchar *p = start_subject + local_offsets[0];
2937                  const pcre_uchar *pp = start_subject + local_offsets[1];
2938                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2939                  }
2940  #endif  #endif
2941              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2942              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
# Line 2900  for (;;) Line 2959  for (;;)
2959          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2960          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2961          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2962  #ifdef COMPILE_PCRE8  #if defined COMPILE_PCRE8
2963          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2964  #else  #elif defined COMPILE_PCRE16
2965          cb.subject          = (PCRE_SPTR16)start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
2966    #elif defined COMPILE_PCRE32
2967            cb.subject          = (PCRE_SPTR32)start_subject;
2968  #endif  #endif
2969          cb.subject_length   = (int)(end_subject - start_subject);          cb.subject_length   = (int)(end_subject - start_subject);
2970          cb.start_match      = (int)(current_subject - start_subject);          cb.start_match      = (int)(current_subject - start_subject);
# Line 2946  for (;;) Line 3007  for (;;)
3007    if (new_count <= 0)    if (new_count <= 0)
3008      {      {
3009      if (rlevel == 1 &&                               /* Top level, and */      if (rlevel == 1 &&                               /* Top level, and */
3010          could_continue &&                            /* Some could go on */          could_continue &&                            /* Some could go on, and */
3011          forced_fail != workspace[1] &&               /* Not all forced fail & */          forced_fail != workspace[1] &&               /* Not all forced fail & */
3012          (                                            /* either... */          (                                            /* either... */
3013          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
# Line 2954  for (;;) Line 3015  for (;;)
3015          ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */          ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3016           match_count < 0)                            /* no matches */           match_count < 0)                            /* no matches */
3017          ) &&                                         /* And... */          ) &&                                         /* And... */
3018          ptr >= end_subject &&                  /* Reached end of subject */          (
3019          ptr > md->start_used_ptr)              /* Inspected non-empty string */          partial_newline ||                           /* Either partial NL */
3020              (                                          /* or ... */
3021              ptr >= end_subject &&                /* End of subject and */
3022              ptr > md->start_used_ptr)            /* Inspected non-empty string */
3023              )
3024            )
3025        {        {
3026        if (offsetcount >= 2)        if (offsetcount >= 2)
3027          {          {
# Line 3014  Returns:          > 0 => number of match Line 3080  Returns:          > 0 => number of match
3080                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3081  */  */
3082    
3083  #ifdef COMPILE_PCRE8  #if defined COMPILE_PCRE8
3084  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3085  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3086    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3087    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3088  #else  #elif defined COMPILE_PCRE16
3089  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3090  pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,  pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3091    PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3092    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3093    #elif defined COMPILE_PCRE32
3094    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3095    pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3096      PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3097      int offsetcount, int *workspace, int wscount)
3098  #endif  #endif
3099  {  {
3100  REAL_PCRE *re = (REAL_PCRE *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
# Line 3052  if (offsetcount < 0) return PCRE_ERROR_B Line 3123  if (offsetcount < 0) return PCRE_ERROR_B
3123  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3124  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;  if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3125    
3126  /* We need to find the pointer to any study data before we test for byte  /* Check that the first field in the block is the magic number. If it is not,
3127  flipping, so we scan the extra_data block first. This may set two fields in the  return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3128  match block, so we must initialize them beforehand. However, the other fields  REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3129  in the match block must not be set until after the byte flipping. */  means that the pattern is likely compiled with different endianness. */
3130    
3131    if (re->magic_number != MAGIC_NUMBER)
3132      return re->magic_number == REVERSED_MAGIC_NUMBER?
3133        PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3134    if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3135    
3136    /* If restarting after a partial match, do some sanity checks on the contents
3137    of the workspace. */
3138    
3139    if ((options & PCRE_DFA_RESTART) != 0)
3140      {
3141      if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3142        workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3143          return PCRE_ERROR_DFA_BADRESTART;
3144      }
3145    
3146    /* Set up study, callout, and table data */
3147    
3148  md->tables = re->tables;  md->tables = re->tables;
3149  md->callout_data = NULL;  md->callout_data = NULL;
# Line 3074  if (extra_data != NULL) Line 3162  if (extra_data != NULL)
3162      md->tables = extra_data->tables;      md->tables = extra_data->tables;
3163    }    }
3164    
 /* Check that the first field in the block is the magic number. If it is not,  
 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to  
 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which  
 means that the pattern is likely compiled with different endianness. */  
   
 if (re->magic_number != MAGIC_NUMBER)  
   return re->magic_number == REVERSED_MAGIC_NUMBER?  
     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;  
 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;  
   
3165  /* Set some local values */  /* Set some local values */
3166    
3167  current_subject = (const pcre_uchar *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
# Line 3091  end_subject = (const pcre_uchar *)subjec Line 3169  end_subject = (const pcre_uchar *)subjec
3169  req_char_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3170    
3171  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
3172  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3173  utf = (re->options & PCRE_UTF8) != 0;  utf = (re->options & PCRE_UTF8) != 0;
3174  #else  #else
3175  utf = FALSE;  utf = FALSE;
# Line 3177  if (utf && (options & PCRE_NO_UTF8_CHECK Line 3255  if (utf && (options & PCRE_NO_UTF8_CHECK
3255        offsets[0] = erroroffset;        offsets[0] = erroroffset;
3256        offsets[1] = errorcode;        offsets[1] = errorcode;
3257        }        }
3258      return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?  #if defined COMPILE_PCRE8
3259        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3260        PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;        PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3261    #elif defined COMPILE_PCRE16
3262        return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3263          PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3264    #elif defined COMPILE_PCRE32
3265        return PCRE_ERROR_BADUTF32;
3266    #endif
3267      }      }
3268    #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3269    if (start_offset > 0 && start_offset < length &&    if (start_offset > 0 && start_offset < length &&
3270          NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))          NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3271      return PCRE_ERROR_BADUTF8_OFFSET;      return PCRE_ERROR_BADUTF8_OFFSET;
3272    #endif
3273    }    }
3274  #endif  #endif
3275    
# Line 3209  if (!anchored) Line 3296  if (!anchored)
3296    if ((re->flags & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3297      {      {
3298      has_first_char = TRUE;      has_first_char = TRUE;
3299      first_char = first_char2 = re->first_char;      first_char = first_char2 = (pcre_uchar)(re->first_char);
3300      if ((re->flags & PCRE_FCH_CASELESS) != 0)      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3301        {        {
3302        first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);        first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
# Line 3233  character" set. */ Line 3320  character" set. */
3320  if ((re->flags & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3321    {    {
3322    has_req_char = TRUE;    has_req_char = TRUE;
3323    req_char = req_char2 = re->req_char;    req_char = req_char2 = (pcre_uchar)(re->req_char);
3324    if ((re->flags & PCRE_RCH_CASELESS) != 0)    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3325      {      {
3326      req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);      req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
# Line 3292  for (;;) Line 3379  for (;;)
3379        if (has_first_char)        if (has_first_char)
3380          {          {
3381          if (first_char != first_char2)          if (first_char != first_char2)
3382              {
3383              pcre_uchar csc;
3384            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3385                *current_subject != first_char && *current_subject != first_char2)                   (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3386              current_subject++;              current_subject++;
3387              }
3388          else          else
3389            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3390                   *current_subject != first_char)                   RAWUCHARTEST(current_subject) != first_char)
3391              current_subject++;              current_subject++;
3392          }          }
3393    
# Line 3327  for (;;) Line 3417  for (;;)
3417            ANYCRLF, and we are now at a LF, advance the match position by one            ANYCRLF, and we are now at a LF, advance the match position by one
3418            more character. */            more character. */
3419    
3420            if (current_subject[-1] == CHAR_CR &&            if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3421                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3422                 current_subject < end_subject &&                 current_subject < end_subject &&
3423                 *current_subject == CHAR_NL)                 RAWUCHARTEST(current_subject) == CHAR_NL)
3424              current_subject++;              current_subject++;
3425            }            }
3426          }          }
# Line 3341  for (;;) Line 3431  for (;;)
3431          {          {
3432          while (current_subject < end_subject)          while (current_subject < end_subject)
3433            {            {
3434            register unsigned int c = *current_subject;            register pcre_uint32 c = RAWUCHARTEST(current_subject);
3435  #ifndef COMPILE_PCRE8  #ifndef COMPILE_PCRE8
3436            if (c > 255) c = 255;            if (c > 255) c = 255;
3437  #endif  #endif
# Line 3407  for (;;) Line 3497  for (;;)
3497              {              {
3498              while (p < end_subject)              while (p < end_subject)
3499                {                {
3500                register int pp = *p++;                register pcre_uint32 pp = RAWUCHARINCTEST(p);
3501                if (pp == req_char || pp == req_char2) { p--; break; }                if (pp == req_char || pp == req_char2) { p--; break; }
3502                }                }
3503              }              }
# Line 3415  for (;;) Line 3505  for (;;)
3505              {              {
3506              while (p < end_subject)              while (p < end_subject)
3507                {                {
3508                if (*p++ == req_char) { p--; break; }                if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3509                }                }
3510              }              }
3511    
# Line 3473  for (;;) Line 3563  for (;;)
3563    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
3564    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
3565    
3566    if (current_subject[-1] == CHAR_CR &&    if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3567        current_subject < end_subject &&        current_subject < end_subject &&
3568        *current_subject == CHAR_NL &&        RAWUCHARTEST(current_subject) == CHAR_NL &&
3569        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
3570          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
3571           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.881  
changed lines
  Added in v.1100

  ViewVC Help
Powered by ViewVC 1.1.5