/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 150 by ph10, Tue Apr 17 08:22:40 2007 UTC revision 392 by ph10, Tue Mar 17 21:30:30 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl-compatible, but Line 45  FSM). This is NOT Perl-compatible, but
45  applications. */  applications. */
46    
47    
48    #ifdef HAVE_CONFIG_H
49    #include "config.h"
50    #endif
51    
52  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
53  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
54  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 63  applications. */ Line 68  applications. */
68    
69  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
70  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
71  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
72    never stored, so we push them well clear of the normal opcodes. */
73    
74  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
75  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
76  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
77    #define OP_HSPACE_EXTRA     360
78    #define OP_VSPACE_EXTRA     380
79    
80    
81  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
82  character that is to be tested in some way. This makes it possible to  character that is to be tested in some way. This makes it possible to
83  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
84  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
85  small value. */  small value. ***NOTE*** If the start of this table is modified, the two tables
86    that follow must also be modified. */
87    
88  static uschar coptable[] = {  static const uschar coptable[] = {
89    0,                             /* End                                    */    0,                             /* End                                    */
90    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
91    0, 0,                          /* Any, Anybyte                           */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
92    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
93      0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
94      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
95    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
96    1,                             /* Char                                   */    1,                             /* Char                                   */
97    1,                             /* Charnc                                 */    1,                             /* Charnc                                 */
# Line 120  static uschar coptable[] = { Line 131  static uschar coptable[] = {
131    0,                             /* CREF                                   */    0,                             /* CREF                                   */
132    0,                             /* RREF                                   */    0,                             /* RREF                                   */
133    0,                             /* DEF                                    */    0,                             /* DEF                                    */
134    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0,                          /* BRAZERO, BRAMINZERO                    */
135      0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
136      0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
137  };  };
138    
139  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
140  and \w */  and \w */
141    
142  static uschar toptable1[] = {  static const uschar toptable1[] = {
143    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
144    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
145    ctype_space, ctype_space,    ctype_space, ctype_space,
146    ctype_word,  ctype_word,    ctype_word,  ctype_word,
147    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
148  };  };
149    
150  static uschar toptable2[] = {  static const uschar toptable2[] = {
151    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
152    ctype_digit, 0,    ctype_digit, 0,
153    ctype_space, 0,    ctype_space, 0,
154    ctype_word,  0,    ctype_word,  0,
155    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
156  };  };
157    
158    
# Line 211  Arguments: Line 224  Arguments:
224    rlevel            function call recursion level    rlevel            function call recursion level
225    recursing         regex recursive call level    recursing         regex recursive call level
226    
227  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
228                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
229                       -1 => failed to match                       -1 => failed to match
230                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
231    
# Line 500  for (;;) Line 513  for (;;)
513      const uschar *code;      const uschar *code;
514      int state_offset = current_state->offset;      int state_offset = current_state->offset;
515      int count, codevalue;      int count, codevalue;
     int chartype, script;  
516    
517  #ifdef DEBUG  #ifdef DEBUG
518      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 555  for (;;) Line 567  for (;;)
567      permitted.      permitted.
568    
569      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
570      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long. We
571      Unfortunately, we have to take special action to deal with \P, \p, and      have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
572      \X in this case. To keep the other cases fast, convert these ones to new      this case. To keep the other cases fast, convert these ones to new opcodes.
573      opcodes. */      */
574    
575      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
576        {        {
# Line 576  for (;;) Line 588  for (;;)
588            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
589            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
590            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
591              case OP_NOT_HSPACE:
592              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
593              case OP_NOT_VSPACE:
594              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
595            default: break;            default: break;
596            }            }
597          }          }
# Line 676  for (;;) Line 692  for (;;)
692        break;        break;
693    
694        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
695          case OP_SKIPZERO:
696          code += 1 + GET(code, 2);
697          while (*code == OP_ALT) code += GET(code, 1);
698          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
699          break;
700    
701          /*-----------------------------------------------------------------*/
702        case OP_CIRC:        case OP_CIRC:
703        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
704            ((ims & PCRE_MULTILINE) != 0 &&            ((ims & PCRE_MULTILINE) != 0 &&
# Line 714  for (;;) Line 737  for (;;)
737    
738        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
739        case OP_ANY:        case OP_ANY:
740        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
741            { ADD_NEW(state_offset + 1, 0); }
742          break;
743    
744          /*-----------------------------------------------------------------*/
745          case OP_ALLANY:
746          if (clen > 0)
747          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
748        break;        break;
749    
# Line 729  for (;;) Line 758  for (;;)
758        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
759          {          {
760          if (clen == 0 ||          if (clen == 0 ||
761              (IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
762                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
763              ))              ))
764            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 783  for (;;) Line 812  for (;;)
812        break;        break;
813    
814    
 #ifdef SUPPORT_UCP  
   
815        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
816        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
817        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
818        */        */
819    
820    #ifdef SUPPORT_UCP
821        case OP_PROP:        case OP_PROP:
822        case OP_NOTPROP:        case OP_NOTPROP:
823        if (clen > 0)        if (clen > 0)
824          {          {
825          BOOL OK;          BOOL OK;
826          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
827          switch(code[1])          switch(code[1])
828            {            {
829            case PT_ANY:            case PT_ANY:
# Line 803  for (;;) Line 831  for (;;)
831            break;            break;
832    
833            case PT_LAMP:            case PT_LAMP:
834            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
835            break;            break;
836    
837            case PT_GC:            case PT_GC:
838            OK = category == code[2];            OK = _pcre_ucp_gentype[prop->chartype] == code[2];
839            break;            break;
840    
841            case PT_PC:            case PT_PC:
842            OK = chartype == code[2];            OK = prop->chartype == code[2];
843            break;            break;
844    
845            case PT_SC:            case PT_SC:
846            OK = script == code[2];            OK = prop->script == code[2];
847            break;            break;
848    
849            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 835  for (;;) Line 863  for (;;)
863  /* ========================================================================== */  /* ========================================================================== */
864        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
865        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
866        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
867        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
868    
869        case OP_TYPEPLUS:        case OP_TYPEPLUS:
870        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 847  for (;;) Line 875  for (;;)
875          {          {
876          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
877              (c < 256 &&              (c < 256 &&
878                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
879                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
880            {            {
881            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 873  for (;;) Line 898  for (;;)
898          {          {
899          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
900              (c < 256 &&              (c < 256 &&
901                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
902                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
903            {            {
904            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 898  for (;;) Line 920  for (;;)
920          {          {
921          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
922              (c < 256 &&              (c < 256 &&
923                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
924                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
925            {            {
926            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 921  for (;;) Line 940  for (;;)
940          {          {
941          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
942              (c < 256 &&              (c < 256 &&
943                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
944                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
945            {            {
946            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
# Line 945  for (;;) Line 961  for (;;)
961          {          {
962          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
963              (c < 256 &&              (c < 256 &&
964                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
965                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
966            {            {
967            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 970  for (;;) Line 983  for (;;)
983        argument. It keeps the code above fast for the other cases. The argument        argument. It keeps the code above fast for the other cases. The argument
984        is in the d variable. */        is in the d variable. */
985    
986    #ifdef SUPPORT_UCP
987        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
988        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
989        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
# Line 978  for (;;) Line 992  for (;;)
992        if (clen > 0)        if (clen > 0)
993          {          {
994          BOOL OK;          BOOL OK;
995          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
996          switch(code[2])          switch(code[2])
997            {            {
998            case PT_ANY:            case PT_ANY:
# Line 986  for (;;) Line 1000  for (;;)
1000            break;            break;
1001    
1002            case PT_LAMP:            case PT_LAMP:
1003            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1004            break;            break;
1005    
1006            case PT_GC:            case PT_GC:
1007            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1008            break;            break;
1009    
1010            case PT_PC:            case PT_PC:
1011            OK = chartype == code[3];            OK = prop->chartype == code[3];
1012            break;            break;
1013    
1014            case PT_SC:            case PT_SC:
1015            OK = script == code[3];            OK = prop->script == code[3];
1016            break;            break;
1017    
1018            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1027  for (;;) Line 1041  for (;;)
1041        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1042        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1043        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1044        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1045          {          {
1046          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1047          int ncount = 0;          int ncount = 0;
# Line 1041  for (;;) Line 1055  for (;;)
1055            int nd;            int nd;
1056            int ndlen = 1;            int ndlen = 1;
1057            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1058            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1059            ncount++;            ncount++;
1060            nptr += ndlen;            nptr += ndlen;
1061            }            }
# Line 1049  for (;;) Line 1063  for (;;)
1063          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1064          }          }
1065        break;        break;
1066    #endif
1067    
1068        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1069        case OP_ANYNL_EXTRA + OP_TYPEPLUS:        case OP_ANYNL_EXTRA + OP_TYPEPLUS:
# Line 1061  for (;;) Line 1076  for (;;)
1076          int ncount = 0;          int ncount = 0;
1077          switch (c)          switch (c)
1078            {            {
1079              case 0x000b:
1080              case 0x000c:
1081              case 0x0085:
1082              case 0x2028:
1083              case 0x2029:
1084              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1085              goto ANYNL01;
1086    
1087            case 0x000d:            case 0x000d:
1088            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1089            /* Fall through */            /* Fall through */
1090    
1091              ANYNL01:
1092              case 0x000a:
1093              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1094                {
1095                active_count--;           /* Remove non-match possibility */
1096                next_active_state--;
1097                }
1098              count++;
1099              ADD_NEW_DATA(-state_offset, count, ncount);
1100              break;
1101    
1102              default:
1103              break;
1104              }
1105            }
1106          break;
1107    
1108          /*-----------------------------------------------------------------*/
1109          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1110          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1111          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1112          count = current_state->count;  /* Already matched */
1113          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1114          if (clen > 0)
1115            {
1116            BOOL OK;
1117            switch (c)
1118              {
1119            case 0x000a:            case 0x000a:
1120            case 0x000b:            case 0x000b:
1121            case 0x000c:            case 0x000c:
1122              case 0x000d:
1123            case 0x0085:            case 0x0085:
1124            case 0x2028:            case 0x2028:
1125            case 0x2029:            case 0x2029:
1126            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            OK = TRUE;
1127              break;
1128    
1129              default:
1130              OK = FALSE;
1131              break;
1132              }
1133    
1134            if (OK == (d == OP_VSPACE))
1135              {
1136              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1137              {              {
1138              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1139              next_active_state--;              next_active_state--;
1140              }              }
1141            count++;            count++;
1142            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, 0);
1143              }
1144            }
1145          break;
1146    
1147          /*-----------------------------------------------------------------*/
1148          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1149          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1150          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1151          count = current_state->count;  /* Already matched */
1152          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1153          if (clen > 0)
1154            {
1155            BOOL OK;
1156            switch (c)
1157              {
1158              case 0x09:      /* HT */
1159              case 0x20:      /* SPACE */
1160              case 0xa0:      /* NBSP */
1161              case 0x1680:    /* OGHAM SPACE MARK */
1162              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1163              case 0x2000:    /* EN QUAD */
1164              case 0x2001:    /* EM QUAD */
1165              case 0x2002:    /* EN SPACE */
1166              case 0x2003:    /* EM SPACE */
1167              case 0x2004:    /* THREE-PER-EM SPACE */
1168              case 0x2005:    /* FOUR-PER-EM SPACE */
1169              case 0x2006:    /* SIX-PER-EM SPACE */
1170              case 0x2007:    /* FIGURE SPACE */
1171              case 0x2008:    /* PUNCTUATION SPACE */
1172              case 0x2009:    /* THIN SPACE */
1173              case 0x200A:    /* HAIR SPACE */
1174              case 0x202f:    /* NARROW NO-BREAK SPACE */
1175              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1176              case 0x3000:    /* IDEOGRAPHIC SPACE */
1177              OK = TRUE;
1178            break;            break;
1179    
1180            default:            default:
1181              OK = FALSE;
1182            break;            break;
1183            }            }
1184    
1185            if (OK == (d == OP_HSPACE))
1186              {
1187              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1188                {
1189                active_count--;           /* Remove non-match possibility */
1190                next_active_state--;
1191                }
1192              count++;
1193              ADD_NEW_DATA(-state_offset, count, 0);
1194              }
1195          }          }
1196        break;        break;
1197    
1198        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1199    #ifdef SUPPORT_UCP
1200        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
1201        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1202        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
# Line 1102  for (;;) Line 1214  for (;;)
1214        if (clen > 0)        if (clen > 0)
1215          {          {
1216          BOOL OK;          BOOL OK;
1217          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1218          switch(code[2])          switch(code[2])
1219            {            {
1220            case PT_ANY:            case PT_ANY:
# Line 1110  for (;;) Line 1222  for (;;)
1222            break;            break;
1223    
1224            case PT_LAMP:            case PT_LAMP:
1225            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1226            break;            break;
1227    
1228            case PT_GC:            case PT_GC:
1229            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1230            break;            break;
1231    
1232            case PT_PC:            case PT_PC:
1233            OK = chartype == code[3];            OK = prop->chartype == code[3];
1234            break;            break;
1235    
1236            case PT_SC:            case PT_SC:
1237            OK = script == code[3];            OK = prop->script == code[3];
1238            break;            break;
1239    
1240            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1160  for (;;) Line 1272  for (;;)
1272        QS2:        QS2:
1273    
1274        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1275        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1276          {          {
1277          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1278          int ncount = 0;          int ncount = 0;
# Line 1175  for (;;) Line 1287  for (;;)
1287            int nd;            int nd;
1288            int ndlen = 1;            int ndlen = 1;
1289            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1290            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1291            ncount++;            ncount++;
1292            nptr += ndlen;            nptr += ndlen;
1293            }            }
1294          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1295          }          }
1296        break;        break;
1297    #endif
1298    
1299        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1300        case OP_ANYNL_EXTRA + OP_TYPEQUERY:        case OP_ANYNL_EXTRA + OP_TYPEQUERY:
# Line 1202  for (;;) Line 1315  for (;;)
1315          int ncount = 0;          int ncount = 0;
1316          switch (c)          switch (c)
1317            {            {
1318              case 0x000b:
1319              case 0x000c:
1320              case 0x0085:
1321              case 0x2028:
1322              case 0x2029:
1323              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1324              goto ANYNL02;
1325    
1326            case 0x000d:            case 0x000d:
1327            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1328            /* Fall through */            /* Fall through */
1329    
1330              ANYNL02:
1331              case 0x000a:
1332              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1333                  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1334                {
1335                active_count--;           /* Remove non-match possibility */
1336                next_active_state--;
1337                }
1338              ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1339              break;
1340    
1341              default:
1342              break;
1343              }
1344            }
1345          break;
1346    
1347          /*-----------------------------------------------------------------*/
1348          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1349          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1350          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1351          count = 2;
1352          goto QS4;
1353    
1354          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1355          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1356          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1357          count = 0;
1358    
1359          QS4:
1360          ADD_ACTIVE(state_offset + 2, 0);
1361          if (clen > 0)
1362            {
1363            BOOL OK;
1364            switch (c)
1365              {
1366            case 0x000a:            case 0x000a:
1367            case 0x000b:            case 0x000b:
1368            case 0x000c:            case 0x000c:
1369              case 0x000d:
1370            case 0x0085:            case 0x0085:
1371            case 0x2028:            case 0x2028:
1372            case 0x2029:            case 0x2029:
1373            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            OK = TRUE;
1374                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)            break;
1375    
1376              default:
1377              OK = FALSE;
1378              break;
1379              }
1380            if (OK == (d == OP_VSPACE))
1381              {
1382              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1383                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1384              {              {
1385              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1386              next_active_state--;              next_active_state--;
1387              }              }
1388            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, 0);
1389              }
1390            }
1391          break;
1392    
1393          /*-----------------------------------------------------------------*/
1394          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1395          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1396          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1397          count = 2;
1398          goto QS5;
1399    
1400          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1401          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1402          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1403          count = 0;
1404    
1405          QS5:
1406          ADD_ACTIVE(state_offset + 2, 0);
1407          if (clen > 0)
1408            {
1409            BOOL OK;
1410            switch (c)
1411              {
1412              case 0x09:      /* HT */
1413              case 0x20:      /* SPACE */
1414              case 0xa0:      /* NBSP */
1415              case 0x1680:    /* OGHAM SPACE MARK */
1416              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1417              case 0x2000:    /* EN QUAD */
1418              case 0x2001:    /* EM QUAD */
1419              case 0x2002:    /* EN SPACE */
1420              case 0x2003:    /* EM SPACE */
1421              case 0x2004:    /* THREE-PER-EM SPACE */
1422              case 0x2005:    /* FOUR-PER-EM SPACE */
1423              case 0x2006:    /* SIX-PER-EM SPACE */
1424              case 0x2007:    /* FIGURE SPACE */
1425              case 0x2008:    /* PUNCTUATION SPACE */
1426              case 0x2009:    /* THIN SPACE */
1427              case 0x200A:    /* HAIR SPACE */
1428              case 0x202f:    /* NARROW NO-BREAK SPACE */
1429              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1430              case 0x3000:    /* IDEOGRAPHIC SPACE */
1431              OK = TRUE;
1432            break;            break;
1433    
1434            default:            default:
1435              OK = FALSE;
1436            break;            break;
1437            }            }
1438    
1439            if (OK == (d == OP_HSPACE))
1440              {
1441              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1442                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1443                {
1444                active_count--;           /* Remove non-match possibility */
1445                next_active_state--;
1446                }
1447              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1448              }
1449          }          }
1450        break;        break;
1451    
1452        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1453    #ifdef SUPPORT_UCP
1454        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_PROP_EXTRA + OP_TYPEEXACT:
1455        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_PROP_EXTRA + OP_TYPEUPTO:
1456        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
# Line 1236  for (;;) Line 1461  for (;;)
1461        if (clen > 0)        if (clen > 0)
1462          {          {
1463          BOOL OK;          BOOL OK;
1464          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1465          switch(code[4])          switch(code[4])
1466            {            {
1467            case PT_ANY:            case PT_ANY:
# Line 1244  for (;;) Line 1469  for (;;)
1469            break;            break;
1470    
1471            case PT_LAMP:            case PT_LAMP:
1472            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1473            break;            break;
1474    
1475            case PT_GC:            case PT_GC:
1476            OK = category == code[5];            OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1477            break;            break;
1478    
1479            case PT_PC:            case PT_PC:
1480            OK = chartype == code[5];            OK = prop->chartype == code[5];
1481            break;            break;
1482    
1483            case PT_SC:            case PT_SC:
1484            OK = script == code[5];            OK = prop->script == code[5];
1485            break;            break;
1486    
1487            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1289  for (;;) Line 1514  for (;;)
1514        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1515          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1516        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1517        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1518          {          {
1519          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1520          int ncount = 0;          int ncount = 0;
# Line 1303  for (;;) Line 1528  for (;;)
1528            int nd;            int nd;
1529            int ndlen = 1;            int ndlen = 1;
1530            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1531            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1532            ncount++;            ncount++;
1533            nptr += ndlen;            nptr += ndlen;
1534            }            }
# Line 1313  for (;;) Line 1538  for (;;)
1538            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1539          }          }
1540        break;        break;
1541    #endif
1542    
1543        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1544        case OP_ANYNL_EXTRA + OP_TYPEEXACT:        case OP_ANYNL_EXTRA + OP_TYPEEXACT:
# Line 1327  for (;;) Line 1553  for (;;)
1553          int ncount = 0;          int ncount = 0;
1554          switch (c)          switch (c)
1555            {            {
1556              case 0x000b:
1557              case 0x000c:
1558              case 0x0085:
1559              case 0x2028:
1560              case 0x2029:
1561              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1562              goto ANYNL03;
1563    
1564            case 0x000d:            case 0x000d:
1565            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1566            /* Fall through */            /* Fall through */
1567    
1568              ANYNL03:
1569              case 0x000a:
1570              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1571                {
1572                active_count--;           /* Remove non-match possibility */
1573                next_active_state--;
1574                }
1575              if (++count >= GET2(code, 1))
1576                { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1577              else
1578                { ADD_NEW_DATA(-state_offset, count, ncount); }
1579              break;
1580    
1581              default:
1582              break;
1583              }
1584            }
1585          break;
1586    
1587          /*-----------------------------------------------------------------*/
1588          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1589          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1590          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1591          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1592          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1593            { ADD_ACTIVE(state_offset + 4, 0); }
1594          count = current_state->count;  /* Number already matched */
1595          if (clen > 0)
1596            {
1597            BOOL OK;
1598            switch (c)
1599              {
1600            case 0x000a:            case 0x000a:
1601            case 0x000b:            case 0x000b:
1602            case 0x000c:            case 0x000c:
1603              case 0x000d:
1604            case 0x0085:            case 0x0085:
1605            case 0x2028:            case 0x2028:
1606            case 0x2029:            case 0x2029:
1607            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            OK = TRUE;
1608              break;
1609    
1610              default:
1611              OK = FALSE;
1612              }
1613    
1614            if (OK == (d == OP_VSPACE))
1615              {
1616              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1617              {              {
1618              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1619              next_active_state--;              next_active_state--;
1620              }              }
1621            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1622              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1623            else            else
1624              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1625              }
1626            }
1627          break;
1628    
1629          /*-----------------------------------------------------------------*/
1630          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1631          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1632          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1633          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1634          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1635            { ADD_ACTIVE(state_offset + 4, 0); }
1636          count = current_state->count;  /* Number already matched */
1637          if (clen > 0)
1638            {
1639            BOOL OK;
1640            switch (c)
1641              {
1642              case 0x09:      /* HT */
1643              case 0x20:      /* SPACE */
1644              case 0xa0:      /* NBSP */
1645              case 0x1680:    /* OGHAM SPACE MARK */
1646              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1647              case 0x2000:    /* EN QUAD */
1648              case 0x2001:    /* EM QUAD */
1649              case 0x2002:    /* EN SPACE */
1650              case 0x2003:    /* EM SPACE */
1651              case 0x2004:    /* THREE-PER-EM SPACE */
1652              case 0x2005:    /* FOUR-PER-EM SPACE */
1653              case 0x2006:    /* SIX-PER-EM SPACE */
1654              case 0x2007:    /* FIGURE SPACE */
1655              case 0x2008:    /* PUNCTUATION SPACE */
1656              case 0x2009:    /* THIN SPACE */
1657              case 0x200A:    /* HAIR SPACE */
1658              case 0x202f:    /* NARROW NO-BREAK SPACE */
1659              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1660              case 0x3000:    /* IDEOGRAPHIC SPACE */
1661              OK = TRUE;
1662            break;            break;
1663    
1664            default:            default:
1665              OK = FALSE;
1666            break;            break;
1667            }            }
1668    
1669            if (OK == (d == OP_HSPACE))
1670              {
1671              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1672                {
1673                active_count--;           /* Remove non-match possibility */
1674                next_active_state--;
1675                }
1676              if (++count >= GET2(code, 1))
1677                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1678              else
1679                { ADD_NEW_DATA(-state_offset, count, 0); }
1680              }
1681          }          }
1682        break;        break;
1683    
# Line 1379  for (;;) Line 1708  for (;;)
1708            other case of the character. */            other case of the character. */
1709    
1710  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1711            othercase = _pcre_ucp_othercase(c);            othercase = UCD_OTHERCASE(c);
1712  #else  #else
1713            othercase = NOTACHAR;            othercase = NOTACHAR;
1714  #endif  #endif
# Line 1404  for (;;) Line 1733  for (;;)
1733        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
1734    
1735        case OP_EXTUNI:        case OP_EXTUNI:
1736        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1737          {          {
1738          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1739          int ncount = 0;          int ncount = 0;
# Line 1412  for (;;) Line 1741  for (;;)
1741            {            {
1742            int nclen = 1;            int nclen = 1;
1743            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
1744            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
1745            ncount++;            ncount++;
1746            nptr += nclen;            nptr += nclen;
1747            }            }
# Line 1429  for (;;) Line 1758  for (;;)
1758        case OP_ANYNL:        case OP_ANYNL:
1759        if (clen > 0) switch(c)        if (clen > 0) switch(c)
1760          {          {
         case 0x000a:  
1761          case 0x000b:          case 0x000b:
1762          case 0x000c:          case 0x000c:
1763          case 0x0085:          case 0x0085:
1764          case 0x2028:          case 0x2028:
1765          case 0x2029:          case 0x2029:
1766            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1767    
1768            case 0x000a:
1769          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
1770          break;          break;
1771    
1772          case 0x000d:          case 0x000d:
1773          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1774            {            {
# Line 1451  for (;;) Line 1783  for (;;)
1783        break;        break;
1784    
1785        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1786          case OP_NOT_VSPACE:
1787          if (clen > 0) switch(c)
1788            {
1789            case 0x000a:
1790            case 0x000b:
1791            case 0x000c:
1792            case 0x000d:
1793            case 0x0085:
1794            case 0x2028:
1795            case 0x2029:
1796            break;
1797    
1798            default:
1799            ADD_NEW(state_offset + 1, 0);
1800            break;
1801            }
1802          break;
1803    
1804          /*-----------------------------------------------------------------*/
1805          case OP_VSPACE:
1806          if (clen > 0) switch(c)
1807            {
1808            case 0x000a:
1809            case 0x000b:
1810            case 0x000c:
1811            case 0x000d:
1812            case 0x0085:
1813            case 0x2028:
1814            case 0x2029:
1815            ADD_NEW(state_offset + 1, 0);
1816            break;
1817    
1818            default: break;
1819            }
1820          break;
1821    
1822          /*-----------------------------------------------------------------*/
1823          case OP_NOT_HSPACE:
1824          if (clen > 0) switch(c)
1825            {
1826            case 0x09:      /* HT */
1827            case 0x20:      /* SPACE */
1828            case 0xa0:      /* NBSP */
1829            case 0x1680:    /* OGHAM SPACE MARK */
1830            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1831            case 0x2000:    /* EN QUAD */
1832            case 0x2001:    /* EM QUAD */
1833            case 0x2002:    /* EN SPACE */
1834            case 0x2003:    /* EM SPACE */
1835            case 0x2004:    /* THREE-PER-EM SPACE */
1836            case 0x2005:    /* FOUR-PER-EM SPACE */
1837            case 0x2006:    /* SIX-PER-EM SPACE */
1838            case 0x2007:    /* FIGURE SPACE */
1839            case 0x2008:    /* PUNCTUATION SPACE */
1840            case 0x2009:    /* THIN SPACE */
1841            case 0x200A:    /* HAIR SPACE */
1842            case 0x202f:    /* NARROW NO-BREAK SPACE */
1843            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1844            case 0x3000:    /* IDEOGRAPHIC SPACE */
1845            break;
1846    
1847            default:
1848            ADD_NEW(state_offset + 1, 0);
1849            break;
1850            }
1851          break;
1852    
1853          /*-----------------------------------------------------------------*/
1854          case OP_HSPACE:
1855          if (clen > 0) switch(c)
1856            {
1857            case 0x09:      /* HT */
1858            case 0x20:      /* SPACE */
1859            case 0xa0:      /* NBSP */
1860            case 0x1680:    /* OGHAM SPACE MARK */
1861            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1862            case 0x2000:    /* EN QUAD */
1863            case 0x2001:    /* EM QUAD */
1864            case 0x2002:    /* EN SPACE */
1865            case 0x2003:    /* EM SPACE */
1866            case 0x2004:    /* THREE-PER-EM SPACE */
1867            case 0x2005:    /* FOUR-PER-EM SPACE */
1868            case 0x2006:    /* SIX-PER-EM SPACE */
1869            case 0x2007:    /* FIGURE SPACE */
1870            case 0x2008:    /* PUNCTUATION SPACE */
1871            case 0x2009:    /* THIN SPACE */
1872            case 0x200A:    /* HAIR SPACE */
1873            case 0x202f:    /* NARROW NO-BREAK SPACE */
1874            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1875            case 0x3000:    /* IDEOGRAPHIC SPACE */
1876            ADD_NEW(state_offset + 1, 0);
1877            break;
1878            }
1879          break;
1880    
1881          /*-----------------------------------------------------------------*/
1882        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character. This is only used for one-byte
1883        characters, that is, we know that d < 256. The character we are        characters, that is, we know that d < 256. The character we are
1884        checking (c) can be multibyte. */        checking (c) can be multibyte. */
# Line 1481  for (;;) Line 1909  for (;;)
1909            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1910              {              {
1911  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1912              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1913  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1914              }              }
1915            else            else
# Line 1519  for (;;) Line 1947  for (;;)
1947            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1948              {              {
1949  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1950              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1951  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1952              }              }
1953            else            else
# Line 1555  for (;;) Line 1983  for (;;)
1983            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1984              {              {
1985  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1986              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1987  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1988              }              }
1989            else            else
# Line 1587  for (;;) Line 2015  for (;;)
2015            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2016              {              {
2017  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2018              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2019  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2020              }              }
2021            else            else
# Line 1622  for (;;) Line 2050  for (;;)
2050            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2051              {              {
2052  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2053              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2054  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2055              }              }
2056            else            else
# Line 1730  for (;;) Line 2158  for (;;)
2158    
2159  /* ========================================================================== */  /* ========================================================================== */
2160        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2161        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2162          (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2163          though the other "backtracking verbs" are not supported. */
2164    
2165          case OP_FAIL:
2166          break;
2167    
2168        case OP_ASSERT:        case OP_ASSERT:
2169        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 2073  Returns:          > 0 => number of match Line 2506  Returns:          > 0 => number of match
2506                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2507  */  */
2508    
2509  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2510  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2511    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
2512    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 2163  md->end_subject = end_subject; Line 2596  md->end_subject = end_subject;
2596  md->moptions = options;  md->moptions = options;
2597  md->poptions = re->options;  md->poptions = re->options;
2598    
2599    /* If the BSR option is not set at match time, copy what was set
2600    at compile time. */
2601    
2602    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2603      {
2604      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2605        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2606    #ifdef BSR_ANYCRLF
2607      else md->moptions |= PCRE_BSR_ANYCRLF;
2608    #endif
2609      }
2610    
2611  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
2612  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
2613    
# Line 2170  switch ((((options & PCRE_NEWLINE_BITS) Line 2615  switch ((((options & PCRE_NEWLINE_BITS)
2615           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2616    {    {
2617    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2618    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2619    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2620    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2621         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2622    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2623    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2624    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2233  if (md->tables == NULL) md->tables = _pc Line 2678  if (md->tables == NULL) md->tables = _pc
2678  used in a loop when finding where to start. */  used in a loop when finding where to start. */
2679    
2680  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
2681  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
2682  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
2683    
2684  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2244  studied, there may be a bitmap of possib Line 2689  studied, there may be a bitmap of possib
2689    
2690  if (!anchored)  if (!anchored)
2691    {    {
2692    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
2693      {      {
2694      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
2695      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 2261  if (!anchored) Line 2706  if (!anchored)
2706  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
2707  character" set. */  character" set. */
2708    
2709  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
2710    {    {
2711    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
2712    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 2269  if ((re->options & PCRE_REQCHSET) != 0) Line 2714  if ((re->options & PCRE_REQCHSET) != 0)
2714    }    }
2715    
2716  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2717  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2718  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2719    
2720  for (;;)  for (;;)
2721    {    {
# Line 2281  for (;;) Line 2725  for (;;)
2725      {      {
2726      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2727    
2728      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2729      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2730      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2731      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2732    
2733      if (firstline)      if (firstline)
2734        {        {
2735        const uschar *t = current_subject;        USPTR t = current_subject;
2736    #ifdef SUPPORT_UTF8
2737          if (utf8)
2738            {
2739            while (t < md->end_subject && !IS_NEWLINE(t))
2740              {
2741              t++;
2742              while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2743              }
2744            }
2745          else
2746    #endif
2747        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2748        end_subject = t;        end_subject = t;
2749        }        }
2750    
2751      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2752        starting point is not found, or if a known later character is not present.
2753        However, there is an option that disables these, for testing and for
2754        ensuring that all callouts do actually occur. */
2755    
2756        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2757        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2758    
2759      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2760    
2761      else if (startline)        if (first_byte >= 0)
       {  
       if (current_subject > md->start_subject + start_offset)  
2762          {          {
2763          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (first_byte_caseless)
2764            current_subject++;            while (current_subject < end_subject &&
2765                     lcc[*current_subject] != first_byte)
2766                current_subject++;
2767            else
2768              while (current_subject < end_subject &&
2769                     *current_subject != first_byte)
2770                current_subject++;
2771            }
2772    
2773          /* Or to just after a linebreak for a multiline match if possible */
2774    
2775          /* If we have just passed a CR and the newline option is ANY or        else if (startline)
2776          ANYCRLF, and we are now at a LF, advance the match position by one more          {
2777          character. */          if (current_subject > md->start_subject + start_offset)
2778              {
2779          if (current_subject[-1] == '\r' &&  #ifdef SUPPORT_UTF8
2780               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&            if (utf8)
2781               current_subject < end_subject &&              {
2782               *current_subject == '\n')              while (current_subject < end_subject &&
2783            current_subject++;                     !WAS_NEWLINE(current_subject))
2784                  {
2785                  current_subject++;
2786                  while(current_subject < end_subject &&
2787                        (*current_subject & 0xc0) == 0x80)
2788                    current_subject++;
2789                  }
2790                }
2791              else
2792    #endif
2793              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2794                current_subject++;
2795    
2796              /* If we have just passed a CR and the newline option is ANY or
2797              ANYCRLF, and we are now at a LF, advance the match position by one
2798              more character. */
2799    
2800              if (current_subject[-1] == CHAR_CR &&
2801                   (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2802                   current_subject < end_subject &&
2803                   *current_subject == CHAR_NL)
2804                current_subject++;
2805              }
2806          }          }
       }  
2807    
2808      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2809    
2810      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
2811          {          {
2812          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2813          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2814            else break;            register unsigned int c = *current_subject;
2815              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2816                else break;
2817              }
2818          }          }
2819        }        }
2820    
# Line 2356  for (;;) Line 2836  for (;;)
2836    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2837    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2838    
2839    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2840    */    also be explicitly deactivated. */
2841    
2842    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2843          req_byte >= 0 &&
2844        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2845        (options & PCRE_PARTIAL) == 0)        (options & PCRE_PARTIAL) == 0)
2846      {      {
# Line 2431  for (;;) Line 2912  for (;;)
2912      }      }
2913    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
2914    
2915    /* If we have just passed a CR and the newline option is CRLF or ANY or    /* If we have just passed a CR and we are now at a LF, and the pattern does
2916    ANYCRLF, and we are now at a LF, advance the match position by one more    not contain any explicit matches for \r or \n, and the newline option is CRLF
2917    character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2918    
2919    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
2920         (md->nltype == NLTYPE_ANY ||        current_subject < end_subject &&
2921          md->nltype == NLTYPE_ANYCRLF ||        *current_subject == CHAR_NL &&
2922          md->nllen == 2) &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2923         current_subject < end_subject &&          (md->nltype == NLTYPE_ANY ||
2924         *current_subject == '\n')           md->nltype == NLTYPE_ANYCRLF ||
2925             md->nllen == 2))
2926      current_subject++;      current_subject++;
2927    
2928    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.150  
changed lines
  Added in v.392

  ViewVC Help
Powered by ViewVC 1.1.5