/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 168 by ph10, Tue May 29 15:18:18 2007 UTC revision 397 by ph10, Fri Mar 20 19:40:08 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl-compatible, but Line 45  FSM). This is NOT Perl-compatible, but
45  applications. */  applications. */
46    
47    
48    #ifdef HAVE_CONFIG_H
49    #include "config.h"
50    #endif
51    
52  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
53  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
54  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 63  applications. */ Line 68  applications. */
68    
69  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
70  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
71  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
72    never stored, so we push them well clear of the normal opcodes. */
73    
74  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
75  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
76  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
77    #define OP_HSPACE_EXTRA     360
78    #define OP_VSPACE_EXTRA     380
79    
80    
81  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
82  character that is to be tested in some way. This makes it possible to  character that is to be tested in some way. This makes it possible to
83  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
84  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
85  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. ***NOTE*** If the start of this table is modified, the two tables
86  that follow must also be modified. */  that follow must also be modified. */
87    
88  static uschar coptable[] = {  static const uschar coptable[] = {
89    0,                             /* End                                    */    0,                             /* End                                    */
90    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
91    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
92    0, 0,                          /* Any, Anybyte                           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
93    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
94      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
95    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
96    1,                             /* Char                                   */    1,                             /* Char                                   */
97    1,                             /* Charnc                                 */    1,                             /* Charnc                                 */
# Line 122  static uschar coptable[] = { Line 131  static uschar coptable[] = {
131    0,                             /* CREF                                   */    0,                             /* CREF                                   */
132    0,                             /* RREF                                   */    0,                             /* RREF                                   */
133    0,                             /* DEF                                    */    0,                             /* DEF                                    */
134    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0,                          /* BRAZERO, BRAMINZERO                    */
135      0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
136      0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
137  };  };
138    
139  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
140  and \w */  and \w */
141    
142  static uschar toptable1[] = {  static const uschar toptable1[] = {
143    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
144    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
145    ctype_space, ctype_space,    ctype_space, ctype_space,
146    ctype_word,  ctype_word,    ctype_word,  ctype_word,
147    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
148  };  };
149    
150  static uschar toptable2[] = {  static const uschar toptable2[] = {
151    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
152    ctype_digit, 0,    ctype_digit, 0,
153    ctype_space, 0,    ctype_space, 0,
154    ctype_word,  0,    ctype_word,  0,
155    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
156  };  };
157    
158    
# Line 213  Arguments: Line 224  Arguments:
224    rlevel            function call recursion level    rlevel            function call recursion level
225    recursing         regex recursive call level    recursing         regex recursive call level
226    
227  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
228                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
229                       -1 => failed to match                       -1 => failed to match
230                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
231    
# Line 501  for (;;) Line 512  for (;;)
512      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
513      const uschar *code;      const uschar *code;
514      int state_offset = current_state->offset;      int state_offset = current_state->offset;
515      int count, codevalue;      int count, codevalue, rrc;
 #ifdef SUPPORT_UCP  
     int chartype, script;  
 #endif  
516    
517  #ifdef DEBUG  #ifdef DEBUG
518      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 559  for (;;) Line 567  for (;;)
567      permitted.      permitted.
568    
569      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
570      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long. We
571      Unfortunately, we have to take special action to deal with  \P, \p, and      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
572      \X in this case. To keep the other cases fast, convert these ones to new      this case. To keep the other cases fast, convert these ones to new opcodes.
573      opcodes. */      */
574    
575      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
576        {        {
# Line 580  for (;;) Line 588  for (;;)
588            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
589            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
590            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
591              case OP_NOT_HSPACE:
592              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
593              case OP_NOT_VSPACE:
594              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
595            default: break;            default: break;
596            }            }
597          }          }
# Line 680  for (;;) Line 692  for (;;)
692        break;        break;
693    
694        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
695          case OP_SKIPZERO:
696          code += 1 + GET(code, 2);
697          while (*code == OP_ALT) code += GET(code, 1);
698          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
699          break;
700    
701          /*-----------------------------------------------------------------*/
702        case OP_CIRC:        case OP_CIRC:
703        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
704            ((ims & PCRE_MULTILINE) != 0 &&            ((ims & PCRE_MULTILINE) != 0 &&
# Line 718  for (;;) Line 737  for (;;)
737    
738        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
739        case OP_ANY:        case OP_ANY:
740        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
741            { ADD_NEW(state_offset + 1, 0); }
742          break;
743    
744          /*-----------------------------------------------------------------*/
745          case OP_ALLANY:
746          if (clen > 0)
747          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
748        break;        break;
749    
# Line 733  for (;;) Line 758  for (;;)
758        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
759          {          {
760          if (clen == 0 ||          if (clen == 0 ||
761              (IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
762                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
763              ))              ))
764            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 798  for (;;) Line 823  for (;;)
823        if (clen > 0)        if (clen > 0)
824          {          {
825          BOOL OK;          BOOL OK;
826          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
827          switch(code[1])          switch(code[1])
828            {            {
829            case PT_ANY:            case PT_ANY:
# Line 806  for (;;) Line 831  for (;;)
831            break;            break;
832    
833            case PT_LAMP:            case PT_LAMP:
834            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
835            break;            break;
836    
837            case PT_GC:            case PT_GC:
838            OK = category == code[2];            OK = _pcre_ucp_gentype[prop->chartype] == code[2];
839            break;            break;
840    
841            case PT_PC:            case PT_PC:
842            OK = chartype == code[2];            OK = prop->chartype == code[2];
843            break;            break;
844    
845            case PT_SC:            case PT_SC:
846            OK = script == code[2];            OK = prop->script == code[2];
847            break;            break;
848    
849            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 838  for (;;) Line 863  for (;;)
863  /* ========================================================================== */  /* ========================================================================== */
864        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
865        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
866        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
867        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
868    
869        case OP_TYPEPLUS:        case OP_TYPEPLUS:
870        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 850  for (;;) Line 875  for (;;)
875          {          {
876          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
877              (c < 256 &&              (c < 256 &&
878                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
879                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
880            {            {
881            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 876  for (;;) Line 898  for (;;)
898          {          {
899          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
900              (c < 256 &&              (c < 256 &&
901                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
902                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
903            {            {
904            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 901  for (;;) Line 920  for (;;)
920          {          {
921          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
922              (c < 256 &&              (c < 256 &&
923                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
924                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
925            {            {
926            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 924  for (;;) Line 940  for (;;)
940          {          {
941          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
942              (c < 256 &&              (c < 256 &&
943                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
944                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
945            {            {
946            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
# Line 948  for (;;) Line 961  for (;;)
961          {          {
962          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
963              (c < 256 &&              (c < 256 &&
964                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
965                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
966            {            {
967            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 982  for (;;) Line 992  for (;;)
992        if (clen > 0)        if (clen > 0)
993          {          {
994          BOOL OK;          BOOL OK;
995          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
996          switch(code[2])          switch(code[2])
997            {            {
998            case PT_ANY:            case PT_ANY:
# Line 990  for (;;) Line 1000  for (;;)
1000            break;            break;
1001    
1002            case PT_LAMP:            case PT_LAMP:
1003            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1004            break;            break;
1005    
1006            case PT_GC:            case PT_GC:
1007            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1008            break;            break;
1009    
1010            case PT_PC:            case PT_PC:
1011            OK = chartype == code[3];            OK = prop->chartype == code[3];
1012            break;            break;
1013    
1014            case PT_SC:            case PT_SC:
1015            OK = script == code[3];            OK = prop->script == code[3];
1016            break;            break;
1017    
1018            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1031  for (;;) Line 1041  for (;;)
1041        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1042        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1043        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1044        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1045          {          {
1046          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1047          int ncount = 0;          int ncount = 0;
# Line 1045  for (;;) Line 1055  for (;;)
1055            int nd;            int nd;
1056            int ndlen = 1;            int ndlen = 1;
1057            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1058            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1059            ncount++;            ncount++;
1060            nptr += ndlen;            nptr += ndlen;
1061            }            }
# Line 1066  for (;;) Line 1076  for (;;)
1076          int ncount = 0;          int ncount = 0;
1077          switch (c)          switch (c)
1078            {            {
1079              case 0x000b:
1080              case 0x000c:
1081              case 0x0085:
1082              case 0x2028:
1083              case 0x2029:
1084              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1085              goto ANYNL01;
1086    
1087            case 0x000d:            case 0x000d:
1088            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1089            /* Fall through */            /* Fall through */
1090    
1091              ANYNL01:
1092              case 0x000a:
1093              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1094                {
1095                active_count--;           /* Remove non-match possibility */
1096                next_active_state--;
1097                }
1098              count++;
1099              ADD_NEW_DATA(-state_offset, count, ncount);
1100              break;
1101    
1102              default:
1103              break;
1104              }
1105            }
1106          break;
1107    
1108          /*-----------------------------------------------------------------*/
1109          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1110          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1111          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1112          count = current_state->count;  /* Already matched */
1113          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1114          if (clen > 0)
1115            {
1116            BOOL OK;
1117            switch (c)
1118              {
1119            case 0x000a:            case 0x000a:
1120            case 0x000b:            case 0x000b:
1121            case 0x000c:            case 0x000c:
1122              case 0x000d:
1123            case 0x0085:            case 0x0085:
1124            case 0x2028:            case 0x2028:
1125            case 0x2029:            case 0x2029:
1126            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            OK = TRUE;
1127              break;
1128    
1129              default:
1130              OK = FALSE;
1131              break;
1132              }
1133    
1134            if (OK == (d == OP_VSPACE))
1135              {
1136              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1137              {              {
1138              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1139              next_active_state--;              next_active_state--;
1140              }              }
1141            count++;            count++;
1142            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, 0);
1143              }
1144            }
1145          break;
1146    
1147          /*-----------------------------------------------------------------*/
1148          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1149          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1150          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1151          count = current_state->count;  /* Already matched */
1152          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1153          if (clen > 0)
1154            {
1155            BOOL OK;
1156            switch (c)
1157              {
1158              case 0x09:      /* HT */
1159              case 0x20:      /* SPACE */
1160              case 0xa0:      /* NBSP */
1161              case 0x1680:    /* OGHAM SPACE MARK */
1162              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1163              case 0x2000:    /* EN QUAD */
1164              case 0x2001:    /* EM QUAD */
1165              case 0x2002:    /* EN SPACE */
1166              case 0x2003:    /* EM SPACE */
1167              case 0x2004:    /* THREE-PER-EM SPACE */
1168              case 0x2005:    /* FOUR-PER-EM SPACE */
1169              case 0x2006:    /* SIX-PER-EM SPACE */
1170              case 0x2007:    /* FIGURE SPACE */
1171              case 0x2008:    /* PUNCTUATION SPACE */
1172              case 0x2009:    /* THIN SPACE */
1173              case 0x200A:    /* HAIR SPACE */
1174              case 0x202f:    /* NARROW NO-BREAK SPACE */
1175              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1176              case 0x3000:    /* IDEOGRAPHIC SPACE */
1177              OK = TRUE;
1178            break;            break;
1179    
1180            default:            default:
1181              OK = FALSE;
1182            break;            break;
1183            }            }
1184    
1185            if (OK == (d == OP_HSPACE))
1186              {
1187              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1188                {
1189                active_count--;           /* Remove non-match possibility */
1190                next_active_state--;
1191                }
1192              count++;
1193              ADD_NEW_DATA(-state_offset, count, 0);
1194              }
1195          }          }
1196        break;        break;
1197    
# Line 1108  for (;;) Line 1214  for (;;)
1214        if (clen > 0)        if (clen > 0)
1215          {          {
1216          BOOL OK;          BOOL OK;
1217          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1218          switch(code[2])          switch(code[2])
1219            {            {
1220            case PT_ANY:            case PT_ANY:
# Line 1116  for (;;) Line 1222  for (;;)
1222            break;            break;
1223    
1224            case PT_LAMP:            case PT_LAMP:
1225            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1226            break;            break;
1227    
1228            case PT_GC:            case PT_GC:
1229            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1230            break;            break;
1231    
1232            case PT_PC:            case PT_PC:
1233            OK = chartype == code[3];            OK = prop->chartype == code[3];
1234            break;            break;
1235    
1236            case PT_SC:            case PT_SC:
1237            OK = script == code[3];            OK = prop->script == code[3];
1238            break;            break;
1239    
1240            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1166  for (;;) Line 1272  for (;;)
1272        QS2:        QS2:
1273    
1274        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1275        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1276          {          {
1277          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1278          int ncount = 0;          int ncount = 0;
# Line 1181  for (;;) Line 1287  for (;;)
1287            int nd;            int nd;
1288            int ndlen = 1;            int ndlen = 1;
1289            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1290            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1291            ncount++;            ncount++;
1292            nptr += ndlen;            nptr += ndlen;
1293            }            }
# Line 1209  for (;;) Line 1315  for (;;)
1315          int ncount = 0;          int ncount = 0;
1316          switch (c)          switch (c)
1317            {            {
1318              case 0x000b:
1319              case 0x000c:
1320              case 0x0085:
1321              case 0x2028:
1322              case 0x2029:
1323              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1324              goto ANYNL02;
1325    
1326            case 0x000d:            case 0x000d:
1327            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1328            /* Fall through */            /* Fall through */
1329    
1330              ANYNL02:
1331              case 0x000a:
1332              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1333                  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1334                {
1335                active_count--;           /* Remove non-match possibility */
1336                next_active_state--;
1337                }
1338              ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1339              break;
1340    
1341              default:
1342              break;
1343              }
1344            }
1345          break;
1346    
1347          /*-----------------------------------------------------------------*/
1348          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1349          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1350          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1351          count = 2;
1352          goto QS4;
1353    
1354          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1355          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1356          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1357          count = 0;
1358    
1359          QS4:
1360          ADD_ACTIVE(state_offset + 2, 0);
1361          if (clen > 0)
1362            {
1363            BOOL OK;
1364            switch (c)
1365              {
1366            case 0x000a:            case 0x000a:
1367            case 0x000b:            case 0x000b:
1368            case 0x000c:            case 0x000c:
1369              case 0x000d:
1370            case 0x0085:            case 0x0085:
1371            case 0x2028:            case 0x2028:
1372            case 0x2029:            case 0x2029:
1373            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            OK = TRUE;
1374                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)            break;
1375    
1376              default:
1377              OK = FALSE;
1378              break;
1379              }
1380            if (OK == (d == OP_VSPACE))
1381              {
1382              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1383                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1384              {              {
1385              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1386              next_active_state--;              next_active_state--;
1387              }              }
1388            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, 0);
1389              }
1390            }
1391          break;
1392    
1393          /*-----------------------------------------------------------------*/
1394          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1395          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1396          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1397          count = 2;
1398          goto QS5;
1399    
1400          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1401          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1402          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1403          count = 0;
1404    
1405          QS5:
1406          ADD_ACTIVE(state_offset + 2, 0);
1407          if (clen > 0)
1408            {
1409            BOOL OK;
1410            switch (c)
1411              {
1412              case 0x09:      /* HT */
1413              case 0x20:      /* SPACE */
1414              case 0xa0:      /* NBSP */
1415              case 0x1680:    /* OGHAM SPACE MARK */
1416              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1417              case 0x2000:    /* EN QUAD */
1418              case 0x2001:    /* EM QUAD */
1419              case 0x2002:    /* EN SPACE */
1420              case 0x2003:    /* EM SPACE */
1421              case 0x2004:    /* THREE-PER-EM SPACE */
1422              case 0x2005:    /* FOUR-PER-EM SPACE */
1423              case 0x2006:    /* SIX-PER-EM SPACE */
1424              case 0x2007:    /* FIGURE SPACE */
1425              case 0x2008:    /* PUNCTUATION SPACE */
1426              case 0x2009:    /* THIN SPACE */
1427              case 0x200A:    /* HAIR SPACE */
1428              case 0x202f:    /* NARROW NO-BREAK SPACE */
1429              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1430              case 0x3000:    /* IDEOGRAPHIC SPACE */
1431              OK = TRUE;
1432            break;            break;
1433    
1434            default:            default:
1435              OK = FALSE;
1436            break;            break;
1437            }            }
1438    
1439            if (OK == (d == OP_HSPACE))
1440              {
1441              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1442                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1443                {
1444                active_count--;           /* Remove non-match possibility */
1445                next_active_state--;
1446                }
1447              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1448              }
1449          }          }
1450        break;        break;
1451    
# Line 1244  for (;;) Line 1461  for (;;)
1461        if (clen > 0)        if (clen > 0)
1462          {          {
1463          BOOL OK;          BOOL OK;
1464          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1465          switch(code[4])          switch(code[4])
1466            {            {
1467            case PT_ANY:            case PT_ANY:
# Line 1252  for (;;) Line 1469  for (;;)
1469            break;            break;
1470    
1471            case PT_LAMP:            case PT_LAMP:
1472            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1473            break;            break;
1474    
1475            case PT_GC:            case PT_GC:
1476            OK = category == code[5];            OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1477            break;            break;
1478    
1479            case PT_PC:            case PT_PC:
1480            OK = chartype == code[5];            OK = prop->chartype == code[5];
1481            break;            break;
1482    
1483            case PT_SC:            case PT_SC:
1484            OK = script == code[5];            OK = prop->script == code[5];
1485            break;            break;
1486    
1487            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1297  for (;;) Line 1514  for (;;)
1514        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1515          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1516        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1517        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1518          {          {
1519          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1520          int ncount = 0;          int ncount = 0;
# Line 1311  for (;;) Line 1528  for (;;)
1528            int nd;            int nd;
1529            int ndlen = 1;            int ndlen = 1;
1530            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1531            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1532            ncount++;            ncount++;
1533            nptr += ndlen;            nptr += ndlen;
1534            }            }
# Line 1336  for (;;) Line 1553  for (;;)
1553          int ncount = 0;          int ncount = 0;
1554          switch (c)          switch (c)
1555            {            {
1556              case 0x000b:
1557              case 0x000c:
1558              case 0x0085:
1559              case 0x2028:
1560              case 0x2029:
1561              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1562              goto ANYNL03;
1563    
1564            case 0x000d:            case 0x000d:
1565            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1566            /* Fall through */            /* Fall through */
1567    
1568              ANYNL03:
1569              case 0x000a:
1570              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1571                {
1572                active_count--;           /* Remove non-match possibility */
1573                next_active_state--;
1574                }
1575              if (++count >= GET2(code, 1))
1576                { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1577              else
1578                { ADD_NEW_DATA(-state_offset, count, ncount); }
1579              break;
1580    
1581              default:
1582              break;
1583              }
1584            }
1585          break;
1586    
1587          /*-----------------------------------------------------------------*/
1588          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1589          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1590          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1591          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1592          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1593            { ADD_ACTIVE(state_offset + 4, 0); }
1594          count = current_state->count;  /* Number already matched */
1595          if (clen > 0)
1596            {
1597            BOOL OK;
1598            switch (c)
1599              {
1600            case 0x000a:            case 0x000a:
1601            case 0x000b:            case 0x000b:
1602            case 0x000c:            case 0x000c:
1603              case 0x000d:
1604            case 0x0085:            case 0x0085:
1605            case 0x2028:            case 0x2028:
1606            case 0x2029:            case 0x2029:
1607            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            OK = TRUE;
1608              break;
1609    
1610              default:
1611              OK = FALSE;
1612              }
1613    
1614            if (OK == (d == OP_VSPACE))
1615              {
1616              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1617              {              {
1618              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1619              next_active_state--;              next_active_state--;
1620              }              }
1621            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1622              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1623            else            else
1624              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1625              }
1626            }
1627          break;
1628    
1629          /*-----------------------------------------------------------------*/
1630          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1631          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1632          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1633          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1634          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1635            { ADD_ACTIVE(state_offset + 4, 0); }
1636          count = current_state->count;  /* Number already matched */
1637          if (clen > 0)
1638            {
1639            BOOL OK;
1640            switch (c)
1641              {
1642              case 0x09:      /* HT */
1643              case 0x20:      /* SPACE */
1644              case 0xa0:      /* NBSP */
1645              case 0x1680:    /* OGHAM SPACE MARK */
1646              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1647              case 0x2000:    /* EN QUAD */
1648              case 0x2001:    /* EM QUAD */
1649              case 0x2002:    /* EN SPACE */
1650              case 0x2003:    /* EM SPACE */
1651              case 0x2004:    /* THREE-PER-EM SPACE */
1652              case 0x2005:    /* FOUR-PER-EM SPACE */
1653              case 0x2006:    /* SIX-PER-EM SPACE */
1654              case 0x2007:    /* FIGURE SPACE */
1655              case 0x2008:    /* PUNCTUATION SPACE */
1656              case 0x2009:    /* THIN SPACE */
1657              case 0x200A:    /* HAIR SPACE */
1658              case 0x202f:    /* NARROW NO-BREAK SPACE */
1659              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1660              case 0x3000:    /* IDEOGRAPHIC SPACE */
1661              OK = TRUE;
1662            break;            break;
1663    
1664            default:            default:
1665              OK = FALSE;
1666            break;            break;
1667            }            }
1668    
1669            if (OK == (d == OP_HSPACE))
1670              {
1671              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1672                {
1673                active_count--;           /* Remove non-match possibility */
1674                next_active_state--;
1675                }
1676              if (++count >= GET2(code, 1))
1677                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1678              else
1679                { ADD_NEW_DATA(-state_offset, count, 0); }
1680              }
1681          }          }
1682        break;        break;
1683    
# Line 1388  for (;;) Line 1708  for (;;)
1708            other case of the character. */            other case of the character. */
1709    
1710  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1711            othercase = _pcre_ucp_othercase(c);            othercase = UCD_OTHERCASE(c);
1712  #else  #else
1713            othercase = NOTACHAR;            othercase = NOTACHAR;
1714  #endif  #endif
# Line 1413  for (;;) Line 1733  for (;;)
1733        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
1734    
1735        case OP_EXTUNI:        case OP_EXTUNI:
1736        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1737          {          {
1738          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1739          int ncount = 0;          int ncount = 0;
# Line 1421  for (;;) Line 1741  for (;;)
1741            {            {
1742            int nclen = 1;            int nclen = 1;
1743            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
1744            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
1745            ncount++;            ncount++;
1746            nptr += nclen;            nptr += nclen;
1747            }            }
# Line 1438  for (;;) Line 1758  for (;;)
1758        case OP_ANYNL:        case OP_ANYNL:
1759        if (clen > 0) switch(c)        if (clen > 0) switch(c)
1760          {          {
         case 0x000a:  
1761          case 0x000b:          case 0x000b:
1762          case 0x000c:          case 0x000c:
1763          case 0x0085:          case 0x0085:
1764          case 0x2028:          case 0x2028:
1765          case 0x2029:          case 0x2029:
1766            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1767    
1768            case 0x000a:
1769          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
1770          break;          break;
1771    
1772          case 0x000d:          case 0x000d:
1773          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1774            {            {
# Line 1460  for (;;) Line 1783  for (;;)
1783        break;        break;
1784    
1785        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1786          case OP_NOT_VSPACE:
1787          if (clen > 0) switch(c)
1788            {
1789            case 0x000a:
1790            case 0x000b:
1791            case 0x000c:
1792            case 0x000d:
1793            case 0x0085:
1794            case 0x2028:
1795            case 0x2029:
1796            break;
1797    
1798            default:
1799            ADD_NEW(state_offset + 1, 0);
1800            break;
1801            }
1802          break;
1803    
1804          /*-----------------------------------------------------------------*/
1805          case OP_VSPACE:
1806          if (clen > 0) switch(c)
1807            {
1808            case 0x000a:
1809            case 0x000b:
1810            case 0x000c:
1811            case 0x000d:
1812            case 0x0085:
1813            case 0x2028:
1814            case 0x2029:
1815            ADD_NEW(state_offset + 1, 0);
1816            break;
1817    
1818            default: break;
1819            }
1820          break;
1821    
1822          /*-----------------------------------------------------------------*/
1823          case OP_NOT_HSPACE:
1824          if (clen > 0) switch(c)
1825            {
1826            case 0x09:      /* HT */
1827            case 0x20:      /* SPACE */
1828            case 0xa0:      /* NBSP */
1829            case 0x1680:    /* OGHAM SPACE MARK */
1830            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1831            case 0x2000:    /* EN QUAD */
1832            case 0x2001:    /* EM QUAD */
1833            case 0x2002:    /* EN SPACE */
1834            case 0x2003:    /* EM SPACE */
1835            case 0x2004:    /* THREE-PER-EM SPACE */
1836            case 0x2005:    /* FOUR-PER-EM SPACE */
1837            case 0x2006:    /* SIX-PER-EM SPACE */
1838            case 0x2007:    /* FIGURE SPACE */
1839            case 0x2008:    /* PUNCTUATION SPACE */
1840            case 0x2009:    /* THIN SPACE */
1841            case 0x200A:    /* HAIR SPACE */
1842            case 0x202f:    /* NARROW NO-BREAK SPACE */
1843            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1844            case 0x3000:    /* IDEOGRAPHIC SPACE */
1845            break;
1846    
1847            default:
1848            ADD_NEW(state_offset + 1, 0);
1849            break;
1850            }
1851          break;
1852    
1853          /*-----------------------------------------------------------------*/
1854          case OP_HSPACE:
1855          if (clen > 0) switch(c)
1856            {
1857            case 0x09:      /* HT */
1858            case 0x20:      /* SPACE */
1859            case 0xa0:      /* NBSP */
1860            case 0x1680:    /* OGHAM SPACE MARK */
1861            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1862            case 0x2000:    /* EN QUAD */
1863            case 0x2001:    /* EM QUAD */
1864            case 0x2002:    /* EN SPACE */
1865            case 0x2003:    /* EM SPACE */
1866            case 0x2004:    /* THREE-PER-EM SPACE */
1867            case 0x2005:    /* FOUR-PER-EM SPACE */
1868            case 0x2006:    /* SIX-PER-EM SPACE */
1869            case 0x2007:    /* FIGURE SPACE */
1870            case 0x2008:    /* PUNCTUATION SPACE */
1871            case 0x2009:    /* THIN SPACE */
1872            case 0x200A:    /* HAIR SPACE */
1873            case 0x202f:    /* NARROW NO-BREAK SPACE */
1874            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1875            case 0x3000:    /* IDEOGRAPHIC SPACE */
1876            ADD_NEW(state_offset + 1, 0);
1877            break;
1878            }
1879          break;
1880    
1881          /*-----------------------------------------------------------------*/
1882        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character. This is only used for one-byte
1883        characters, that is, we know that d < 256. The character we are        characters, that is, we know that d < 256. The character we are
1884        checking (c) can be multibyte. */        checking (c) can be multibyte. */
# Line 1490  for (;;) Line 1909  for (;;)
1909            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1910              {              {
1911  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1912              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1913  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1914              }              }
1915            else            else
# Line 1528  for (;;) Line 1947  for (;;)
1947            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1948              {              {
1949  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1950              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1951  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1952              }              }
1953            else            else
# Line 1564  for (;;) Line 1983  for (;;)
1983            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1984              {              {
1985  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1986              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1987  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1988              }              }
1989            else            else
# Line 1596  for (;;) Line 2015  for (;;)
2015            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2016              {              {
2017  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2018              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2019  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2020              }              }
2021            else            else
# Line 1631  for (;;) Line 2050  for (;;)
2050            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2051              {              {
2052  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2053              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2054  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2055              }              }
2056            else            else
# Line 1739  for (;;) Line 2158  for (;;)
2158    
2159  /* ========================================================================== */  /* ========================================================================== */
2160        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2161        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2162          (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2163          though the other "backtracking verbs" are not supported. */
2164    
2165          case OP_FAIL:
2166          break;
2167    
2168        case OP_ASSERT:        case OP_ASSERT:
2169        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1777  for (;;) Line 2201  for (;;)
2201          {          {
2202          int local_offsets[1000];          int local_offsets[1000];
2203          int local_workspace[1000];          int local_workspace[1000];
2204          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2205            int condcode;
2206    
2207            /* Because of the way auto-callout works during compile, a callout item
2208            is inserted between OP_COND and an assertion condition. */
2209    
2210            if (code[LINK_SIZE+1] == OP_CALLOUT)
2211              {
2212              if (pcre_callout != NULL)
2213                {
2214                int rrc;
2215                pcre_callout_block cb;
2216                cb.version          = 1;   /* Version 1 of the callout block */
2217                cb.callout_number   = code[LINK_SIZE+2];
2218                cb.offset_vector    = offsets;
2219                cb.subject          = (PCRE_SPTR)start_subject;
2220                cb.subject_length   = end_subject - start_subject;
2221                cb.start_match      = current_subject - start_subject;
2222                cb.current_position = ptr - start_subject;
2223                cb.pattern_position = GET(code, LINK_SIZE + 3);
2224                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2225                cb.capture_top      = 1;
2226                cb.capture_last     = -1;
2227                cb.callout_data     = md->callout_data;
2228                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2229                if (rrc == 0) { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2230                }
2231              code += _pcre_OP_lengths[OP_CALLOUT];
2232              }
2233    
2234            condcode = code[LINK_SIZE+1];
2235    
2236          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2237    
2238          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
# Line 1787  for (;;) Line 2241  for (;;)
2241    
2242          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2243            {            {
2244            ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);            ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0);
2245            }            }
2246    
2247          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
# Line 1799  for (;;) Line 2253  for (;;)
2253            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2254            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2255            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2256              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2257            }            }
2258    
2259          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1827  for (;;) Line 2281  for (;;)
2281    
2282            if ((rc >= 0) ==            if ((rc >= 0) ==
2283                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2284              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              {
2285                ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0);
2286                }
2287            else            else
2288              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2289            }            }
2290          }          }
2291        break;        break;
# Line 1981  for (;;) Line 2437  for (;;)
2437        /* Handle callouts */        /* Handle callouts */
2438    
2439        case OP_CALLOUT:        case OP_CALLOUT:
2440          rrc = 0;
2441        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2442          {          {
         int rrc;  
2443          pcre_callout_block cb;          pcre_callout_block cb;
2444          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2445          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 1998  for (;;) Line 2454  for (;;)
2454          cb.capture_last     = -1;          cb.capture_last     = -1;
2455          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2456          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2457          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          }
2458          }        if (rrc == 0)
2459            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2460        break;        break;
2461    
2462    
# Line 2082  Returns:          > 0 => number of match Line 2539  Returns:          > 0 => number of match
2539                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2540  */  */
2541    
2542  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2543  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2544    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
2545    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 2172  md->end_subject = end_subject; Line 2629  md->end_subject = end_subject;
2629  md->moptions = options;  md->moptions = options;
2630  md->poptions = re->options;  md->poptions = re->options;
2631    
2632    /* If the BSR option is not set at match time, copy what was set
2633    at compile time. */
2634    
2635    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2636      {
2637      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2638        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2639    #ifdef BSR_ANYCRLF
2640      else md->moptions |= PCRE_BSR_ANYCRLF;
2641    #endif
2642      }
2643    
2644  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
2645  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
2646    
# Line 2179  switch ((((options & PCRE_NEWLINE_BITS) Line 2648  switch ((((options & PCRE_NEWLINE_BITS)
2648           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2649    {    {
2650    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2651    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2652    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2653    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2654         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2655    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2656    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2657    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2242  if (md->tables == NULL) md->tables = _pc Line 2711  if (md->tables == NULL) md->tables = _pc
2711  used in a loop when finding where to start. */  used in a loop when finding where to start. */
2712    
2713  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
2714  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
2715  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
2716    
2717  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2253  studied, there may be a bitmap of possib Line 2722  studied, there may be a bitmap of possib
2722    
2723  if (!anchored)  if (!anchored)
2724    {    {
2725    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
2726      {      {
2727      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
2728      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 2270  if (!anchored) Line 2739  if (!anchored)
2739  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
2740  character" set. */  character" set. */
2741    
2742  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
2743    {    {
2744    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
2745    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 2278  if ((re->options & PCRE_REQCHSET) != 0) Line 2747  if ((re->options & PCRE_REQCHSET) != 0)
2747    }    }
2748    
2749  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2750  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2751  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2752    
2753  for (;;)  for (;;)
2754    {    {
# Line 2290  for (;;) Line 2758  for (;;)
2758      {      {
2759      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2760    
2761      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2762      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2763      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2764      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2765    
2766      if (firstline)      if (firstline)
2767        {        {
2768        const uschar *t = current_subject;        USPTR t = current_subject;
2769    #ifdef SUPPORT_UTF8
2770          if (utf8)
2771            {
2772            while (t < md->end_subject && !IS_NEWLINE(t))
2773              {
2774              t++;
2775              while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2776              }
2777            }
2778          else
2779    #endif
2780        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2781        end_subject = t;        end_subject = t;
2782        }        }
2783    
2784      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2785        starting point is not found, or if a known later character is not present.
2786        However, there is an option that disables these, for testing and for
2787        ensuring that all callouts do actually occur. */
2788    
2789        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2790        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2791    
2792      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2793    
2794      else if (startline)        if (first_byte >= 0)
       {  
       if (current_subject > md->start_subject + start_offset)  
2795          {          {
2796          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (first_byte_caseless)
2797            current_subject++;            while (current_subject < end_subject &&
2798                     lcc[*current_subject] != first_byte)
2799                current_subject++;
2800            else
2801              while (current_subject < end_subject &&
2802                     *current_subject != first_byte)
2803                current_subject++;
2804            }
2805    
2806          /* Or to just after a linebreak for a multiline match if possible */
2807    
2808          /* If we have just passed a CR and the newline option is ANY or        else if (startline)
2809          ANYCRLF, and we are now at a LF, advance the match position by one more          {
2810          character. */          if (current_subject > md->start_subject + start_offset)
2811              {
2812          if (current_subject[-1] == '\r' &&  #ifdef SUPPORT_UTF8
2813               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&            if (utf8)
2814               current_subject < end_subject &&              {
2815               *current_subject == '\n')              while (current_subject < end_subject &&
2816            current_subject++;                     !WAS_NEWLINE(current_subject))
2817                  {
2818                  current_subject++;
2819                  while(current_subject < end_subject &&
2820                        (*current_subject & 0xc0) == 0x80)
2821                    current_subject++;
2822                  }
2823                }
2824              else
2825    #endif
2826              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2827                current_subject++;
2828    
2829              /* If we have just passed a CR and the newline option is ANY or
2830              ANYCRLF, and we are now at a LF, advance the match position by one
2831              more character. */
2832    
2833              if (current_subject[-1] == CHAR_CR &&
2834                   (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2835                   current_subject < end_subject &&
2836                   *current_subject == CHAR_NL)
2837                current_subject++;
2838              }
2839          }          }
       }  
2840    
2841      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2842    
2843      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
2844          {          {
2845          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2846          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2847            else break;            register unsigned int c = *current_subject;
2848              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2849                else break;
2850              }
2851          }          }
2852        }        }
2853    
# Line 2365  for (;;) Line 2869  for (;;)
2869    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2870    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2871    
2872    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2873    */    also be explicitly deactivated. */
2874    
2875    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2876          req_byte >= 0 &&
2877        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2878        (options & PCRE_PARTIAL) == 0)        (options & PCRE_PARTIAL) == 0)
2879      {      {
# Line 2440  for (;;) Line 2945  for (;;)
2945      }      }
2946    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
2947    
2948    /* If we have just passed a CR and the newline option is CRLF or ANY or    /* If we have just passed a CR and we are now at a LF, and the pattern does
2949    ANYCRLF, and we are now at a LF, advance the match position by one more    not contain any explicit matches for \r or \n, and the newline option is CRLF
2950    character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2951    
2952    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
2953         (md->nltype == NLTYPE_ANY ||        current_subject < end_subject &&
2954          md->nltype == NLTYPE_ANYCRLF ||        *current_subject == CHAR_NL &&
2955          md->nllen == 2) &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2956         current_subject < end_subject &&          (md->nltype == NLTYPE_ANY ||
2957         *current_subject == '\n')           md->nltype == NLTYPE_ANYCRLF ||
2958             md->nllen == 2))
2959      current_subject++;      current_subject++;
2960    
2961    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.168  
changed lines
  Added in v.397

  ViewVC Help
Powered by ViewVC 1.1.5