/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 178 by ph10, Wed Jun 13 08:44:34 2007 UTC | revision 361 by ph10, Thu Jul 10 16:03:28 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl- compatible, but Line 44  FSM). This is NOT Perl- compatible, but
44  applications. */  applications. */
45    
46    
47    #ifdef HAVE_CONFIG_H
48    #include "config.h"
49    #endif
50    
51  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
52  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
53  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 63  applications. */ Line 67  applications. */
67    
68  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
70  enough. The resulting opcodes don't have to be less than 256 because they are  enough. The resulting opcodes don't have to be less than 256 because they are
71  never stored, so we push them well clear of the normal opcodes. */  never stored, so we push them well clear of the normal opcodes. */
72    
73  #define OP_PROP_EXTRA       300  #define OP_PROP_EXTRA       300
# Line 80  centralize the loading of these characte Line 84  centralize the loading of these characte
84  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. ***NOTE*** If the start of this table is modified, the two tables
85  that follow must also be modified. */  that follow must also be modified. */
86    
87  static uschar coptable[] = {  static const uschar coptable[] = {
88    0,                             /* End                                    */    0,                             /* End                                    */
89    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
90    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
91    0, 0,                          /* Any, Anybyte                           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
92    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
93    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
94    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
# Line 126  static uschar coptable[] = { Line 130  static uschar coptable[] = {
130    0,                             /* CREF                                   */    0,                             /* CREF                                   */
131    0,                             /* RREF                                   */    0,                             /* RREF                                   */
132    0,                             /* DEF                                    */    0,                             /* DEF                                    */
133    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0,                          /* BRAZERO, BRAMINZERO                    */
134      0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
135      0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
136  };  };
137    
138  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139  and \w */  and \w */
140    
141  static uschar toptable1[] = {  static const uschar toptable1[] = {
142    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
143    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
144    ctype_space, ctype_space,    ctype_space, ctype_space,
145    ctype_word,  ctype_word,    ctype_word,  ctype_word,
146    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
147  };  };
148    
149  static uschar toptable2[] = {  static const uschar toptable2[] = {
150    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
151    ctype_digit, 0,    ctype_digit, 0,
152    ctype_space, 0,    ctype_space, 0,
153    ctype_word,  0,    ctype_word,  0,
154    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
155  };  };
156    
157    
# Line 217  Arguments: Line 223  Arguments:
223    rlevel            function call recursion level    rlevel            function call recursion level
224    recursing         regex recursive call level    recursing         regex recursive call level
225    
226  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
227                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
228                       -1 => failed to match                       -1 => failed to match
229                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
230    
# Line 506  for (;;) Line 512  for (;;)
512      const uschar *code;      const uschar *code;
513      int state_offset = current_state->offset;      int state_offset = current_state->offset;
514      int count, codevalue;      int count, codevalue;
 #ifdef SUPPORT_UCP  
     int chartype, script;  
 #endif  
515    
516  #ifdef DEBUG  #ifdef DEBUG
517      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 585  for (;;) Line 588  for (;;)
588            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
589            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
590            case OP_NOT_HSPACE:            case OP_NOT_HSPACE:
591            case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;            case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
592            case OP_NOT_VSPACE:            case OP_NOT_VSPACE:
593            case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;            case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
594            default: break;            default: break;
595            }            }
596          }          }
# Line 688  for (;;) Line 691  for (;;)
691        break;        break;
692    
693        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
694          case OP_SKIPZERO:
695          code += 1 + GET(code, 2);
696          while (*code == OP_ALT) code += GET(code, 1);
697          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
698          break;
699    
700          /*-----------------------------------------------------------------*/
701        case OP_CIRC:        case OP_CIRC:
702        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
703            ((ims & PCRE_MULTILINE) != 0 &&            ((ims & PCRE_MULTILINE) != 0 &&
# Line 726  for (;;) Line 736  for (;;)
736    
737        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
738        case OP_ANY:        case OP_ANY:
739        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
740            { ADD_NEW(state_offset + 1, 0); }
741          break;
742    
743          /*-----------------------------------------------------------------*/
744          case OP_ALLANY:
745          if (clen > 0)
746          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
747        break;        break;
748    
# Line 806  for (;;) Line 822  for (;;)
822        if (clen > 0)        if (clen > 0)
823          {          {
824          BOOL OK;          BOOL OK;
825          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
826          switch(code[1])          switch(code[1])
827            {            {
828            case PT_ANY:            case PT_ANY:
# Line 814  for (;;) Line 830  for (;;)
830            break;            break;
831    
832            case PT_LAMP:            case PT_LAMP:
833            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
834            break;            break;
835    
836            case PT_GC:            case PT_GC:
837            OK = category == code[2];            OK = _pcre_ucp_gentype[prop->chartype] == code[2];
838            break;            break;
839    
840            case PT_PC:            case PT_PC:
841            OK = chartype == code[2];            OK = prop->chartype == code[2];
842            break;            break;
843    
844            case PT_SC:            case PT_SC:
845            OK = script == code[2];            OK = prop->script == code[2];
846            break;            break;
847    
848            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 846  for (;;) Line 862  for (;;)
862  /* ========================================================================== */  /* ========================================================================== */
863        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
864        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
865        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
866        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
867    
868        case OP_TYPEPLUS:        case OP_TYPEPLUS:
869        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 858  for (;;) Line 874  for (;;)
874          {          {
875          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
876              (c < 256 &&              (c < 256 &&
877                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
878                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
879            {            {
880            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 884  for (;;) Line 897  for (;;)
897          {          {
898          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
899              (c < 256 &&              (c < 256 &&
900                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
901                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
902            {            {
903            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 909  for (;;) Line 919  for (;;)
919          {          {
920          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
921              (c < 256 &&              (c < 256 &&
922                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
923                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
924            {            {
925            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 932  for (;;) Line 939  for (;;)
939          {          {
940          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
941              (c < 256 &&              (c < 256 &&
942                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
943                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
944            {            {
945            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
# Line 956  for (;;) Line 960  for (;;)
960          {          {
961          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
962              (c < 256 &&              (c < 256 &&
963                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
964                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
965            {            {
966            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 990  for (;;) Line 991  for (;;)
991        if (clen > 0)        if (clen > 0)
992          {          {
993          BOOL OK;          BOOL OK;
994          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
995          switch(code[2])          switch(code[2])
996            {            {
997            case PT_ANY:            case PT_ANY:
# Line 998  for (;;) Line 999  for (;;)
999            break;            break;
1000    
1001            case PT_LAMP:            case PT_LAMP:
1002            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1003            break;            break;
1004    
1005            case PT_GC:            case PT_GC:
1006            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1007            break;            break;
1008    
1009            case PT_PC:            case PT_PC:
1010            OK = chartype == code[3];            OK = prop->chartype == code[3];
1011            break;            break;
1012    
1013            case PT_SC:            case PT_SC:
1014            OK = script == code[3];            OK = prop->script == code[3];
1015            break;            break;
1016    
1017            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1039  for (;;) Line 1040  for (;;)
1040        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1041        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1042        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1043        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1044          {          {
1045          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1046          int ncount = 0;          int ncount = 0;
# Line 1053  for (;;) Line 1054  for (;;)
1054            int nd;            int nd;
1055            int ndlen = 1;            int ndlen = 1;
1056            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1057            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1058            ncount++;            ncount++;
1059            nptr += ndlen;            nptr += ndlen;
1060            }            }
# Line 1074  for (;;) Line 1075  for (;;)
1075          int ncount = 0;          int ncount = 0;
1076          switch (c)          switch (c)
1077            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1078            case 0x000b:            case 0x000b:
1079            case 0x000c:            case 0x000c:
1080            case 0x0085:            case 0x0085:
1081            case 0x2028:            case 0x2028:
1082            case 0x2029:            case 0x2029:
1083              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1084              goto ANYNL01;
1085    
1086              case 0x000d:
1087              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1088              /* Fall through */
1089    
1090              ANYNL01:
1091              case 0x000a:
1092            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1093              {              {
1094              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1091  for (;;) Line 1097  for (;;)
1097            count++;            count++;
1098            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, ncount);
1099            break;            break;
1100    
1101            default:            default:
1102            break;            break;
1103            }            }
# Line 1105  for (;;) Line 1112  for (;;)
1112        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1113        if (clen > 0)        if (clen > 0)
1114          {          {
1115          BOOL OK;          BOOL OK;
1116          switch (c)          switch (c)
1117            {            {
1118            case 0x000a:            case 0x000a:
# Line 1116  for (;;) Line 1123  for (;;)
1123            case 0x2028:            case 0x2028:
1124            case 0x2029:            case 0x2029:
1125            OK = TRUE;            OK = TRUE;
1126            break;            break;
1127    
1128            default:            default:
1129            OK = FALSE;            OK = FALSE;
1130            break;            break;
1131            }            }
1132    
1133          if (OK == (d == OP_VSPACE))          if (OK == (d == OP_VSPACE))
1134            {            {
1135            if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1136              {              {
1137              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1144  for (;;) Line 1151  for (;;)
1151        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1152        if (clen > 0)        if (clen > 0)
1153          {          {
1154          BOOL OK;          BOOL OK;
1155          switch (c)          switch (c)
1156            {            {
1157            case 0x09:      /* HT */            case 0x09:      /* HT */
# Line 1168  for (;;) Line 1175  for (;;)
1175            case 0x3000:    /* IDEOGRAPHIC SPACE */            case 0x3000:    /* IDEOGRAPHIC SPACE */
1176            OK = TRUE;            OK = TRUE;
1177            break;            break;
1178    
1179            default:            default:
1180            OK = FALSE;            OK = FALSE;
1181            break;            break;
1182            }            }
1183    
1184          if (OK == (d == OP_HSPACE))          if (OK == (d == OP_HSPACE))
1185            {            {
1186            if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1187              {              {
1188              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1206  for (;;) Line 1213  for (;;)
1213        if (clen > 0)        if (clen > 0)
1214          {          {
1215          BOOL OK;          BOOL OK;
1216          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1217          switch(code[2])          switch(code[2])
1218            {            {
1219            case PT_ANY:            case PT_ANY:
# Line 1214  for (;;) Line 1221  for (;;)
1221            break;            break;
1222    
1223            case PT_LAMP:            case PT_LAMP:
1224            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1225            break;            break;
1226    
1227            case PT_GC:            case PT_GC:
1228            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1229            break;            break;
1230    
1231            case PT_PC:            case PT_PC:
1232            OK = chartype == code[3];            OK = prop->chartype == code[3];
1233            break;            break;
1234    
1235            case PT_SC:            case PT_SC:
1236            OK = script == code[3];            OK = prop->script == code[3];
1237            break;            break;
1238    
1239            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1264  for (;;) Line 1271  for (;;)
1271        QS2:        QS2:
1272    
1273        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1274        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1275          {          {
1276          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1277          int ncount = 0;          int ncount = 0;
# Line 1279  for (;;) Line 1286  for (;;)
1286            int nd;            int nd;
1287            int ndlen = 1;            int ndlen = 1;
1288            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1289            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1290            ncount++;            ncount++;
1291            nptr += ndlen;            nptr += ndlen;
1292            }            }
# Line 1307  for (;;) Line 1314  for (;;)
1314          int ncount = 0;          int ncount = 0;
1315          switch (c)          switch (c)
1316            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1317            case 0x000b:            case 0x000b:
1318            case 0x000c:            case 0x000c:
1319            case 0x0085:            case 0x0085:
1320            case 0x2028:            case 0x2028:
1321            case 0x2029:            case 0x2029:
1322              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323              goto ANYNL02;
1324    
1325              case 0x000d:
1326              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327              /* Fall through */
1328    
1329              ANYNL02:
1330              case 0x000a:
1331            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1332                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1333              {              {
# Line 1324  for (;;) Line 1336  for (;;)
1336              }              }
1337            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1338            break;            break;
1339    
1340            default:            default:
1341            break;            break;
1342            }            }
# Line 1346  for (;;) Line 1359  for (;;)
1359        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1360        if (clen > 0)        if (clen > 0)
1361          {          {
1362          BOOL OK;          BOOL OK;
1363          switch (c)          switch (c)
1364            {            {
1365            case 0x000a:            case 0x000a:
# Line 1358  for (;;) Line 1371  for (;;)
1371            case 0x2029:            case 0x2029:
1372            OK = TRUE;            OK = TRUE;
1373            break;            break;
1374    
1375            default:            default:
1376            OK = FALSE;            OK = FALSE;
1377            break;            break;
1378            }            }
1379          if (OK == (d == OP_VSPACE))          if (OK == (d == OP_VSPACE))
1380            {            {
1381            if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1382                codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1383              {              {
# Line 1392  for (;;) Line 1405  for (;;)
1405        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1406        if (clen > 0)        if (clen > 0)
1407          {          {
1408          BOOL OK;          BOOL OK;
1409          switch (c)          switch (c)
1410            {            {
1411            case 0x09:      /* HT */            case 0x09:      /* HT */
# Line 1416  for (;;) Line 1429  for (;;)
1429            case 0x3000:    /* IDEOGRAPHIC SPACE */            case 0x3000:    /* IDEOGRAPHIC SPACE */
1430            OK = TRUE;            OK = TRUE;
1431            break;            break;
1432    
1433            default:            default:
1434            OK = FALSE;            OK = FALSE;
1435            break;            break;
1436            }            }
1437    
1438          if (OK == (d == OP_HSPACE))          if (OK == (d == OP_HSPACE))
1439            {            {
1440            if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1441                codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1442              {              {
# Line 1447  for (;;) Line 1460  for (;;)
1460        if (clen > 0)        if (clen > 0)
1461          {          {
1462          BOOL OK;          BOOL OK;
1463          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1464          switch(code[4])          switch(code[4])
1465            {            {
1466            case PT_ANY:            case PT_ANY:
# Line 1455  for (;;) Line 1468  for (;;)
1468            break;            break;
1469    
1470            case PT_LAMP:            case PT_LAMP:
1471            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1472            break;            break;
1473    
1474            case PT_GC:            case PT_GC:
1475            OK = category == code[5];            OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1476            break;            break;
1477    
1478            case PT_PC:            case PT_PC:
1479            OK = chartype == code[5];            OK = prop->chartype == code[5];
1480            break;            break;
1481    
1482            case PT_SC:            case PT_SC:
1483            OK = script == code[5];            OK = prop->script == code[5];
1484            break;            break;
1485    
1486            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1500  for (;;) Line 1513  for (;;)
1513        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1514          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1515        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1516        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1517          {          {
1518          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1519          int ncount = 0;          int ncount = 0;
# Line 1514  for (;;) Line 1527  for (;;)
1527            int nd;            int nd;
1528            int ndlen = 1;            int ndlen = 1;
1529            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1530            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1531            ncount++;            ncount++;
1532            nptr += ndlen;            nptr += ndlen;
1533            }            }
# Line 1539  for (;;) Line 1552  for (;;)
1552          int ncount = 0;          int ncount = 0;
1553          switch (c)          switch (c)
1554            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1555            case 0x000b:            case 0x000b:
1556            case 0x000c:            case 0x000c:
1557            case 0x0085:            case 0x0085:
1558            case 0x2028:            case 0x2028:
1559            case 0x2029:            case 0x2029:
1560              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1561              goto ANYNL03;
1562    
1563              case 0x000d:
1564              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1565              /* Fall through */
1566    
1567              ANYNL03:
1568              case 0x000a:
1569            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1570              {              {
1571              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1558  for (;;) Line 1576  for (;;)
1576            else            else
1577              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, ncount); }
1578            break;            break;
1579    
1580            default:            default:
1581            break;            break;
1582            }            }
# Line 1574  for (;;) Line 1593  for (;;)
1593        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1594        if (clen > 0)        if (clen > 0)
1595          {          {
1596          BOOL OK;          BOOL OK;
1597          switch (c)          switch (c)
1598            {            {
1599            case 0x000a:            case 0x000a:
# Line 1586  for (;;) Line 1605  for (;;)
1605            case 0x2029:            case 0x2029:
1606            OK = TRUE;            OK = TRUE;
1607            break;            break;
1608    
1609            default:            default:
1610            OK = FALSE;            OK = FALSE;
1611            }            }
1612    
1613          if (OK == (d == OP_VSPACE))          if (OK == (d == OP_VSPACE))
1614            {            {
1615            if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1616              {              {
1617              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1616  for (;;) Line 1635  for (;;)
1635        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1636        if (clen > 0)        if (clen > 0)
1637          {          {
1638          BOOL OK;          BOOL OK;
1639          switch (c)          switch (c)
1640            {            {
1641            case 0x09:      /* HT */            case 0x09:      /* HT */
# Line 1640  for (;;) Line 1659  for (;;)
1659            case 0x3000:    /* IDEOGRAPHIC SPACE */            case 0x3000:    /* IDEOGRAPHIC SPACE */
1660            OK = TRUE;            OK = TRUE;
1661            break;            break;
1662    
1663            default:            default:
1664            OK = FALSE;            OK = FALSE;
1665            break;            break;
1666            }            }
1667    
1668          if (OK == (d == OP_HSPACE))          if (OK == (d == OP_HSPACE))
1669            {            {
1670            if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1671              {              {
1672              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1688  for (;;) Line 1707  for (;;)
1707            other case of the character. */            other case of the character. */
1708    
1709  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1710            othercase = _pcre_ucp_othercase(c);            othercase = UCD_OTHERCASE(c);
1711  #else  #else
1712            othercase = NOTACHAR;            othercase = NOTACHAR;
1713  #endif  #endif
# Line 1713  for (;;) Line 1732  for (;;)
1732        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
1733    
1734        case OP_EXTUNI:        case OP_EXTUNI:
1735        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1736          {          {
1737          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1738          int ncount = 0;          int ncount = 0;
# Line 1721  for (;;) Line 1740  for (;;)
1740            {            {
1741            int nclen = 1;            int nclen = 1;
1742            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
1743            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
1744            ncount++;            ncount++;
1745            nptr += nclen;            nptr += nclen;
1746            }            }
# Line 1738  for (;;) Line 1757  for (;;)
1757        case OP_ANYNL:        case OP_ANYNL:
1758        if (clen > 0) switch(c)        if (clen > 0) switch(c)
1759          {          {
         case 0x000a:  
1760          case 0x000b:          case 0x000b:
1761          case 0x000c:          case 0x000c:
1762          case 0x0085:          case 0x0085:
1763          case 0x2028:          case 0x2028:
1764          case 0x2029:          case 0x2029:
1765            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1766    
1767            case 0x000a:
1768          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
1769          break;          break;
1770    
1771          case 0x000d:          case 0x000d:
1772          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1773            {            {
# Line 1771  for (;;) Line 1793  for (;;)
1793          case 0x2028:          case 0x2028:
1794          case 0x2029:          case 0x2029:
1795          break;          break;
1796    
1797          default:          default:
1798          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
1799          break;          break;
1800          }          }
# Line 1791  for (;;) Line 1813  for (;;)
1813          case 0x2029:          case 0x2029:
1814          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
1815          break;          break;
1816    
1817          default: break;          default: break;
1818          }          }
1819        break;        break;
# Line 1820  for (;;) Line 1842  for (;;)
1842          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1843          case 0x3000:    /* IDEOGRAPHIC SPACE */          case 0x3000:    /* IDEOGRAPHIC SPACE */
1844          break;          break;
1845    
1846          default:          default:
1847          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
1848          break;          break;
1849          }          }
# Line 1886  for (;;) Line 1908  for (;;)
1908            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1909              {              {
1910  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1911              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1912  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1913              }              }
1914            else            else
# Line 1924  for (;;) Line 1946  for (;;)
1946            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1947              {              {
1948  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1949              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1950  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1951              }              }
1952            else            else
# Line 1960  for (;;) Line 1982  for (;;)
1982            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1983              {              {
1984  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1985              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1986  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1987              }              }
1988            else            else
# Line 1992  for (;;) Line 2014  for (;;)
2014            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2015              {              {
2016  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2017              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2018  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2019              }              }
2020            else            else
# Line 2027  for (;;) Line 2049  for (;;)
2049            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2050              {              {
2051  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2052              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2053  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2054              }              }
2055            else            else
# Line 2135  for (;;) Line 2157  for (;;)
2157    
2158  /* ========================================================================== */  /* ========================================================================== */
2159        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2160        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2161          (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2162          though the other "backtracking verbs" are not supported. */
2163    
2164          case OP_FAIL:
2165          break;
2166    
2167        case OP_ASSERT:        case OP_ASSERT:
2168        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 2478  Returns:          > 0 => number of match Line 2505  Returns:          > 0 => number of match
2505                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2506  */  */
2507    
2508  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2509  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2510    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
2511    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 2568  md->end_subject = end_subject; Line 2595  md->end_subject = end_subject;
2595  md->moptions = options;  md->moptions = options;
2596  md->poptions = re->options;  md->poptions = re->options;
2597    
2598    /* If the BSR option is not set at match time, copy what was set
2599    at compile time. */
2600    
2601    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2602      {
2603      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2604        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2605    #ifdef BSR_ANYCRLF
2606      else md->moptions |= PCRE_BSR_ANYCRLF;
2607    #endif
2608      }
2609    
2610  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
2611  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
2612    
# Line 2638  if (md->tables == NULL) md->tables = _pc Line 2677  if (md->tables == NULL) md->tables = _pc
2677  used in a loop when finding where to start. */  used in a loop when finding where to start. */
2678    
2679  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
2680  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
2681  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
2682    
2683  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2649  studied, there may be a bitmap of possib Line 2688  studied, there may be a bitmap of possib
2688    
2689  if (!anchored)  if (!anchored)
2690    {    {
2691    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
2692      {      {
2693      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
2694      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 2666  if (!anchored) Line 2705  if (!anchored)
2705  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
2706  character" set. */  character" set. */
2707    
2708  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
2709    {    {
2710    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
2711    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 2716  for (;;) Line 2755  for (;;)
2755        {        {
2756        if (current_subject > md->start_subject + start_offset)        if (current_subject > md->start_subject + start_offset)
2757          {          {
2758          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2759            current_subject++;            current_subject++;
2760    
2761          /* If we have just passed a CR and the newline option is ANY or          /* If we have just passed a CR and the newline option is ANY or
# Line 2836  for (;;) Line 2875  for (;;)
2875      }      }
2876    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
2877    
2878    /* If we have just passed a CR and the newline option is CRLF or ANY or    /* If we have just passed a CR and we are now at a LF, and the pattern does
2879    ANYCRLF, and we are now at a LF, advance the match position by one more    not contain any explicit matches for \r or \n, and the newline option is CRLF
2880    character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2881    
2882    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == '\r' &&
2883         (md->nltype == NLTYPE_ANY ||        current_subject < end_subject &&
2884          md->nltype == NLTYPE_ANYCRLF ||        *current_subject == '\n' &&
2885          md->nllen == 2) &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2886         current_subject < end_subject &&          (md->nltype == NLTYPE_ANY ||
2887         *current_subject == '\n')           md->nltype == NLTYPE_ANYCRLF ||
2888             md->nllen == 2))
2889      current_subject++;      current_subject++;
2890    
2891    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.178  
changed lines
  Added in v.361

  ViewVC Help
Powered by ViewVC 1.1.5