/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 231 by ph10, Tue Sep 11 11:15:33 2007 UTC revision 397 by ph10, Fri Mar 20 19:40:08 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 45  applications. */ Line 46  applications. */
46    
47    
48  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
49  #include <config.h>  #include "config.h"
50  #endif  #endif
51    
52  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
# Line 84  centralize the loading of these characte Line 85  centralize the loading of these characte
85  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. ***NOTE*** If the start of this table is modified, the two tables
86  that follow must also be modified. */  that follow must also be modified. */
87    
88  static uschar coptable[] = {  static const uschar coptable[] = {
89    0,                             /* End                                    */    0,                             /* End                                    */
90    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
91    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
92    0, 0,                          /* Any, Anybyte                           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
93    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
94    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
95    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
# Line 132  static uschar coptable[] = { Line 133  static uschar coptable[] = {
133    0,                             /* DEF                                    */    0,                             /* DEF                                    */
134    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0,                          /* BRAZERO, BRAMINZERO                    */
135    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
136    0, 0                           /* FAIL, ACCEPT                           */    0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
137  };  };
138    
139  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
140  and \w */  and \w */
141    
142  static uschar toptable1[] = {  static const uschar toptable1[] = {
143    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
144    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
145    ctype_space, ctype_space,    ctype_space, ctype_space,
146    ctype_word,  ctype_word,    ctype_word,  ctype_word,
147    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
148  };  };
149    
150  static uschar toptable2[] = {  static const uschar toptable2[] = {
151    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
152    ctype_digit, 0,    ctype_digit, 0,
153    ctype_space, 0,    ctype_space, 0,
154    ctype_word,  0,    ctype_word,  0,
155    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
156  };  };
157    
158    
# Line 223  Arguments: Line 224  Arguments:
224    rlevel            function call recursion level    rlevel            function call recursion level
225    recursing         regex recursive call level    recursing         regex recursive call level
226    
227  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
228                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
229                       -1 => failed to match                       -1 => failed to match
230                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
231    
# Line 511  for (;;) Line 512  for (;;)
512      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
513      const uschar *code;      const uschar *code;
514      int state_offset = current_state->offset;      int state_offset = current_state->offset;
515      int count, codevalue;      int count, codevalue, rrc;
 #ifdef SUPPORT_UCP  
     int chartype, script;  
 #endif  
516    
517  #ifdef DEBUG  #ifdef DEBUG
518      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 694  for (;;) Line 692  for (;;)
692        break;        break;
693    
694        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
695          case OP_SKIPZERO:
696          code += 1 + GET(code, 2);
697          while (*code == OP_ALT) code += GET(code, 1);
698          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
699          break;
700    
701          /*-----------------------------------------------------------------*/
702        case OP_CIRC:        case OP_CIRC:
703        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
704            ((ims & PCRE_MULTILINE) != 0 &&            ((ims & PCRE_MULTILINE) != 0 &&
# Line 732  for (;;) Line 737  for (;;)
737    
738        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
739        case OP_ANY:        case OP_ANY:
740        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
741            { ADD_NEW(state_offset + 1, 0); }
742          break;
743    
744          /*-----------------------------------------------------------------*/
745          case OP_ALLANY:
746          if (clen > 0)
747          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
748        break;        break;
749    
# Line 747  for (;;) Line 758  for (;;)
758        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
759          {          {
760          if (clen == 0 ||          if (clen == 0 ||
761              (IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
762                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
763              ))              ))
764            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 812  for (;;) Line 823  for (;;)
823        if (clen > 0)        if (clen > 0)
824          {          {
825          BOOL OK;          BOOL OK;
826          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
827          switch(code[1])          switch(code[1])
828            {            {
829            case PT_ANY:            case PT_ANY:
# Line 820  for (;;) Line 831  for (;;)
831            break;            break;
832    
833            case PT_LAMP:            case PT_LAMP:
834            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
835            break;            break;
836    
837            case PT_GC:            case PT_GC:
838            OK = category == code[2];            OK = _pcre_ucp_gentype[prop->chartype] == code[2];
839            break;            break;
840    
841            case PT_PC:            case PT_PC:
842            OK = chartype == code[2];            OK = prop->chartype == code[2];
843            break;            break;
844    
845            case PT_SC:            case PT_SC:
846            OK = script == code[2];            OK = prop->script == code[2];
847            break;            break;
848    
849            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 852  for (;;) Line 863  for (;;)
863  /* ========================================================================== */  /* ========================================================================== */
864        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
865        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
866        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
867        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
868    
869        case OP_TYPEPLUS:        case OP_TYPEPLUS:
870        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 864  for (;;) Line 875  for (;;)
875          {          {
876          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
877              (c < 256 &&              (c < 256 &&
878                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
879                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
880            {            {
881            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 890  for (;;) Line 898  for (;;)
898          {          {
899          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
900              (c < 256 &&              (c < 256 &&
901                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
902                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
903            {            {
904            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 915  for (;;) Line 920  for (;;)
920          {          {
921          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
922              (c < 256 &&              (c < 256 &&
923                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
924                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
925            {            {
926            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 938  for (;;) Line 940  for (;;)
940          {          {
941          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
942              (c < 256 &&              (c < 256 &&
943                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
944                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
945            {            {
946            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
# Line 962  for (;;) Line 961  for (;;)
961          {          {
962          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
963              (c < 256 &&              (c < 256 &&
964                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
965                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
966            {            {
967            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 996  for (;;) Line 992  for (;;)
992        if (clen > 0)        if (clen > 0)
993          {          {
994          BOOL OK;          BOOL OK;
995          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
996          switch(code[2])          switch(code[2])
997            {            {
998            case PT_ANY:            case PT_ANY:
# Line 1004  for (;;) Line 1000  for (;;)
1000            break;            break;
1001    
1002            case PT_LAMP:            case PT_LAMP:
1003            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1004            break;            break;
1005    
1006            case PT_GC:            case PT_GC:
1007            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1008            break;            break;
1009    
1010            case PT_PC:            case PT_PC:
1011            OK = chartype == code[3];            OK = prop->chartype == code[3];
1012            break;            break;
1013    
1014            case PT_SC:            case PT_SC:
1015            OK = script == code[3];            OK = prop->script == code[3];
1016            break;            break;
1017    
1018            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1045  for (;;) Line 1041  for (;;)
1041        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1042        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1043        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1044        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1045          {          {
1046          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1047          int ncount = 0;          int ncount = 0;
# Line 1059  for (;;) Line 1055  for (;;)
1055            int nd;            int nd;
1056            int ndlen = 1;            int ndlen = 1;
1057            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1058            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1059            ncount++;            ncount++;
1060            nptr += ndlen;            nptr += ndlen;
1061            }            }
# Line 1218  for (;;) Line 1214  for (;;)
1214        if (clen > 0)        if (clen > 0)
1215          {          {
1216          BOOL OK;          BOOL OK;
1217          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1218          switch(code[2])          switch(code[2])
1219            {            {
1220            case PT_ANY:            case PT_ANY:
# Line 1226  for (;;) Line 1222  for (;;)
1222            break;            break;
1223    
1224            case PT_LAMP:            case PT_LAMP:
1225            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1226            break;            break;
1227    
1228            case PT_GC:            case PT_GC:
1229            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1230            break;            break;
1231    
1232            case PT_PC:            case PT_PC:
1233            OK = chartype == code[3];            OK = prop->chartype == code[3];
1234            break;            break;
1235    
1236            case PT_SC:            case PT_SC:
1237            OK = script == code[3];            OK = prop->script == code[3];
1238            break;            break;
1239    
1240            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1276  for (;;) Line 1272  for (;;)
1272        QS2:        QS2:
1273    
1274        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1275        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1276          {          {
1277          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1278          int ncount = 0;          int ncount = 0;
# Line 1291  for (;;) Line 1287  for (;;)
1287            int nd;            int nd;
1288            int ndlen = 1;            int ndlen = 1;
1289            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1290            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1291            ncount++;            ncount++;
1292            nptr += ndlen;            nptr += ndlen;
1293            }            }
# Line 1465  for (;;) Line 1461  for (;;)
1461        if (clen > 0)        if (clen > 0)
1462          {          {
1463          BOOL OK;          BOOL OK;
1464          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1465          switch(code[4])          switch(code[4])
1466            {            {
1467            case PT_ANY:            case PT_ANY:
# Line 1473  for (;;) Line 1469  for (;;)
1469            break;            break;
1470    
1471            case PT_LAMP:            case PT_LAMP:
1472            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1473            break;            break;
1474    
1475            case PT_GC:            case PT_GC:
1476            OK = category == code[5];            OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1477            break;            break;
1478    
1479            case PT_PC:            case PT_PC:
1480            OK = chartype == code[5];            OK = prop->chartype == code[5];
1481            break;            break;
1482    
1483            case PT_SC:            case PT_SC:
1484            OK = script == code[5];            OK = prop->script == code[5];
1485            break;            break;
1486    
1487            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1518  for (;;) Line 1514  for (;;)
1514        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1515          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1516        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1517        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1518          {          {
1519          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1520          int ncount = 0;          int ncount = 0;
# Line 1532  for (;;) Line 1528  for (;;)
1528            int nd;            int nd;
1529            int ndlen = 1;            int ndlen = 1;
1530            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1531            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1532            ncount++;            ncount++;
1533            nptr += ndlen;            nptr += ndlen;
1534            }            }
# Line 1712  for (;;) Line 1708  for (;;)
1708            other case of the character. */            other case of the character. */
1709    
1710  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1711            othercase = _pcre_ucp_othercase(c);            othercase = UCD_OTHERCASE(c);
1712  #else  #else
1713            othercase = NOTACHAR;            othercase = NOTACHAR;
1714  #endif  #endif
# Line 1737  for (;;) Line 1733  for (;;)
1733        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
1734    
1735        case OP_EXTUNI:        case OP_EXTUNI:
1736        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1737          {          {
1738          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1739          int ncount = 0;          int ncount = 0;
# Line 1745  for (;;) Line 1741  for (;;)
1741            {            {
1742            int nclen = 1;            int nclen = 1;
1743            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
1744            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
1745            ncount++;            ncount++;
1746            nptr += nclen;            nptr += nclen;
1747            }            }
# Line 1913  for (;;) Line 1909  for (;;)
1909            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1910              {              {
1911  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1912              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1913  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1914              }              }
1915            else            else
# Line 1951  for (;;) Line 1947  for (;;)
1947            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1948              {              {
1949  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1950              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1951  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1952              }              }
1953            else            else
# Line 1987  for (;;) Line 1983  for (;;)
1983            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1984              {              {
1985  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1986              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1987  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1988              }              }
1989            else            else
# Line 2019  for (;;) Line 2015  for (;;)
2015            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2016              {              {
2017  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2018              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2019  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2020              }              }
2021            else            else
# Line 2054  for (;;) Line 2050  for (;;)
2050            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2051              {              {
2052  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2053              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2054  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2055              }              }
2056            else            else
# Line 2162  for (;;) Line 2158  for (;;)
2158    
2159  /* ========================================================================== */  /* ========================================================================== */
2160        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2161        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2162          (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2163          though the other "backtracking verbs" are not supported. */
2164    
2165          case OP_FAIL:
2166          break;
2167    
2168        case OP_ASSERT:        case OP_ASSERT:
2169        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 2200  for (;;) Line 2201  for (;;)
2201          {          {
2202          int local_offsets[1000];          int local_offsets[1000];
2203          int local_workspace[1000];          int local_workspace[1000];
2204          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2205            int condcode;
2206    
2207            /* Because of the way auto-callout works during compile, a callout item
2208            is inserted between OP_COND and an assertion condition. */
2209    
2210            if (code[LINK_SIZE+1] == OP_CALLOUT)
2211              {
2212              if (pcre_callout != NULL)
2213                {
2214                int rrc;
2215                pcre_callout_block cb;
2216                cb.version          = 1;   /* Version 1 of the callout block */
2217                cb.callout_number   = code[LINK_SIZE+2];
2218                cb.offset_vector    = offsets;
2219                cb.subject          = (PCRE_SPTR)start_subject;
2220                cb.subject_length   = end_subject - start_subject;
2221                cb.start_match      = current_subject - start_subject;
2222                cb.current_position = ptr - start_subject;
2223                cb.pattern_position = GET(code, LINK_SIZE + 3);
2224                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2225                cb.capture_top      = 1;
2226                cb.capture_last     = -1;
2227                cb.callout_data     = md->callout_data;
2228                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2229                if (rrc == 0) { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2230                }
2231              code += _pcre_OP_lengths[OP_CALLOUT];
2232              }
2233    
2234            condcode = code[LINK_SIZE+1];
2235    
2236          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2237    
2238          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
# Line 2210  for (;;) Line 2241  for (;;)
2241    
2242          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2243            {            {
2244            ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);            ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0);
2245            }            }
2246    
2247          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
# Line 2222  for (;;) Line 2253  for (;;)
2253            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2254            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2255            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2256              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2257            }            }
2258    
2259          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2250  for (;;) Line 2281  for (;;)
2281    
2282            if ((rc >= 0) ==            if ((rc >= 0) ==
2283                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2284              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              {
2285                ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0);
2286                }
2287            else            else
2288              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2289            }            }
2290          }          }
2291        break;        break;
# Line 2404  for (;;) Line 2437  for (;;)
2437        /* Handle callouts */        /* Handle callouts */
2438    
2439        case OP_CALLOUT:        case OP_CALLOUT:
2440          rrc = 0;
2441        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2442          {          {
         int rrc;  
2443          pcre_callout_block cb;          pcre_callout_block cb;
2444          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2445          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 2421  for (;;) Line 2454  for (;;)
2454          cb.capture_last     = -1;          cb.capture_last     = -1;
2455          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2456          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2457          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          }
2458          }        if (rrc == 0)
2459            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2460        break;        break;
2461    
2462    
# Line 2505  Returns:          > 0 => number of match Line 2539  Returns:          > 0 => number of match
2539                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2540  */  */
2541    
2542  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2543  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2544    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
2545    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 2604  if ((md->moptions & (PCRE_BSR_ANYCRLF|PC Line 2638  if ((md->moptions & (PCRE_BSR_ANYCRLF|PC
2638      md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);      md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2639  #ifdef BSR_ANYCRLF  #ifdef BSR_ANYCRLF
2640    else md->moptions |= PCRE_BSR_ANYCRLF;    else md->moptions |= PCRE_BSR_ANYCRLF;
2641  #endif  #endif
2642    }    }
2643    
2644  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
2645  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
# Line 2614  switch ((((options & PCRE_NEWLINE_BITS) Line 2648  switch ((((options & PCRE_NEWLINE_BITS)
2648           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2649    {    {
2650    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2651    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2652    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2653    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2654         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2655    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2656    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2657    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2713  if ((re->flags & PCRE_REQCHSET) != 0) Line 2747  if ((re->flags & PCRE_REQCHSET) != 0)
2747    }    }
2748    
2749  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2750  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2751  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2752    
2753  for (;;)  for (;;)
2754    {    {
# Line 2725  for (;;) Line 2758  for (;;)
2758      {      {
2759      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2760    
2761      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2762      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2763      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2764      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2765    
2766      if (firstline)      if (firstline)
2767        {        {
2768        const uschar *t = current_subject;        USPTR t = current_subject;
2769    #ifdef SUPPORT_UTF8
2770          if (utf8)
2771            {
2772            while (t < md->end_subject && !IS_NEWLINE(t))
2773              {
2774              t++;
2775              while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2776              }
2777            }
2778          else
2779    #endif
2780        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2781        end_subject = t;        end_subject = t;
2782        }        }
2783    
2784      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2785        starting point is not found, or if a known later character is not present.
2786        However, there is an option that disables these, for testing and for
2787        ensuring that all callouts do actually occur. */
2788    
2789        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2790        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2791    
2792      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2793    
2794      else if (startline)        if (first_byte >= 0)
       {  
       if (current_subject > md->start_subject + start_offset)  
2795          {          {
2796          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (first_byte_caseless)
2797            current_subject++;            while (current_subject < end_subject &&
2798                     lcc[*current_subject] != first_byte)
2799                current_subject++;
2800            else
2801              while (current_subject < end_subject &&
2802                     *current_subject != first_byte)
2803                current_subject++;
2804            }
2805    
2806          /* Or to just after a linebreak for a multiline match if possible */
2807    
2808          else if (startline)
2809            {
2810            if (current_subject > md->start_subject + start_offset)
2811              {
2812    #ifdef SUPPORT_UTF8
2813              if (utf8)
2814                {
2815                while (current_subject < end_subject &&
2816                       !WAS_NEWLINE(current_subject))
2817                  {
2818                  current_subject++;
2819                  while(current_subject < end_subject &&
2820                        (*current_subject & 0xc0) == 0x80)
2821                    current_subject++;
2822                  }
2823                }
2824              else
2825    #endif
2826              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2827                current_subject++;
2828    
2829          /* If we have just passed a CR and the newline option is ANY or            /* If we have just passed a CR and the newline option is ANY or
2830          ANYCRLF, and we are now at a LF, advance the match position by one more            ANYCRLF, and we are now at a LF, advance the match position by one
2831          character. */            more character. */
2832    
2833          if (current_subject[-1] == '\r' &&            if (current_subject[-1] == CHAR_CR &&
2834               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2835               current_subject < end_subject &&                 current_subject < end_subject &&
2836               *current_subject == '\n')                 *current_subject == CHAR_NL)
2837            current_subject++;              current_subject++;
2838              }
2839          }          }
       }  
2840    
2841      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2842    
2843      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
2844          {          {
2845          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2846          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2847            else break;            register unsigned int c = *current_subject;
2848              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2849                else break;
2850              }
2851          }          }
2852        }        }
2853    
# Line 2800  for (;;) Line 2869  for (;;)
2869    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2870    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2871    
2872    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2873    */    also be explicitly deactivated. */
2874    
2875    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2876          req_byte >= 0 &&
2877        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2878        (options & PCRE_PARTIAL) == 0)        (options & PCRE_PARTIAL) == 0)
2879      {      {
# Line 2879  for (;;) Line 2949  for (;;)
2949    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
2950    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2951    
2952    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
2953        current_subject < end_subject &&        current_subject < end_subject &&
2954        *current_subject == '\n' &&        *current_subject == CHAR_NL &&
2955        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2956          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
2957           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.231  
changed lines
  Added in v.397

  ViewVC Help
Powered by ViewVC 1.1.5