/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 97 by ph10, Mon Mar 5 12:36:47 2007 UTC revision 435 by ph10, Sat Sep 5 10:20:44 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl-compatible, but Line 45  FSM). This is NOT Perl-compatible, but
45  applications. */  applications. */
46    
47    
48    #ifdef HAVE_CONFIG_H
49    #include "config.h"
50    #endif
51    
52  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
53  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
54  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 56  applications. */ Line 61  applications. */
61  #define SP "                   "  #define SP "                   "
62    
63    
   
64  /*************************************************  /*************************************************
65  *      Code parameters and static tables         *  *      Code parameters and static tables         *
66  *************************************************/  *************************************************/
67    
68  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
70  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
71    never stored, so we push them well clear of the normal opcodes. */
72    
73  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
74  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
75  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
76    #define OP_HSPACE_EXTRA     360
77    #define OP_VSPACE_EXTRA     380
78    
79    
80  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
81  character that is to be tested in some way. This makes it possible to  character that is to be tested in some way. This makes it possible to
82  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
83  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84  small value. */  small value. ***NOTE*** If the start of this table is modified, the two tables
85    that follow must also be modified. */
86    
87  static uschar coptable[] = {  static const uschar coptable[] = {
88    0,                             /* End                                    */    0,                             /* End                                    */
89    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
90    0, 0,                          /* Any, Anybyte                           */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
91    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
92      0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
93      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
94    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
95    1,                             /* Char                                   */    1,                             /* Char                                   */
96    1,                             /* Charnc                                 */    1,                             /* Charnc                                 */
# Line 120  static uschar coptable[] = { Line 130  static uschar coptable[] = {
130    0,                             /* CREF                                   */    0,                             /* CREF                                   */
131    0,                             /* RREF                                   */    0,                             /* RREF                                   */
132    0,                             /* DEF                                    */    0,                             /* DEF                                    */
133    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0,                          /* BRAZERO, BRAMINZERO                    */
134      0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
135      0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
136  };  };
137    
138  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139  and \w */  and \w */
140    
141  static uschar toptable1[] = {  static const uschar toptable1[] = {
142    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
143    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
144    ctype_space, ctype_space,    ctype_space, ctype_space,
145    ctype_word,  ctype_word,    ctype_word,  ctype_word,
146    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
147  };  };
148    
149  static uschar toptable2[] = {  static const uschar toptable2[] = {
150    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
151    ctype_digit, 0,    ctype_digit, 0,
152    ctype_space, 0,    ctype_space, 0,
153    ctype_word,  0,    ctype_word,  0,
154    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
155  };  };
156    
157    
# Line 211  Arguments: Line 223  Arguments:
223    rlevel            function call recursion level    rlevel            function call recursion level
224    recursing         regex recursive call level    recursing         regex recursive call level
225    
226  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
227                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
228                       -1 => failed to match                       -1 => failed to match
229                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
230    
# Line 377  if (*first_op == OP_REVERSE) Line 389  if (*first_op == OP_REVERSE)
389        current_subject - start_subject : max_back;        current_subject - start_subject : max_back;
390      current_subject -= gone_back;      current_subject -= gone_back;
391      }      }
392    
393      /* Save the earliest consulted character */
394    
395      if (current_subject < md->start_used_ptr)
396        md->start_used_ptr = current_subject;
397    
398    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
399    
# Line 442  for (;;) Line 459  for (;;)
459    int i, j;    int i, j;
460    int clen, dlen;    int clen, dlen;
461    unsigned int c, d;    unsigned int c, d;
462      int forced_fail = 0;
463      int reached_end = 0;
464    
465    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
466    new state list. */    new state list. */
# Line 499  for (;;) Line 518  for (;;)
518      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
519      const uschar *code;      const uschar *code;
520      int state_offset = current_state->offset;      int state_offset = current_state->offset;
521      int count, codevalue;      int count, codevalue, rrc;
     int chartype, script;  
522    
523  #ifdef DEBUG  #ifdef DEBUG
524      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 555  for (;;) Line 573  for (;;)
573      permitted.      permitted.
574    
575      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
576      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long. We
577      Unfortunately, we have to take special action to deal with  \P, \p, and      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
578      \X in this case. To keep the other cases fast, convert these ones to new      this case. To keep the other cases fast, convert these ones to new opcodes.
579      opcodes. */      */
580    
581      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
582        {        {
# Line 576  for (;;) Line 594  for (;;)
594            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
595            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
596            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
597              case OP_NOT_HSPACE:
598              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
599              case OP_NOT_VSPACE:
600              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
601            default: break;            default: break;
602            }            }
603          }          }
# Line 609  for (;;) Line 631  for (;;)
631            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
632            }            }
633          }          }
634        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
635          {          {
636          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          reached_end++;    /* Count branches that reach the end */
637            else if (match_count > 0 && ++match_count * 2 >= offsetcount)          if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
638              match_count = 0;            {
639          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
640          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));              else if (match_count > 0 && ++match_count * 2 >= offsetcount)
641          if (offsetcount >= 2)                match_count = 0;
642            {            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
643            offsets[0] = current_subject - start_subject;            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
644            offsets[1] = ptr - start_subject;            if (offsetcount >= 2)
645            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              {
646              offsets[1] - offsets[0], current_subject));              offsets[0] = current_subject - start_subject;
647            }              offsets[1] = ptr - start_subject;
648          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
649            {                offsets[1] - offsets[0], current_subject));
650            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              }
651              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
652              match_count, rlevel*2-2, SP));              {
653            return match_count;              DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
654            }                "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
655                  match_count, rlevel*2-2, SP));
656                return match_count;
657                }
658              }
659          }          }
660        break;        break;
661    
# Line 676  for (;;) Line 702  for (;;)
702        break;        break;
703    
704        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
705          case OP_SKIPZERO:
706          code += 1 + GET(code, 2);
707          while (*code == OP_ALT) code += GET(code, 1);
708          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
709          break;
710    
711          /*-----------------------------------------------------------------*/
712        case OP_CIRC:        case OP_CIRC:
713        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
714            ((ims & PCRE_MULTILINE) != 0 &&            ((ims & PCRE_MULTILINE) != 0 &&
# Line 714  for (;;) Line 747  for (;;)
747    
748        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
749        case OP_ANY:        case OP_ANY:
750        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
751            { ADD_NEW(state_offset + 1, 0); }
752          break;
753    
754          /*-----------------------------------------------------------------*/
755          case OP_ALLANY:
756          if (clen > 0)
757          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
758        break;        break;
759    
# Line 729  for (;;) Line 768  for (;;)
768        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
769          {          {
770          if (clen == 0 ||          if (clen == 0 ||
771              (IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
772                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
773              ))              ))
774            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 766  for (;;) Line 805  for (;;)
805          if (ptr > start_subject)          if (ptr > start_subject)
806            {            {
807            const uschar *temp = ptr - 1;            const uschar *temp = ptr - 1;
808              if (temp < md->start_used_ptr) md->start_used_ptr = temp;
809  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
810            if (utf8) BACKCHAR(temp);            if (utf8) BACKCHAR(temp);
811  #endif  #endif
# Line 774  for (;;) Line 814  for (;;)
814            }            }
815          else left_word = 0;          else left_word = 0;
816    
817          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
818            else right_word = 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
819            else              /* This is a fudge to ensure that if this is the */
820              {               /* last item in the pattern, we don't count it as */
821              reached_end--;  /* reached, thus disabling a partial match. */
822              right_word = 0;
823              }
824    
825          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
826            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 783  for (;;) Line 828  for (;;)
828        break;        break;
829    
830    
 #ifdef SUPPORT_UCP  
   
831        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
832        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
833        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
834        */        */
835    
836    #ifdef SUPPORT_UCP
837        case OP_PROP:        case OP_PROP:
838        case OP_NOTPROP:        case OP_NOTPROP:
839        if (clen > 0)        if (clen > 0)
840          {          {
841          BOOL OK;          BOOL OK;
842          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
843          switch(code[1])          switch(code[1])
844            {            {
845            case PT_ANY:            case PT_ANY:
# Line 803  for (;;) Line 847  for (;;)
847            break;            break;
848    
849            case PT_LAMP:            case PT_LAMP:
850            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
851            break;            break;
852    
853            case PT_GC:            case PT_GC:
854            OK = category == code[2];            OK = _pcre_ucp_gentype[prop->chartype] == code[2];
855            break;            break;
856    
857            case PT_PC:            case PT_PC:
858            OK = chartype == code[2];            OK = prop->chartype == code[2];
859            break;            break;
860    
861            case PT_SC:            case PT_SC:
862            OK = script == code[2];            OK = prop->script == code[2];
863            break;            break;
864    
865            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 835  for (;;) Line 879  for (;;)
879  /* ========================================================================== */  /* ========================================================================== */
880        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
881        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
882        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
883        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
884    
885        case OP_TYPEPLUS:        case OP_TYPEPLUS:
886        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 847  for (;;) Line 891  for (;;)
891          {          {
892          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
893              (c < 256 &&              (c < 256 &&
894                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
895                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
896            {            {
897            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 873  for (;;) Line 914  for (;;)
914          {          {
915          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
916              (c < 256 &&              (c < 256 &&
917                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
918                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
919            {            {
920            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 898  for (;;) Line 936  for (;;)
936          {          {
937          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
938              (c < 256 &&              (c < 256 &&
939                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
940                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
941            {            {
942            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 921  for (;;) Line 956  for (;;)
956          {          {
957          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
958              (c < 256 &&              (c < 256 &&
959                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
960                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
961            {            {
962            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
# Line 945  for (;;) Line 977  for (;;)
977          {          {
978          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
979              (c < 256 &&              (c < 256 &&
980                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
981                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
982            {            {
983            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 970  for (;;) Line 999  for (;;)
999        argument. It keeps the code above fast for the other cases. The argument        argument. It keeps the code above fast for the other cases. The argument
1000        is in the d variable. */        is in the d variable. */
1001    
1002    #ifdef SUPPORT_UCP
1003        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
1004        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1005        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
# Line 978  for (;;) Line 1008  for (;;)
1008        if (clen > 0)        if (clen > 0)
1009          {          {
1010          BOOL OK;          BOOL OK;
1011          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1012          switch(code[2])          switch(code[2])
1013            {            {
1014            case PT_ANY:            case PT_ANY:
# Line 986  for (;;) Line 1016  for (;;)
1016            break;            break;
1017    
1018            case PT_LAMP:            case PT_LAMP:
1019            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1020            break;            break;
1021    
1022            case PT_GC:            case PT_GC:
1023            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1024            break;            break;
1025    
1026            case PT_PC:            case PT_PC:
1027            OK = chartype == code[3];            OK = prop->chartype == code[3];
1028            break;            break;
1029    
1030            case PT_SC:            case PT_SC:
1031            OK = script == code[3];            OK = prop->script == code[3];
1032            break;            break;
1033    
1034            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1027  for (;;) Line 1057  for (;;)
1057        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1058        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1059        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1060        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1061          {          {
1062          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1063          int ncount = 0;          int ncount = 0;
# Line 1041  for (;;) Line 1071  for (;;)
1071            int nd;            int nd;
1072            int ndlen = 1;            int ndlen = 1;
1073            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1074            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1075            ncount++;            ncount++;
1076            nptr += ndlen;            nptr += ndlen;
1077            }            }
# Line 1049  for (;;) Line 1079  for (;;)
1079          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1080          }          }
1081        break;        break;
1082    #endif
1083    
1084        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1085        case OP_ANYNL_EXTRA + OP_TYPEPLUS:        case OP_ANYNL_EXTRA + OP_TYPEPLUS:
# Line 1061  for (;;) Line 1092  for (;;)
1092          int ncount = 0;          int ncount = 0;
1093          switch (c)          switch (c)
1094            {            {
1095              case 0x000b:
1096              case 0x000c:
1097              case 0x0085:
1098              case 0x2028:
1099              case 0x2029:
1100              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1101              goto ANYNL01;
1102    
1103            case 0x000d:            case 0x000d:
1104            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1105            /* Fall through */            /* Fall through */
1106    
1107              ANYNL01:
1108              case 0x000a:
1109              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1110                {
1111                active_count--;           /* Remove non-match possibility */
1112                next_active_state--;
1113                }
1114              count++;
1115              ADD_NEW_DATA(-state_offset, count, ncount);
1116              break;
1117    
1118              default:
1119              break;
1120              }
1121            }
1122          break;
1123    
1124          /*-----------------------------------------------------------------*/
1125          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1126          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1127          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1128          count = current_state->count;  /* Already matched */
1129          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1130          if (clen > 0)
1131            {
1132            BOOL OK;
1133            switch (c)
1134              {
1135            case 0x000a:            case 0x000a:
1136            case 0x000b:            case 0x000b:
1137            case 0x000c:            case 0x000c:
1138              case 0x000d:
1139            case 0x0085:            case 0x0085:
1140            case 0x2028:            case 0x2028:
1141            case 0x2029:            case 0x2029:
1142            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            OK = TRUE;
1143              break;
1144    
1145              default:
1146              OK = FALSE;
1147              break;
1148              }
1149    
1150            if (OK == (d == OP_VSPACE))
1151              {
1152              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1153              {              {
1154              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1155              next_active_state--;              next_active_state--;
1156              }              }
1157            count++;            count++;
1158            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, 0);
1159              }
1160            }
1161          break;
1162    
1163          /*-----------------------------------------------------------------*/
1164          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1165          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1166          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1167          count = current_state->count;  /* Already matched */
1168          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1169          if (clen > 0)
1170            {
1171            BOOL OK;
1172            switch (c)
1173              {
1174              case 0x09:      /* HT */
1175              case 0x20:      /* SPACE */
1176              case 0xa0:      /* NBSP */
1177              case 0x1680:    /* OGHAM SPACE MARK */
1178              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1179              case 0x2000:    /* EN QUAD */
1180              case 0x2001:    /* EM QUAD */
1181              case 0x2002:    /* EN SPACE */
1182              case 0x2003:    /* EM SPACE */
1183              case 0x2004:    /* THREE-PER-EM SPACE */
1184              case 0x2005:    /* FOUR-PER-EM SPACE */
1185              case 0x2006:    /* SIX-PER-EM SPACE */
1186              case 0x2007:    /* FIGURE SPACE */
1187              case 0x2008:    /* PUNCTUATION SPACE */
1188              case 0x2009:    /* THIN SPACE */
1189              case 0x200A:    /* HAIR SPACE */
1190              case 0x202f:    /* NARROW NO-BREAK SPACE */
1191              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1192              case 0x3000:    /* IDEOGRAPHIC SPACE */
1193              OK = TRUE;
1194            break;            break;
1195    
1196            default:            default:
1197              OK = FALSE;
1198            break;            break;
1199            }            }
1200    
1201            if (OK == (d == OP_HSPACE))
1202              {
1203              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1204                {
1205                active_count--;           /* Remove non-match possibility */
1206                next_active_state--;
1207                }
1208              count++;
1209              ADD_NEW_DATA(-state_offset, count, 0);
1210              }
1211          }          }
1212        break;        break;
1213    
1214        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1215    #ifdef SUPPORT_UCP
1216        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
1217        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1218        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
# Line 1102  for (;;) Line 1230  for (;;)
1230        if (clen > 0)        if (clen > 0)
1231          {          {
1232          BOOL OK;          BOOL OK;
1233          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1234          switch(code[2])          switch(code[2])
1235            {            {
1236            case PT_ANY:            case PT_ANY:
# Line 1110  for (;;) Line 1238  for (;;)
1238            break;            break;
1239    
1240            case PT_LAMP:            case PT_LAMP:
1241            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1242            break;            break;
1243    
1244            case PT_GC:            case PT_GC:
1245            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1246            break;            break;
1247    
1248            case PT_PC:            case PT_PC:
1249            OK = chartype == code[3];            OK = prop->chartype == code[3];
1250            break;            break;
1251    
1252            case PT_SC:            case PT_SC:
1253            OK = script == code[3];            OK = prop->script == code[3];
1254            break;            break;
1255    
1256            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1160  for (;;) Line 1288  for (;;)
1288        QS2:        QS2:
1289    
1290        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1291        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1292          {          {
1293          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1294          int ncount = 0;          int ncount = 0;
# Line 1175  for (;;) Line 1303  for (;;)
1303            int nd;            int nd;
1304            int ndlen = 1;            int ndlen = 1;
1305            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1306            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1307            ncount++;            ncount++;
1308            nptr += ndlen;            nptr += ndlen;
1309            }            }
1310          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1311          }          }
1312        break;        break;
1313    #endif
1314    
1315        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1316        case OP_ANYNL_EXTRA + OP_TYPEQUERY:        case OP_ANYNL_EXTRA + OP_TYPEQUERY:
# Line 1202  for (;;) Line 1331  for (;;)
1331          int ncount = 0;          int ncount = 0;
1332          switch (c)          switch (c)
1333            {            {
1334              case 0x000b:
1335              case 0x000c:
1336              case 0x0085:
1337              case 0x2028:
1338              case 0x2029:
1339              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1340              goto ANYNL02;
1341    
1342            case 0x000d:            case 0x000d:
1343            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1344            /* Fall through */            /* Fall through */
1345    
1346              ANYNL02:
1347              case 0x000a:
1348              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1349                  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1350                {
1351                active_count--;           /* Remove non-match possibility */
1352                next_active_state--;
1353                }
1354              ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1355              break;
1356    
1357              default:
1358              break;
1359              }
1360            }
1361          break;
1362    
1363          /*-----------------------------------------------------------------*/
1364          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1365          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1366          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1367          count = 2;
1368          goto QS4;
1369    
1370          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1371          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1372          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1373          count = 0;
1374    
1375          QS4:
1376          ADD_ACTIVE(state_offset + 2, 0);
1377          if (clen > 0)
1378            {
1379            BOOL OK;
1380            switch (c)
1381              {
1382            case 0x000a:            case 0x000a:
1383            case 0x000b:            case 0x000b:
1384            case 0x000c:            case 0x000c:
1385              case 0x000d:
1386            case 0x0085:            case 0x0085:
1387            case 0x2028:            case 0x2028:
1388            case 0x2029:            case 0x2029:
1389            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            OK = TRUE;
1390                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)            break;
1391    
1392              default:
1393              OK = FALSE;
1394              break;
1395              }
1396            if (OK == (d == OP_VSPACE))
1397              {
1398              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1399                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1400              {              {
1401              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1402              next_active_state--;              next_active_state--;
1403              }              }
1404            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, 0);
1405              }
1406            }
1407          break;
1408    
1409          /*-----------------------------------------------------------------*/
1410          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1411          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1412          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1413          count = 2;
1414          goto QS5;
1415    
1416          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1417          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1418          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1419          count = 0;
1420    
1421          QS5:
1422          ADD_ACTIVE(state_offset + 2, 0);
1423          if (clen > 0)
1424            {
1425            BOOL OK;
1426            switch (c)
1427              {
1428              case 0x09:      /* HT */
1429              case 0x20:      /* SPACE */
1430              case 0xa0:      /* NBSP */
1431              case 0x1680:    /* OGHAM SPACE MARK */
1432              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1433              case 0x2000:    /* EN QUAD */
1434              case 0x2001:    /* EM QUAD */
1435              case 0x2002:    /* EN SPACE */
1436              case 0x2003:    /* EM SPACE */
1437              case 0x2004:    /* THREE-PER-EM SPACE */
1438              case 0x2005:    /* FOUR-PER-EM SPACE */
1439              case 0x2006:    /* SIX-PER-EM SPACE */
1440              case 0x2007:    /* FIGURE SPACE */
1441              case 0x2008:    /* PUNCTUATION SPACE */
1442              case 0x2009:    /* THIN SPACE */
1443              case 0x200A:    /* HAIR SPACE */
1444              case 0x202f:    /* NARROW NO-BREAK SPACE */
1445              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1446              case 0x3000:    /* IDEOGRAPHIC SPACE */
1447              OK = TRUE;
1448            break;            break;
1449    
1450            default:            default:
1451              OK = FALSE;
1452            break;            break;
1453            }            }
1454    
1455            if (OK == (d == OP_HSPACE))
1456              {
1457              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1458                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1459                {
1460                active_count--;           /* Remove non-match possibility */
1461                next_active_state--;
1462                }
1463              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1464              }
1465          }          }
1466        break;        break;
1467    
1468        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1469    #ifdef SUPPORT_UCP
1470        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_PROP_EXTRA + OP_TYPEEXACT:
1471        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_PROP_EXTRA + OP_TYPEUPTO:
1472        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
# Line 1236  for (;;) Line 1477  for (;;)
1477        if (clen > 0)        if (clen > 0)
1478          {          {
1479          BOOL OK;          BOOL OK;
1480          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1481          switch(code[4])          switch(code[4])
1482            {            {
1483            case PT_ANY:            case PT_ANY:
# Line 1244  for (;;) Line 1485  for (;;)
1485            break;            break;
1486    
1487            case PT_LAMP:            case PT_LAMP:
1488            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1489            break;            break;
1490    
1491            case PT_GC:            case PT_GC:
1492            OK = category == code[5];            OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1493            break;            break;
1494    
1495            case PT_PC:            case PT_PC:
1496            OK = chartype == code[5];            OK = prop->chartype == code[5];
1497            break;            break;
1498    
1499            case PT_SC:            case PT_SC:
1500            OK = script == code[5];            OK = prop->script == code[5];
1501            break;            break;
1502    
1503            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1289  for (;;) Line 1530  for (;;)
1530        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1531          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1532        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1533        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1534          {          {
1535          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1536          int ncount = 0;          int ncount = 0;
# Line 1303  for (;;) Line 1544  for (;;)
1544            int nd;            int nd;
1545            int ndlen = 1;            int ndlen = 1;
1546            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1547            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1548            ncount++;            ncount++;
1549            nptr += ndlen;            nptr += ndlen;
1550            }            }
# Line 1313  for (;;) Line 1554  for (;;)
1554            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1555          }          }
1556        break;        break;
1557    #endif
1558    
1559        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1560        case OP_ANYNL_EXTRA + OP_TYPEEXACT:        case OP_ANYNL_EXTRA + OP_TYPEEXACT:
# Line 1327  for (;;) Line 1569  for (;;)
1569          int ncount = 0;          int ncount = 0;
1570          switch (c)          switch (c)
1571            {            {
1572              case 0x000b:
1573              case 0x000c:
1574              case 0x0085:
1575              case 0x2028:
1576              case 0x2029:
1577              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1578              goto ANYNL03;
1579    
1580            case 0x000d:            case 0x000d:
1581            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1582            /* Fall through */            /* Fall through */
1583    
1584              ANYNL03:
1585              case 0x000a:
1586              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1587                {
1588                active_count--;           /* Remove non-match possibility */
1589                next_active_state--;
1590                }
1591              if (++count >= GET2(code, 1))
1592                { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1593              else
1594                { ADD_NEW_DATA(-state_offset, count, ncount); }
1595              break;
1596    
1597              default:
1598              break;
1599              }
1600            }
1601          break;
1602    
1603          /*-----------------------------------------------------------------*/
1604          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1605          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1606          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1607          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1608          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1609            { ADD_ACTIVE(state_offset + 4, 0); }
1610          count = current_state->count;  /* Number already matched */
1611          if (clen > 0)
1612            {
1613            BOOL OK;
1614            switch (c)
1615              {
1616            case 0x000a:            case 0x000a:
1617            case 0x000b:            case 0x000b:
1618            case 0x000c:            case 0x000c:
1619              case 0x000d:
1620            case 0x0085:            case 0x0085:
1621            case 0x2028:            case 0x2028:
1622            case 0x2029:            case 0x2029:
1623            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            OK = TRUE;
1624              break;
1625    
1626              default:
1627              OK = FALSE;
1628              }
1629    
1630            if (OK == (d == OP_VSPACE))
1631              {
1632              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1633              {              {
1634              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1635              next_active_state--;              next_active_state--;
1636              }              }
1637            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1638              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1639            else            else
1640              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1641              }
1642            }
1643          break;
1644    
1645          /*-----------------------------------------------------------------*/
1646          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1647          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1648          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1649          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1650          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1651            { ADD_ACTIVE(state_offset + 4, 0); }
1652          count = current_state->count;  /* Number already matched */
1653          if (clen > 0)
1654            {
1655            BOOL OK;
1656            switch (c)
1657              {
1658              case 0x09:      /* HT */
1659              case 0x20:      /* SPACE */
1660              case 0xa0:      /* NBSP */
1661              case 0x1680:    /* OGHAM SPACE MARK */
1662              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1663              case 0x2000:    /* EN QUAD */
1664              case 0x2001:    /* EM QUAD */
1665              case 0x2002:    /* EN SPACE */
1666              case 0x2003:    /* EM SPACE */
1667              case 0x2004:    /* THREE-PER-EM SPACE */
1668              case 0x2005:    /* FOUR-PER-EM SPACE */
1669              case 0x2006:    /* SIX-PER-EM SPACE */
1670              case 0x2007:    /* FIGURE SPACE */
1671              case 0x2008:    /* PUNCTUATION SPACE */
1672              case 0x2009:    /* THIN SPACE */
1673              case 0x200A:    /* HAIR SPACE */
1674              case 0x202f:    /* NARROW NO-BREAK SPACE */
1675              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1676              case 0x3000:    /* IDEOGRAPHIC SPACE */
1677              OK = TRUE;
1678            break;            break;
1679    
1680            default:            default:
1681              OK = FALSE;
1682            break;            break;
1683            }            }
1684    
1685            if (OK == (d == OP_HSPACE))
1686              {
1687              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1688                {
1689                active_count--;           /* Remove non-match possibility */
1690                next_active_state--;
1691                }
1692              if (++count >= GET2(code, 1))
1693                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1694              else
1695                { ADD_NEW_DATA(-state_offset, count, 0); }
1696              }
1697          }          }
1698        break;        break;
1699    
# Line 1379  for (;;) Line 1724  for (;;)
1724            other case of the character. */            other case of the character. */
1725    
1726  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1727            othercase = _pcre_ucp_othercase(c);            othercase = UCD_OTHERCASE(c);
1728  #else  #else
1729            othercase = NOTACHAR;            othercase = NOTACHAR;
1730  #endif  #endif
# Line 1404  for (;;) Line 1749  for (;;)
1749        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
1750    
1751        case OP_EXTUNI:        case OP_EXTUNI:
1752        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1753          {          {
1754          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1755          int ncount = 0;          int ncount = 0;
# Line 1412  for (;;) Line 1757  for (;;)
1757            {            {
1758            int nclen = 1;            int nclen = 1;
1759            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
1760            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
1761            ncount++;            ncount++;
1762            nptr += nclen;            nptr += nclen;
1763            }            }
# Line 1429  for (;;) Line 1774  for (;;)
1774        case OP_ANYNL:        case OP_ANYNL:
1775        if (clen > 0) switch(c)        if (clen > 0) switch(c)
1776          {          {
         case 0x000a:  
1777          case 0x000b:          case 0x000b:
1778          case 0x000c:          case 0x000c:
1779          case 0x0085:          case 0x0085:
1780          case 0x2028:          case 0x2028:
1781          case 0x2029:          case 0x2029:
1782            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1783    
1784            case 0x000a:
1785          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
1786          break;          break;
1787    
1788          case 0x000d:          case 0x000d:
1789          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1790            {            {
# Line 1451  for (;;) Line 1799  for (;;)
1799        break;        break;
1800    
1801        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1802          case OP_NOT_VSPACE:
1803          if (clen > 0) switch(c)
1804            {
1805            case 0x000a:
1806            case 0x000b:
1807            case 0x000c:
1808            case 0x000d:
1809            case 0x0085:
1810            case 0x2028:
1811            case 0x2029:
1812            break;
1813    
1814            default:
1815            ADD_NEW(state_offset + 1, 0);
1816            break;
1817            }
1818          break;
1819    
1820          /*-----------------------------------------------------------------*/
1821          case OP_VSPACE:
1822          if (clen > 0) switch(c)
1823            {
1824            case 0x000a:
1825            case 0x000b:
1826            case 0x000c:
1827            case 0x000d:
1828            case 0x0085:
1829            case 0x2028:
1830            case 0x2029:
1831            ADD_NEW(state_offset + 1, 0);
1832            break;
1833    
1834            default: break;
1835            }
1836          break;
1837    
1838          /*-----------------------------------------------------------------*/
1839          case OP_NOT_HSPACE:
1840          if (clen > 0) switch(c)
1841            {
1842            case 0x09:      /* HT */
1843            case 0x20:      /* SPACE */
1844            case 0xa0:      /* NBSP */
1845            case 0x1680:    /* OGHAM SPACE MARK */
1846            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1847            case 0x2000:    /* EN QUAD */
1848            case 0x2001:    /* EM QUAD */
1849            case 0x2002:    /* EN SPACE */
1850            case 0x2003:    /* EM SPACE */
1851            case 0x2004:    /* THREE-PER-EM SPACE */
1852            case 0x2005:    /* FOUR-PER-EM SPACE */
1853            case 0x2006:    /* SIX-PER-EM SPACE */
1854            case 0x2007:    /* FIGURE SPACE */
1855            case 0x2008:    /* PUNCTUATION SPACE */
1856            case 0x2009:    /* THIN SPACE */
1857            case 0x200A:    /* HAIR SPACE */
1858            case 0x202f:    /* NARROW NO-BREAK SPACE */
1859            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1860            case 0x3000:    /* IDEOGRAPHIC SPACE */
1861            break;
1862    
1863            default:
1864            ADD_NEW(state_offset + 1, 0);
1865            break;
1866            }
1867          break;
1868    
1869          /*-----------------------------------------------------------------*/
1870          case OP_HSPACE:
1871          if (clen > 0) switch(c)
1872            {
1873            case 0x09:      /* HT */
1874            case 0x20:      /* SPACE */
1875            case 0xa0:      /* NBSP */
1876            case 0x1680:    /* OGHAM SPACE MARK */
1877            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1878            case 0x2000:    /* EN QUAD */
1879            case 0x2001:    /* EM QUAD */
1880            case 0x2002:    /* EN SPACE */
1881            case 0x2003:    /* EM SPACE */
1882            case 0x2004:    /* THREE-PER-EM SPACE */
1883            case 0x2005:    /* FOUR-PER-EM SPACE */
1884            case 0x2006:    /* SIX-PER-EM SPACE */
1885            case 0x2007:    /* FIGURE SPACE */
1886            case 0x2008:    /* PUNCTUATION SPACE */
1887            case 0x2009:    /* THIN SPACE */
1888            case 0x200A:    /* HAIR SPACE */
1889            case 0x202f:    /* NARROW NO-BREAK SPACE */
1890            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1891            case 0x3000:    /* IDEOGRAPHIC SPACE */
1892            ADD_NEW(state_offset + 1, 0);
1893            break;
1894            }
1895          break;
1896    
1897          /*-----------------------------------------------------------------*/
1898        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character. This is only used for one-byte
1899        characters, that is, we know that d < 256. The character we are        characters, that is, we know that d < 256. The character we are
1900        checking (c) can be multibyte. */        checking (c) can be multibyte. */
# Line 1481  for (;;) Line 1925  for (;;)
1925            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1926              {              {
1927  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1928              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1929  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1930              }              }
1931            else            else
# Line 1519  for (;;) Line 1963  for (;;)
1963            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1964              {              {
1965  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1966              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1967  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1968              }              }
1969            else            else
# Line 1555  for (;;) Line 1999  for (;;)
1999            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2000              {              {
2001  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2002              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2003  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2004              }              }
2005            else            else
# Line 1587  for (;;) Line 2031  for (;;)
2031            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2032              {              {
2033  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2034              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2035  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2036              }              }
2037            else            else
# Line 1622  for (;;) Line 2066  for (;;)
2066            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2067              {              {
2068  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2069              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2070  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2071              }              }
2072            else            else
# Line 1730  for (;;) Line 2174  for (;;)
2174    
2175  /* ========================================================================== */  /* ========================================================================== */
2176        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2177        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2178          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2179          though the other "backtracking verbs" are not supported. */
2180    
2181          case OP_FAIL:
2182          forced_fail++;    /* Count FAILs for multiple states */
2183          break;
2184    
2185        case OP_ASSERT:        case OP_ASSERT:
2186        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1768  for (;;) Line 2218  for (;;)
2218          {          {
2219          int local_offsets[1000];          int local_offsets[1000];
2220          int local_workspace[1000];          int local_workspace[1000];
2221          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2222            int condcode;
2223    
2224            /* Because of the way auto-callout works during compile, a callout item
2225            is inserted between OP_COND and an assertion condition. This does not
2226            happen for the other conditions. */
2227    
2228            if (code[LINK_SIZE+1] == OP_CALLOUT)
2229              {
2230              rrc = 0;
2231              if (pcre_callout != NULL)
2232                {
2233                pcre_callout_block cb;
2234                cb.version          = 1;   /* Version 1 of the callout block */
2235                cb.callout_number   = code[LINK_SIZE+2];
2236                cb.offset_vector    = offsets;
2237                cb.subject          = (PCRE_SPTR)start_subject;
2238                cb.subject_length   = end_subject - start_subject;
2239                cb.start_match      = current_subject - start_subject;
2240                cb.current_position = ptr - start_subject;
2241                cb.pattern_position = GET(code, LINK_SIZE + 3);
2242                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2243                cb.capture_top      = 1;
2244                cb.capture_last     = -1;
2245                cb.callout_data     = md->callout_data;
2246                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2247                }
2248              if (rrc > 0) break;                      /* Fail this thread */
2249              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2250              }
2251    
2252            condcode = code[LINK_SIZE+1];
2253    
2254          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2255    
# Line 1777  for (;;) Line 2258  for (;;)
2258          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2259    
2260          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2261            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2262    
2263          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2264          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
# Line 1789  for (;;) Line 2268  for (;;)
2268            {            {
2269            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2270            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2271            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0)
2272              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2273              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2274            }            }
2275    
2276          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1820  for (;;) Line 2300  for (;;)
2300                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2301              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2302            else            else
2303              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2304            }            }
2305          }          }
2306        break;        break;
# Line 1972  for (;;) Line 2452  for (;;)
2452        /* Handle callouts */        /* Handle callouts */
2453    
2454        case OP_CALLOUT:        case OP_CALLOUT:
2455          rrc = 0;
2456        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2457          {          {
         int rrc;  
2458          pcre_callout_block cb;          pcre_callout_block cb;
2459          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2460          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 1989  for (;;) Line 2469  for (;;)
2469          cb.capture_last     = -1;          cb.capture_last     = -1;
2470          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2471          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2472          }          }
2473          if (rrc == 0)
2474            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2475        break;        break;
2476    
2477    
# Line 2006  for (;;) Line 2487  for (;;)
2487    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2488    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2489    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2490    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions. The
2491      "forced_fail" variable counts the number of (*F) encountered for the character. If it
2492      is equal to the original active_count (saved in workspace[1]) it means that
2493      (*F) was found on every active state. In this case we don't want to give a
2494      partial match. */
2495    
2496    if (new_count <= 0)    if (new_count <= 0)
2497      {      {
2498      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2499          rlevel == 1 &&                         /* Top level match function */          reached_end != workspace[1] &&               /* Not all reached end */
2500          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2501          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
2502          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2503            ||                                           /* or... */
2504            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2505             match_count < 0)                            /* no matches */
2506            ) &&                                         /* And... */
2507            ptr >= end_subject &&                     /* Reached end of subject */
2508            ptr > current_subject)                    /* Matched non-empty string */
2509        {        {
2510        if (offsetcount >= 2)        if (offsetcount >= 2)
2511          {          {
2512          offsets[0] = current_subject - start_subject;          offsets[0] = md->start_used_ptr - start_subject;
2513          offsets[1] = end_subject - start_subject;          offsets[1] = end_subject - start_subject;
2514          }          }
2515        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
# Line 2073  Returns:          > 0 => number of match Line 2564  Returns:          > 0 => number of match
2564                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2565  */  */
2566    
2567  PCRE_DATA_SCOPE int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2568  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2569    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
2570    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 2163  md->end_subject = end_subject; Line 2654  md->end_subject = end_subject;
2654  md->moptions = options;  md->moptions = options;
2655  md->poptions = re->options;  md->poptions = re->options;
2656    
2657    /* If the BSR option is not set at match time, copy what was set
2658    at compile time. */
2659    
2660    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2661      {
2662      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2663        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2664    #ifdef BSR_ANYCRLF
2665      else md->moptions |= PCRE_BSR_ANYCRLF;
2666    #endif
2667      }
2668    
2669  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
2670  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
2671    
2672  switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &  switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2673           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2674    {    {
2675    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2676    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2677    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2678    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2679         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2680    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2681      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2682    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
2683    }    }
2684    
2685  if (newline < 0)  if (newline == -2)
2686      {
2687      md->nltype = NLTYPE_ANYCRLF;
2688      }
2689    else if (newline < 0)
2690    {    {
2691    md->nltype = NLTYPE_ANY;    md->nltype = NLTYPE_ANY;
2692    }    }
# Line 2228  if (md->tables == NULL) md->tables = _pc Line 2736  if (md->tables == NULL) md->tables = _pc
2736  used in a loop when finding where to start. */  used in a loop when finding where to start. */
2737    
2738  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
2739  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
2740  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
2741    
2742  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2239  studied, there may be a bitmap of possib Line 2747  studied, there may be a bitmap of possib
2747    
2748  if (!anchored)  if (!anchored)
2749    {    {
2750    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
2751      {      {
2752      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
2753      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 2256  if (!anchored) Line 2764  if (!anchored)
2764  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
2765  character" set. */  character" set. */
2766    
2767  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
2768    {    {
2769    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
2770    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 2264  if ((re->options & PCRE_REQCHSET) != 0) Line 2772  if ((re->options & PCRE_REQCHSET) != 0)
2772    }    }
2773    
2774  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2775  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2776  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2777    
2778  for (;;)  for (;;)
2779    {    {
# Line 2276  for (;;) Line 2783  for (;;)
2783      {      {
2784      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2785    
2786      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2787      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2788      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2789      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2790    
2791      if (firstline)      if (firstline)
2792        {        {
2793        const uschar *t = current_subject;        USPTR t = current_subject;
2794    #ifdef SUPPORT_UTF8
2795          if (utf8)
2796            {
2797            while (t < md->end_subject && !IS_NEWLINE(t))
2798              {
2799              t++;
2800              while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2801              }
2802            }
2803          else
2804    #endif
2805        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2806        end_subject = t;        end_subject = t;
2807        }        }
2808    
2809      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2810        starting point is not found, or if a known later character is not present.
2811        However, there is an option that disables these, for testing and for
2812        ensuring that all callouts do actually occur. */
2813    
2814        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2815        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2816    
2817      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2818    
2819      else if (startline)        if (first_byte >= 0)
       {  
       if (current_subject > md->start_subject + start_offset)  
2820          {          {
2821          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (first_byte_caseless)
2822            current_subject++;            while (current_subject < end_subject &&
2823                     lcc[*current_subject] != first_byte)
2824                current_subject++;
2825            else
2826              while (current_subject < end_subject &&
2827                     *current_subject != first_byte)
2828                current_subject++;
2829          }          }
       }  
2830    
2831      /* Or to a non-unique first char after study */        /* Or to just after a linebreak for a multiline match if possible */
2832    
2833      else if (start_bits != NULL)        else if (startline)
2834        {          {
2835        while (current_subject < end_subject)          if (current_subject > md->start_subject + start_offset)
2836              {
2837    #ifdef SUPPORT_UTF8
2838              if (utf8)
2839                {
2840                while (current_subject < end_subject &&
2841                       !WAS_NEWLINE(current_subject))
2842                  {
2843                  current_subject++;
2844                  while(current_subject < end_subject &&
2845                        (*current_subject & 0xc0) == 0x80)
2846                    current_subject++;
2847                  }
2848                }
2849              else
2850    #endif
2851              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2852                current_subject++;
2853    
2854              /* If we have just passed a CR and the newline option is ANY or
2855              ANYCRLF, and we are now at a LF, advance the match position by one
2856              more character. */
2857    
2858              if (current_subject[-1] == CHAR_CR &&
2859                   (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2860                   current_subject < end_subject &&
2861                   *current_subject == CHAR_NL)
2862                current_subject++;
2863              }
2864            }
2865    
2866          /* Or to a non-unique first char after study */
2867    
2868          else if (start_bits != NULL)
2869          {          {
2870          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2871          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2872            else break;            register unsigned int c = *current_subject;
2873              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2874                else break;
2875              }
2876          }          }
2877        }        }
2878    
# Line 2341  for (;;) Line 2894  for (;;)
2894    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2895    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2896    
2897    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2898    */    also be explicitly deactivated. Furthermore, we have to disable it when
2899      restarting after a partial match, because the required character may have
2900      already been matched. */
2901    
2902    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2903          req_byte >= 0 &&
2904        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2905        (options & PCRE_PARTIAL) == 0)        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2906      {      {
2907      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2908    
# Line 2386  for (;;) Line 2942  for (;;)
2942    
2943    /* OK, now we can do the business */    /* OK, now we can do the business */
2944    
2945      md->start_used_ptr = current_subject;
2946    
2947    rc = internal_dfa_exec(    rc = internal_dfa_exec(
2948      md,                                /* fixed match data */      md,                                /* fixed match data */
2949      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2416  for (;;) Line 2974  for (;;)
2974      }      }
2975    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
2976    
2977    /* If we have just passed a CR and the newline option is CRLF or ANY, and we    /* If we have just passed a CR and we are now at a LF, and the pattern does
2978    are now at a LF, advance the match position by one more character. */    not contain any explicit matches for \r or \n, and the newline option is CRLF
2979      or ANY or ANYCRLF, advance the match position by one more character. */
2980    if (current_subject[-1] == '\r' &&  
2981         (md->nltype == NLTYPE_ANY || md->nllen == 2) &&    if (current_subject[-1] == CHAR_CR &&
2982         current_subject < end_subject &&        current_subject < end_subject &&
2983         *current_subject == '\n')        *current_subject == CHAR_NL &&
2984          (re->flags & PCRE_HASCRORLF) == 0 &&
2985            (md->nltype == NLTYPE_ANY ||
2986             md->nltype == NLTYPE_ANYCRLF ||
2987             md->nllen == 2))
2988      current_subject++;      current_subject++;
2989    
2990    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.97  
changed lines
  Added in v.435

  ViewVC Help
Powered by ViewVC 1.1.5