/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 145 by ph10, Wed Apr 4 14:06:52 2007 UTC | revision 510 by ph10, Sat Mar 27 17:45:29 2010 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl- compatible, but Line 45  FSM). This is NOT Perl- compatible, but
45  applications. */  applications. */
46    
47    
48    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49    the performance of his patterns greatly. I could not use it as it stood, as it
50    was not thread safe, and made assumptions about pattern sizes. Also, it caused
51    test 7 to loop, and test 9 to crash with a segfault.
52    
53    The issue is the check for duplicate states, which is done by a simple linear
54    search up the state list. (Grep for "duplicate" below to find the code.) For
55    many patterns, there will never be many states active at one time, so a simple
56    linear search is fine. In patterns that have many active states, it might be a
57    bottleneck. The suggested code used an indexing scheme to remember which states
58    had previously been used for each character, and avoided the linear search when
59    it knew there was no chance of a duplicate. This was implemented when adding
60    states to the state lists.
61    
62    I wrote some thread-safe, not-limited code to try something similar at the time
63    of checking for duplicates (instead of when adding states), using index vectors
64    on the stack. It did give a 13% improvement with one specially constructed
65    pattern for certain subject strings, but on other strings and on many of the
66    simpler patterns in the test suite it did worse. The major problem, I think,
67    was the extra time to initialize the index. This had to be done for each call
68    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69    only once - I suspect this was the cause of the problems with the tests.)
70    
71    Overall, I concluded that the gains in some cases did not outweigh the losses
72    in others, so I abandoned this code. */
73    
74    
75    
76    #ifdef HAVE_CONFIG_H
77    #include "config.h"
78    #endif
79    
80  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
81  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
82  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 56  applications. */ Line 89  applications. */
89  #define SP "                   "  #define SP "                   "
90    
91    
   
92  /*************************************************  /*************************************************
93  *      Code parameters and static tables         *  *      Code parameters and static tables         *
94  *************************************************/  *************************************************/
95    
96  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
98  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
99    never stored, so we push them well clear of the normal opcodes. */
100    
101  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
102  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
103  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
104    #define OP_HSPACE_EXTRA     360
105    #define OP_VSPACE_EXTRA     380
106    
107    
108  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
109  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
110  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
111  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112  small value. */  small value. Non-zero values in the table are the offsets from the opcode where
113    the character is to be found. ***NOTE*** If the start of this table is
114    modified, the three tables that follow must also be modified. */
115    
116  static uschar coptable[] = {  static const uschar coptable[] = {
117    0,                             /* End                                    */    0,                             /* End                                    */
118    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119    0, 0,                          /* Any, Anybyte                           */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121      0, 0,                          /* \P, \p                                 */
122      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123      0,                             /* \X                                     */
124    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
125    1,                             /* Char                                   */    1,                             /* Char                                   */
126    1,                             /* Charnc                                 */    1,                             /* Charnc                                 */
# Line 117  static uschar coptable[] = { Line 157  static uschar coptable[] = {
157    0,                             /* Reverse                                */    0,                             /* Reverse                                */
158    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
159    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
160    0,                             /* CREF                                   */    0, 0,                          /* CREF, NCREF                            */
161    0,                             /* RREF                                   */    0, 0,                          /* RREF, NRREF                            */
162      0,                             /* DEF                                    */
163      0, 0,                          /* BRAZERO, BRAMINZERO                    */
164      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG,                */
165      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG,        */
166      0, 0, 0, 0, 0                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */
167    };
168    
169    /* This table identifies those opcodes that inspect a character. It is used to
170    remember the fact that a character could have been inspected when the end of
171    the subject is reached. ***NOTE*** If the start of this table is modified, the
172    two tables that follow must also be modified. */
173    
174    static const uschar poptable[] = {
175      0,                             /* End                                    */
176      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
177      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
178      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
179      1, 1,                          /* \P, \p                                 */
180      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
181      1,                             /* \X                                     */
182      0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
183      1,                             /* Char                                   */
184      1,                             /* Charnc                                 */
185      1,                             /* not                                    */
186      /* Positive single-char repeats                                          */
187      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
188      1, 1, 1,                       /* upto, minupto, exact                   */
189      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
190      /* Negative single-char repeats - only for chars < 256                   */
191      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
192      1, 1, 1,                       /* NOT upto, minupto, exact               */
193      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
194      /* Positive type repeats                                                 */
195      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
196      1, 1, 1,                       /* Type upto, minupto, exact              */
197      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
198      /* Character class & ref repeats                                         */
199      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
200      1, 1,                          /* CRRANGE, CRMINRANGE                    */
201      1,                             /* CLASS                                  */
202      1,                             /* NCLASS                                 */
203      1,                             /* XCLASS - variable length               */
204      0,                             /* REF                                    */
205      0,                             /* RECURSE                                */
206      0,                             /* CALLOUT                                */
207      0,                             /* Alt                                    */
208      0,                             /* Ket                                    */
209      0,                             /* KetRmax                                */
210      0,                             /* KetRmin                                */
211      0,                             /* Assert                                 */
212      0,                             /* Assert not                             */
213      0,                             /* Assert behind                          */
214      0,                             /* Assert behind not                      */
215      0,                             /* Reverse                                */
216      0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
217      0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
218      0, 0,                          /* CREF, NCREF                            */
219      0, 0,                          /* RREF, NRREF                            */
220    0,                             /* DEF                                    */    0,                             /* DEF                                    */
221    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0,                          /* BRAZERO, BRAMINZERO                    */
222      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG,                */
223      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG,        */
224      0, 0, 0, 0, 0                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */
225  };  };
226    
227  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
228  and \w */  and \w */
229    
230  static uschar toptable1[] = {  static const uschar toptable1[] = {
231    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
232    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
233    ctype_space, ctype_space,    ctype_space, ctype_space,
234    ctype_word,  ctype_word,    ctype_word,  ctype_word,
235    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
236  };  };
237    
238  static uschar toptable2[] = {  static const uschar toptable2[] = {
239    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
240    ctype_digit, 0,    ctype_digit, 0,
241    ctype_space, 0,    ctype_space, 0,
242    ctype_word,  0,    ctype_word,  0,
243    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
244  };  };
245    
246    
# Line 158  typedef struct stateblock { Line 259  typedef struct stateblock {
259  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
260    
261    
262  #ifdef DEBUG  #ifdef PCRE_DEBUG
263  /*************************************************  /*************************************************
264  *             Print character string             *  *             Print character string             *
265  *************************************************/  *************************************************/
# Line 211  Arguments: Line 312  Arguments:
312    rlevel            function call recursion level    rlevel            function call recursion level
313    recursing         regex recursive call level    recursing         regex recursive call level
314    
315  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
316                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
317                       -1 => failed to match                       -1 => failed to match
318                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
319    
# Line 378  if (*first_op == OP_REVERSE) Line 479  if (*first_op == OP_REVERSE)
479      current_subject -= gone_back;      current_subject -= gone_back;
480      }      }
481    
482      /* Save the earliest consulted character */
483    
484      if (current_subject < md->start_used_ptr)
485        md->start_used_ptr = current_subject;
486    
487    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
488    
489    end_code = this_start_code;    end_code = this_start_code;
# Line 442  for (;;) Line 548  for (;;)
548    int i, j;    int i, j;
549    int clen, dlen;    int clen, dlen;
550    unsigned int c, d;    unsigned int c, d;
551      int forced_fail = 0;
552      BOOL could_continue = FALSE;
553    
554    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
555    new state list. */    new state list. */
# Line 455  for (;;) Line 563  for (;;)
563    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
564    workspace[1] = active_count;    workspace[1] = active_count;
565    
566  #ifdef DEBUG  #ifdef PCRE_DEBUG
567    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
568    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars((uschar *)ptr, strlen((char *)ptr), stdout);
569    printf("\"\n");    printf("\"\n");
# Line 499  for (;;) Line 607  for (;;)
607      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
608      const uschar *code;      const uschar *code;
609      int state_offset = current_state->offset;      int state_offset = current_state->offset;
610      int count, codevalue;      int count, codevalue, rrc;
     int chartype, script;  
611    
612  #ifdef DEBUG  #ifdef PCRE_DEBUG
613      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
614      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
615        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
# Line 532  for (;;) Line 639  for (;;)
639          }          }
640        }        }
641    
642      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
643        See the note at the head of this module about the possibility of improving
644        performance here. */
645    
646      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
647        {        {
# Line 549  for (;;) Line 658  for (;;)
658      code = start_code + state_offset;      code = start_code + state_offset;
659      codevalue = *code;      codevalue = *code;
660    
661        /* If this opcode inspects a character, but we are at the end of the
662        subject, remember the fact for use when testing for a partial match. */
663    
664        if (clen == 0 && poptable[codevalue] != 0)
665          could_continue = TRUE;
666    
667      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
668      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
669      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
670      permitted.      permitted.
671    
672      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
673      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long. We
674      Unfortunately, we have to take special action to deal with  \P, \p, and      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
675      \X in this case. To keep the other cases fast, convert these ones to new      this case. To keep the other cases fast, convert these ones to new opcodes.
676      opcodes. */      */
677    
678      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
679        {        {
# Line 576  for (;;) Line 691  for (;;)
691            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
692            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
693            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
694              case OP_NOT_HSPACE:
695              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
696              case OP_NOT_VSPACE:
697              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
698            default: break;            default: break;
699            }            }
700          }          }
# Line 591  for (;;) Line 710  for (;;)
710    
711      switch (codevalue)      switch (codevalue)
712        {        {
713    /* ========================================================================== */
714          /* These cases are never obeyed. This is a fudge that causes a compile-
715          time error if the vectors coptable or poptable, which are indexed by
716          opcode, are not the correct length. It seems to be the only way to do
717          such a check at compile time, as the sizeof() operator does not work
718          in the C preprocessor. */
719    
720          case OP_TABLE_LENGTH:
721          case OP_TABLE_LENGTH +
722            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
723             (sizeof(poptable) == OP_TABLE_LENGTH)):
724          break;
725    
726  /* ========================================================================== */  /* ========================================================================== */
727        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
728        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. Otherwise, unless we have an empty string and
729        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
730          start of the subject, save the match data, shifting up all previous
731        matches so we always have the longest first. */        matches so we always have the longest first. */
732    
733        case OP_KET:        case OP_KET:
# Line 609  for (;;) Line 741  for (;;)
741            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
742            }            }
743          }          }
744        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
745          {          {
746          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
747            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
748              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
749          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
750          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
751          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
752            {              else if (match_count > 0 && ++match_count * 2 >= offsetcount)
753            offsets[0] = current_subject - start_subject;                match_count = 0;
754            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
755            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
756              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
757            }              {
758          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = current_subject - start_subject;
759            {              offsets[1] = ptr - start_subject;
760            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
761              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], current_subject));
762              match_count, rlevel*2-2, SP));              }
763            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
764                {
765                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
766                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
767                  match_count, rlevel*2-2, SP));
768                return match_count;
769                }
770            }            }
771          }          }
772        break;        break;
# Line 676  for (;;) Line 814  for (;;)
814        break;        break;
815    
816        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
817          case OP_SKIPZERO:
818          code += 1 + GET(code, 2);
819          while (*code == OP_ALT) code += GET(code, 1);
820          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
821          break;
822    
823          /*-----------------------------------------------------------------*/
824        case OP_CIRC:        case OP_CIRC:
825        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
826            ((ims & PCRE_MULTILINE) != 0 &&            ((ims & PCRE_MULTILINE) != 0 &&
# Line 714  for (;;) Line 859  for (;;)
859    
860        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
861        case OP_ANY:        case OP_ANY:
862        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
863            { ADD_NEW(state_offset + 1, 0); }
864          break;
865    
866          /*-----------------------------------------------------------------*/
867          case OP_ALLANY:
868          if (clen > 0)
869          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
870        break;        break;
871    
# Line 729  for (;;) Line 880  for (;;)
880        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
881          {          {
882          if (clen == 0 ||          if (clen == 0 ||
883              (IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
884                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
885              ))              ))
886            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 766  for (;;) Line 917  for (;;)
917          if (ptr > start_subject)          if (ptr > start_subject)
918            {            {
919            const uschar *temp = ptr - 1;            const uschar *temp = ptr - 1;
920              if (temp < md->start_used_ptr) md->start_used_ptr = temp;
921  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
922            if (utf8) BACKCHAR(temp);            if (utf8) BACKCHAR(temp);
923  #endif  #endif
# Line 774  for (;;) Line 926  for (;;)
926            }            }
927          else left_word = 0;          else left_word = 0;
928    
929          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
930            else right_word = 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
931            else right_word = 0;
932    
933          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
934            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 783  for (;;) Line 936  for (;;)
936        break;        break;
937    
938    
 #ifdef SUPPORT_UCP  
   
939        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
940        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
941        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
942        */        */
943    
944    #ifdef SUPPORT_UCP
945        case OP_PROP:        case OP_PROP:
946        case OP_NOTPROP:        case OP_NOTPROP:
947        if (clen > 0)        if (clen > 0)
948          {          {
949          BOOL OK;          BOOL OK;
950          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
951          switch(code[1])          switch(code[1])
952            {            {
953            case PT_ANY:            case PT_ANY:
# Line 803  for (;;) Line 955  for (;;)
955            break;            break;
956    
957            case PT_LAMP:            case PT_LAMP:
958            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
959            break;            break;
960    
961            case PT_GC:            case PT_GC:
962            OK = category == code[2];            OK = _pcre_ucp_gentype[prop->chartype] == code[2];
963            break;            break;
964    
965            case PT_PC:            case PT_PC:
966            OK = chartype == code[2];            OK = prop->chartype == code[2];
967            break;            break;
968    
969            case PT_SC:            case PT_SC:
970            OK = script == code[2];            OK = prop->script == code[2];
971            break;            break;
972    
973            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 835  for (;;) Line 987  for (;;)
987  /* ========================================================================== */  /* ========================================================================== */
988        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
989        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
990        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
991        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
992    
993        case OP_TYPEPLUS:        case OP_TYPEPLUS:
994        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 847  for (;;) Line 999  for (;;)
999          {          {
1000          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1001              (c < 256 &&              (c < 256 &&
1002                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1003                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1004            {            {
1005            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 873  for (;;) Line 1022  for (;;)
1022          {          {
1023          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1024              (c < 256 &&              (c < 256 &&
1025                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1026                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1027            {            {
1028            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 898  for (;;) Line 1044  for (;;)
1044          {          {
1045          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1046              (c < 256 &&              (c < 256 &&
1047                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1048                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1049            {            {
1050            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 921  for (;;) Line 1064  for (;;)
1064          {          {
1065          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1066              (c < 256 &&              (c < 256 &&
1067                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1068                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1069            {            {
1070            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
# Line 945  for (;;) Line 1085  for (;;)
1085          {          {
1086          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1087              (c < 256 &&              (c < 256 &&
1088                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1089                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1090            {            {
1091            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 970  for (;;) Line 1107  for (;;)
1107        argument. It keeps the code above fast for the other cases. The argument        argument. It keeps the code above fast for the other cases. The argument
1108        is in the d variable. */        is in the d variable. */
1109    
1110    #ifdef SUPPORT_UCP
1111        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
1112        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1113        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
# Line 978  for (;;) Line 1116  for (;;)
1116        if (clen > 0)        if (clen > 0)
1117          {          {
1118          BOOL OK;          BOOL OK;
1119          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1120          switch(code[2])          switch(code[2])
1121            {            {
1122            case PT_ANY:            case PT_ANY:
# Line 986  for (;;) Line 1124  for (;;)
1124            break;            break;
1125    
1126            case PT_LAMP:            case PT_LAMP:
1127            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1128            break;            break;
1129    
1130            case PT_GC:            case PT_GC:
1131            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1132            break;            break;
1133    
1134            case PT_PC:            case PT_PC:
1135            OK = chartype == code[3];            OK = prop->chartype == code[3];
1136            break;            break;
1137    
1138            case PT_SC:            case PT_SC:
1139            OK = script == code[3];            OK = prop->script == code[3];
1140            break;            break;
1141    
1142            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1027  for (;;) Line 1165  for (;;)
1165        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1166        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1167        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1168        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1169          {          {
1170          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1171          int ncount = 0;          int ncount = 0;
# Line 1041  for (;;) Line 1179  for (;;)
1179            int nd;            int nd;
1180            int ndlen = 1;            int ndlen = 1;
1181            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1182            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1183            ncount++;            ncount++;
1184            nptr += ndlen;            nptr += ndlen;
1185            }            }
# Line 1049  for (;;) Line 1187  for (;;)
1187          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1188          }          }
1189        break;        break;
1190    #endif
1191    
1192        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1193        case OP_ANYNL_EXTRA + OP_TYPEPLUS:        case OP_ANYNL_EXTRA + OP_TYPEPLUS:
# Line 1061  for (;;) Line 1200  for (;;)
1200          int ncount = 0;          int ncount = 0;
1201          switch (c)          switch (c)
1202            {            {
1203              case 0x000b:
1204              case 0x000c:
1205              case 0x0085:
1206              case 0x2028:
1207              case 0x2029:
1208              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1209              goto ANYNL01;
1210    
1211            case 0x000d:            case 0x000d:
1212            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1213            /* Fall through */            /* Fall through */
1214    
1215              ANYNL01:
1216              case 0x000a:
1217              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1218                {
1219                active_count--;           /* Remove non-match possibility */
1220                next_active_state--;
1221                }
1222              count++;
1223              ADD_NEW_DATA(-state_offset, count, ncount);
1224              break;
1225    
1226              default:
1227              break;
1228              }
1229            }
1230          break;
1231    
1232          /*-----------------------------------------------------------------*/
1233          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1234          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1235          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1236          count = current_state->count;  /* Already matched */
1237          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1238          if (clen > 0)
1239            {
1240            BOOL OK;
1241            switch (c)
1242              {
1243            case 0x000a:            case 0x000a:
1244            case 0x000b:            case 0x000b:
1245            case 0x000c:            case 0x000c:
1246              case 0x000d:
1247            case 0x0085:            case 0x0085:
1248            case 0x2028:            case 0x2028:
1249            case 0x2029:            case 0x2029:
1250            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            OK = TRUE;
1251              break;
1252    
1253              default:
1254              OK = FALSE;
1255              break;
1256              }
1257    
1258            if (OK == (d == OP_VSPACE))
1259              {
1260              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1261              {              {
1262              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1263              next_active_state--;              next_active_state--;
1264              }              }
1265            count++;            count++;
1266            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, 0);
1267              }
1268            }
1269          break;
1270    
1271          /*-----------------------------------------------------------------*/
1272          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1273          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1274          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1275          count = current_state->count;  /* Already matched */
1276          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1277          if (clen > 0)
1278            {
1279            BOOL OK;
1280            switch (c)
1281              {
1282              case 0x09:      /* HT */
1283              case 0x20:      /* SPACE */
1284              case 0xa0:      /* NBSP */
1285              case 0x1680:    /* OGHAM SPACE MARK */
1286              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1287              case 0x2000:    /* EN QUAD */
1288              case 0x2001:    /* EM QUAD */
1289              case 0x2002:    /* EN SPACE */
1290              case 0x2003:    /* EM SPACE */
1291              case 0x2004:    /* THREE-PER-EM SPACE */
1292              case 0x2005:    /* FOUR-PER-EM SPACE */
1293              case 0x2006:    /* SIX-PER-EM SPACE */
1294              case 0x2007:    /* FIGURE SPACE */
1295              case 0x2008:    /* PUNCTUATION SPACE */
1296              case 0x2009:    /* THIN SPACE */
1297              case 0x200A:    /* HAIR SPACE */
1298              case 0x202f:    /* NARROW NO-BREAK SPACE */
1299              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1300              case 0x3000:    /* IDEOGRAPHIC SPACE */
1301              OK = TRUE;
1302            break;            break;
1303    
1304            default:            default:
1305              OK = FALSE;
1306            break;            break;
1307            }            }
1308    
1309            if (OK == (d == OP_HSPACE))
1310              {
1311              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1312                {
1313                active_count--;           /* Remove non-match possibility */
1314                next_active_state--;
1315                }
1316              count++;
1317              ADD_NEW_DATA(-state_offset, count, 0);
1318              }
1319          }          }
1320        break;        break;
1321    
1322        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1323    #ifdef SUPPORT_UCP
1324        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
1325        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1326        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
# Line 1102  for (;;) Line 1338  for (;;)
1338        if (clen > 0)        if (clen > 0)
1339          {          {
1340          BOOL OK;          BOOL OK;
1341          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1342          switch(code[2])          switch(code[2])
1343            {            {
1344            case PT_ANY:            case PT_ANY:
# Line 1110  for (;;) Line 1346  for (;;)
1346            break;            break;
1347    
1348            case PT_LAMP:            case PT_LAMP:
1349            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1350            break;            break;
1351    
1352            case PT_GC:            case PT_GC:
1353            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1354            break;            break;
1355    
1356            case PT_PC:            case PT_PC:
1357            OK = chartype == code[3];            OK = prop->chartype == code[3];
1358            break;            break;
1359    
1360            case PT_SC:            case PT_SC:
1361            OK = script == code[3];            OK = prop->script == code[3];
1362            break;            break;
1363    
1364            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1160  for (;;) Line 1396  for (;;)
1396        QS2:        QS2:
1397    
1398        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1399        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1400          {          {
1401          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1402          int ncount = 0;          int ncount = 0;
# Line 1175  for (;;) Line 1411  for (;;)
1411            int nd;            int nd;
1412            int ndlen = 1;            int ndlen = 1;
1413            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1414            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1415            ncount++;            ncount++;
1416            nptr += ndlen;            nptr += ndlen;
1417            }            }
1418          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1419          }          }
1420        break;        break;
1421    #endif
1422    
1423        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1424        case OP_ANYNL_EXTRA + OP_TYPEQUERY:        case OP_ANYNL_EXTRA + OP_TYPEQUERY:
# Line 1202  for (;;) Line 1439  for (;;)
1439          int ncount = 0;          int ncount = 0;
1440          switch (c)          switch (c)
1441            {            {
1442              case 0x000b:
1443              case 0x000c:
1444              case 0x0085:
1445              case 0x2028:
1446              case 0x2029:
1447              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1448              goto ANYNL02;
1449    
1450            case 0x000d:            case 0x000d:
1451            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1452            /* Fall through */            /* Fall through */
1453    
1454              ANYNL02:
1455              case 0x000a:
1456              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1457                  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1458                {
1459                active_count--;           /* Remove non-match possibility */
1460                next_active_state--;
1461                }
1462              ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1463              break;
1464    
1465              default:
1466              break;
1467              }
1468            }
1469          break;
1470    
1471          /*-----------------------------------------------------------------*/
1472          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1473          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1474          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1475          count = 2;
1476          goto QS4;
1477    
1478          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1479          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1480          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1481          count = 0;
1482    
1483          QS4:
1484          ADD_ACTIVE(state_offset + 2, 0);
1485          if (clen > 0)
1486            {
1487            BOOL OK;
1488            switch (c)
1489              {
1490            case 0x000a:            case 0x000a:
1491            case 0x000b:            case 0x000b:
1492            case 0x000c:            case 0x000c:
1493              case 0x000d:
1494            case 0x0085:            case 0x0085:
1495            case 0x2028:            case 0x2028:
1496            case 0x2029:            case 0x2029:
1497            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            OK = TRUE;
1498                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)            break;
1499    
1500              default:
1501              OK = FALSE;
1502              break;
1503              }
1504            if (OK == (d == OP_VSPACE))
1505              {
1506              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1507                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1508              {              {
1509              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1510              next_active_state--;              next_active_state--;
1511              }              }
1512            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, 0);
1513              }
1514            }
1515          break;
1516    
1517          /*-----------------------------------------------------------------*/
1518          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1519          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1520          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1521          count = 2;
1522          goto QS5;
1523    
1524          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1525          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1526          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1527          count = 0;
1528    
1529          QS5:
1530          ADD_ACTIVE(state_offset + 2, 0);
1531          if (clen > 0)
1532            {
1533            BOOL OK;
1534            switch (c)
1535              {
1536              case 0x09:      /* HT */
1537              case 0x20:      /* SPACE */
1538              case 0xa0:      /* NBSP */
1539              case 0x1680:    /* OGHAM SPACE MARK */
1540              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1541              case 0x2000:    /* EN QUAD */
1542              case 0x2001:    /* EM QUAD */
1543              case 0x2002:    /* EN SPACE */
1544              case 0x2003:    /* EM SPACE */
1545              case 0x2004:    /* THREE-PER-EM SPACE */
1546              case 0x2005:    /* FOUR-PER-EM SPACE */
1547              case 0x2006:    /* SIX-PER-EM SPACE */
1548              case 0x2007:    /* FIGURE SPACE */
1549              case 0x2008:    /* PUNCTUATION SPACE */
1550              case 0x2009:    /* THIN SPACE */
1551              case 0x200A:    /* HAIR SPACE */
1552              case 0x202f:    /* NARROW NO-BREAK SPACE */
1553              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1554              case 0x3000:    /* IDEOGRAPHIC SPACE */
1555              OK = TRUE;
1556            break;            break;
1557    
1558            default:            default:
1559              OK = FALSE;
1560            break;            break;
1561            }            }
1562    
1563            if (OK == (d == OP_HSPACE))
1564              {
1565              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1566                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1567                {
1568                active_count--;           /* Remove non-match possibility */
1569                next_active_state--;
1570                }
1571              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1572              }
1573          }          }
1574        break;        break;
1575    
1576        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1577    #ifdef SUPPORT_UCP
1578        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_PROP_EXTRA + OP_TYPEEXACT:
1579        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_PROP_EXTRA + OP_TYPEUPTO:
1580        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
# Line 1236  for (;;) Line 1585  for (;;)
1585        if (clen > 0)        if (clen > 0)
1586          {          {
1587          BOOL OK;          BOOL OK;
1588          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1589          switch(code[4])          switch(code[4])
1590            {            {
1591            case PT_ANY:            case PT_ANY:
# Line 1244  for (;;) Line 1593  for (;;)
1593            break;            break;
1594    
1595            case PT_LAMP:            case PT_LAMP:
1596            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1597            break;            break;
1598    
1599            case PT_GC:            case PT_GC:
1600            OK = category == code[5];            OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1601            break;            break;
1602    
1603            case PT_PC:            case PT_PC:
1604            OK = chartype == code[5];            OK = prop->chartype == code[5];
1605            break;            break;
1606    
1607            case PT_SC:            case PT_SC:
1608            OK = script == code[5];            OK = prop->script == code[5];
1609            break;            break;
1610    
1611            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1289  for (;;) Line 1638  for (;;)
1638        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1639          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1640        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1641        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1642          {          {
1643          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1644          int ncount = 0;          int ncount = 0;
# Line 1303  for (;;) Line 1652  for (;;)
1652            int nd;            int nd;
1653            int ndlen = 1;            int ndlen = 1;
1654            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1655            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1656            ncount++;            ncount++;
1657            nptr += ndlen;            nptr += ndlen;
1658            }            }
# Line 1313  for (;;) Line 1662  for (;;)
1662            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1663          }          }
1664        break;        break;
1665    #endif
1666    
1667        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1668        case OP_ANYNL_EXTRA + OP_TYPEEXACT:        case OP_ANYNL_EXTRA + OP_TYPEEXACT:
# Line 1327  for (;;) Line 1677  for (;;)
1677          int ncount = 0;          int ncount = 0;
1678          switch (c)          switch (c)
1679            {            {
1680              case 0x000b:
1681              case 0x000c:
1682              case 0x0085:
1683              case 0x2028:
1684              case 0x2029:
1685              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1686              goto ANYNL03;
1687    
1688            case 0x000d:            case 0x000d:
1689            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1690            /* Fall through */            /* Fall through */
1691    
1692              ANYNL03:
1693              case 0x000a:
1694              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1695                {
1696                active_count--;           /* Remove non-match possibility */
1697                next_active_state--;
1698                }
1699              if (++count >= GET2(code, 1))
1700                { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1701              else
1702                { ADD_NEW_DATA(-state_offset, count, ncount); }
1703              break;
1704    
1705              default:
1706              break;
1707              }
1708            }
1709          break;
1710    
1711          /*-----------------------------------------------------------------*/
1712          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1713          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1714          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1715          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1716          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1717            { ADD_ACTIVE(state_offset + 4, 0); }
1718          count = current_state->count;  /* Number already matched */
1719          if (clen > 0)
1720            {
1721            BOOL OK;
1722            switch (c)
1723              {
1724            case 0x000a:            case 0x000a:
1725            case 0x000b:            case 0x000b:
1726            case 0x000c:            case 0x000c:
1727              case 0x000d:
1728            case 0x0085:            case 0x0085:
1729            case 0x2028:            case 0x2028:
1730            case 0x2029:            case 0x2029:
1731            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            OK = TRUE;
1732              break;
1733    
1734              default:
1735              OK = FALSE;
1736              }
1737    
1738            if (OK == (d == OP_VSPACE))
1739              {
1740              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1741              {              {
1742              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1743              next_active_state--;              next_active_state--;
1744              }              }
1745            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1746              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1747            else            else
1748              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1749              }
1750            }
1751          break;
1752    
1753          /*-----------------------------------------------------------------*/
1754          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1755          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1756          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1757          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1758          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1759            { ADD_ACTIVE(state_offset + 4, 0); }
1760          count = current_state->count;  /* Number already matched */
1761          if (clen > 0)
1762            {
1763            BOOL OK;
1764            switch (c)
1765              {
1766              case 0x09:      /* HT */
1767              case 0x20:      /* SPACE */
1768              case 0xa0:      /* NBSP */
1769              case 0x1680:    /* OGHAM SPACE MARK */
1770              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1771              case 0x2000:    /* EN QUAD */
1772              case 0x2001:    /* EM QUAD */
1773              case 0x2002:    /* EN SPACE */
1774              case 0x2003:    /* EM SPACE */
1775              case 0x2004:    /* THREE-PER-EM SPACE */
1776              case 0x2005:    /* FOUR-PER-EM SPACE */
1777              case 0x2006:    /* SIX-PER-EM SPACE */
1778              case 0x2007:    /* FIGURE SPACE */
1779              case 0x2008:    /* PUNCTUATION SPACE */
1780              case 0x2009:    /* THIN SPACE */
1781              case 0x200A:    /* HAIR SPACE */
1782              case 0x202f:    /* NARROW NO-BREAK SPACE */
1783              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1784              case 0x3000:    /* IDEOGRAPHIC SPACE */
1785              OK = TRUE;
1786            break;            break;
1787    
1788            default:            default:
1789              OK = FALSE;
1790            break;            break;
1791            }            }
1792    
1793            if (OK == (d == OP_HSPACE))
1794              {
1795              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1796                {
1797                active_count--;           /* Remove non-match possibility */
1798                next_active_state--;
1799                }
1800              if (++count >= GET2(code, 1))
1801                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1802              else
1803                { ADD_NEW_DATA(-state_offset, count, 0); }
1804              }
1805          }          }
1806        break;        break;
1807    
# Line 1379  for (;;) Line 1832  for (;;)
1832            other case of the character. */            other case of the character. */
1833    
1834  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1835            othercase = _pcre_ucp_othercase(c);            othercase = UCD_OTHERCASE(c);
1836  #else  #else
1837            othercase = NOTACHAR;            othercase = NOTACHAR;
1838  #endif  #endif
# Line 1404  for (;;) Line 1857  for (;;)
1857        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
1858    
1859        case OP_EXTUNI:        case OP_EXTUNI:
1860        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1861          {          {
1862          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1863          int ncount = 0;          int ncount = 0;
# Line 1412  for (;;) Line 1865  for (;;)
1865            {            {
1866            int nclen = 1;            int nclen = 1;
1867            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
1868            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
1869            ncount++;            ncount++;
1870            nptr += nclen;            nptr += nclen;
1871            }            }
# Line 1429  for (;;) Line 1882  for (;;)
1882        case OP_ANYNL:        case OP_ANYNL:
1883        if (clen > 0) switch(c)        if (clen > 0) switch(c)
1884          {          {
         case 0x000a:  
1885          case 0x000b:          case 0x000b:
1886          case 0x000c:          case 0x000c:
1887          case 0x0085:          case 0x0085:
1888          case 0x2028:          case 0x2028:
1889          case 0x2029:          case 0x2029:
1890            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1891    
1892            case 0x000a:
1893          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
1894          break;          break;
1895    
1896          case 0x000d:          case 0x000d:
1897          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1898            {            {
# Line 1451  for (;;) Line 1907  for (;;)
1907        break;        break;
1908    
1909        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1910          case OP_NOT_VSPACE:
1911          if (clen > 0) switch(c)
1912            {
1913            case 0x000a:
1914            case 0x000b:
1915            case 0x000c:
1916            case 0x000d:
1917            case 0x0085:
1918            case 0x2028:
1919            case 0x2029:
1920            break;
1921    
1922            default:
1923            ADD_NEW(state_offset + 1, 0);
1924            break;
1925            }
1926          break;
1927    
1928          /*-----------------------------------------------------------------*/
1929          case OP_VSPACE:
1930          if (clen > 0) switch(c)
1931            {
1932            case 0x000a:
1933            case 0x000b:
1934            case 0x000c:
1935            case 0x000d:
1936            case 0x0085:
1937            case 0x2028:
1938            case 0x2029:
1939            ADD_NEW(state_offset + 1, 0);
1940            break;
1941    
1942            default: break;
1943            }
1944          break;
1945    
1946          /*-----------------------------------------------------------------*/
1947          case OP_NOT_HSPACE:
1948          if (clen > 0) switch(c)
1949            {
1950            case 0x09:      /* HT */
1951            case 0x20:      /* SPACE */
1952            case 0xa0:      /* NBSP */
1953            case 0x1680:    /* OGHAM SPACE MARK */
1954            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1955            case 0x2000:    /* EN QUAD */
1956            case 0x2001:    /* EM QUAD */
1957            case 0x2002:    /* EN SPACE */
1958            case 0x2003:    /* EM SPACE */
1959            case 0x2004:    /* THREE-PER-EM SPACE */
1960            case 0x2005:    /* FOUR-PER-EM SPACE */
1961            case 0x2006:    /* SIX-PER-EM SPACE */
1962            case 0x2007:    /* FIGURE SPACE */
1963            case 0x2008:    /* PUNCTUATION SPACE */
1964            case 0x2009:    /* THIN SPACE */
1965            case 0x200A:    /* HAIR SPACE */
1966            case 0x202f:    /* NARROW NO-BREAK SPACE */
1967            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1968            case 0x3000:    /* IDEOGRAPHIC SPACE */
1969            break;
1970    
1971            default:
1972            ADD_NEW(state_offset + 1, 0);
1973            break;
1974            }
1975          break;
1976    
1977          /*-----------------------------------------------------------------*/
1978          case OP_HSPACE:
1979          if (clen > 0) switch(c)
1980            {
1981            case 0x09:      /* HT */
1982            case 0x20:      /* SPACE */
1983            case 0xa0:      /* NBSP */
1984            case 0x1680:    /* OGHAM SPACE MARK */
1985            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1986            case 0x2000:    /* EN QUAD */
1987            case 0x2001:    /* EM QUAD */
1988            case 0x2002:    /* EN SPACE */
1989            case 0x2003:    /* EM SPACE */
1990            case 0x2004:    /* THREE-PER-EM SPACE */
1991            case 0x2005:    /* FOUR-PER-EM SPACE */
1992            case 0x2006:    /* SIX-PER-EM SPACE */
1993            case 0x2007:    /* FIGURE SPACE */
1994            case 0x2008:    /* PUNCTUATION SPACE */
1995            case 0x2009:    /* THIN SPACE */
1996            case 0x200A:    /* HAIR SPACE */
1997            case 0x202f:    /* NARROW NO-BREAK SPACE */
1998            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1999            case 0x3000:    /* IDEOGRAPHIC SPACE */
2000            ADD_NEW(state_offset + 1, 0);
2001            break;
2002            }
2003          break;
2004    
2005          /*-----------------------------------------------------------------*/
2006        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character. This is only used for one-byte
2007        characters, that is, we know that d < 256. The character we are        characters, that is, we know that d < 256. The character we are
2008        checking (c) can be multibyte. */        checking (c) can be multibyte. */
# Line 1481  for (;;) Line 2033  for (;;)
2033            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2034              {              {
2035  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2036              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2037  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2038              }              }
2039            else            else
# Line 1519  for (;;) Line 2071  for (;;)
2071            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2072              {              {
2073  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2074              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2075  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2076              }              }
2077            else            else
# Line 1555  for (;;) Line 2107  for (;;)
2107            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2108              {              {
2109  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2110              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2111  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2112              }              }
2113            else            else
# Line 1587  for (;;) Line 2139  for (;;)
2139            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2140              {              {
2141  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2142              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2143  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2144              }              }
2145            else            else
# Line 1622  for (;;) Line 2174  for (;;)
2174            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2175              {              {
2176  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2177              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2178  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2179              }              }
2180            else            else
# Line 1730  for (;;) Line 2282  for (;;)
2282    
2283  /* ========================================================================== */  /* ========================================================================== */
2284        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2285        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2286          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2287          though the other "backtracking verbs" are not supported. */
2288    
2289          case OP_FAIL:
2290          forced_fail++;    /* Count FAILs for multiple states */
2291          break;
2292    
2293        case OP_ASSERT:        case OP_ASSERT:
2294        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1757  for (;;) Line 2315  for (;;)
2315            rlevel,                               /* function recursion level */            rlevel,                               /* function recursion level */
2316            recursing);                           /* pass on regex recursion */            recursing);                           /* pass on regex recursion */
2317    
2318            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2319          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2320              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2321          }          }
# Line 1768  for (;;) Line 2327  for (;;)
2327          {          {
2328          int local_offsets[1000];          int local_offsets[1000];
2329          int local_workspace[1000];          int local_workspace[1000];
2330          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2331            int condcode;
2332    
2333            /* Because of the way auto-callout works during compile, a callout item
2334            is inserted between OP_COND and an assertion condition. This does not
2335            happen for the other conditions. */
2336    
2337            if (code[LINK_SIZE+1] == OP_CALLOUT)
2338              {
2339              rrc = 0;
2340              if (pcre_callout != NULL)
2341                {
2342                pcre_callout_block cb;
2343                cb.version          = 1;   /* Version 1 of the callout block */
2344                cb.callout_number   = code[LINK_SIZE+2];
2345                cb.offset_vector    = offsets;
2346                cb.subject          = (PCRE_SPTR)start_subject;
2347                cb.subject_length   = end_subject - start_subject;
2348                cb.start_match      = current_subject - start_subject;
2349                cb.current_position = ptr - start_subject;
2350                cb.pattern_position = GET(code, LINK_SIZE + 3);
2351                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2352                cb.capture_top      = 1;
2353                cb.capture_last     = -1;
2354                cb.callout_data     = md->callout_data;
2355                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2356                }
2357              if (rrc > 0) break;                      /* Fail this thread */
2358              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2359              }
2360    
2361            condcode = code[LINK_SIZE+1];
2362    
2363          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2364    
2365          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2366              return PCRE_ERROR_DFA_UCOND;
2367    
2368          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2369    
2370          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2371            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2372    
2373          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2374          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2375          recursed groups. */          recursed groups. */
2376    
2377          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2378            {            {
2379            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2380            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2381            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0)
2382              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2383              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2384            }            }
2385    
2386          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1816  for (;;) Line 2406  for (;;)
2406              rlevel,                               /* function recursion level */              rlevel,                               /* function recursion level */
2407              recursing);                           /* pass on regex recursion */              recursing);                           /* pass on regex recursion */
2408    
2409              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2410            if ((rc >= 0) ==            if ((rc >= 0) ==
2411                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2412              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2413            else            else
2414              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2415            }            }
2416          }          }
2417        break;        break;
# Line 1972  for (;;) Line 2563  for (;;)
2563        /* Handle callouts */        /* Handle callouts */
2564    
2565        case OP_CALLOUT:        case OP_CALLOUT:
2566          rrc = 0;
2567        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2568          {          {
         int rrc;  
2569          pcre_callout_block cb;          pcre_callout_block cb;
2570          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2571          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 1989  for (;;) Line 2580  for (;;)
2580          cb.capture_last     = -1;          cb.capture_last     = -1;
2581          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2582          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2583          }          }
2584          if (rrc == 0)
2585            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2586        break;        break;
2587    
2588    
# Line 2006  for (;;) Line 2598  for (;;)
2598    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2599    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2600    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2601    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
2602    
2603      The "forced_fail" variable counts the number of (*F) encountered for the
2604      character. If it is equal to the original active_count (saved in
2605      workspace[1]) it means that (*F) was found on every active state. In this
2606      case we don't want to give a partial match.
2607    
2608      The "could_continue" variable is true if a state could have continued but
2609      for the fact that the end of the subject was reached. */
2610    
2611    if (new_count <= 0)    if (new_count <= 0)
2612      {      {
2613      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2614          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on */
2615          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2616          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
2617          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2618            ||                                           /* or... */
2619            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2620             match_count < 0)                            /* no matches */
2621            ) &&                                         /* And... */
2622            ptr >= end_subject &&                     /* Reached end of subject */
2623            ptr > current_subject)                    /* Matched non-empty string */
2624        {        {
2625        if (offsetcount >= 2)        if (offsetcount >= 2)
2626          {          {
2627          offsets[0] = current_subject - start_subject;          offsets[0] = md->start_used_ptr - start_subject;
2628          offsets[1] = end_subject - start_subject;          offsets[1] = end_subject - start_subject;
2629          }          }
2630        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
# Line 2073  Returns:          > 0 => number of match Line 2679  Returns:          > 0 => number of match
2679                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2680  */  */
2681    
2682  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2683  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2684    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
2685    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 2160  md->start_code = (const uschar *)argumen Line 2766  md->start_code = (const uschar *)argumen
2766      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
2767  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const unsigned char *)subject;
2768  md->end_subject = end_subject;  md->end_subject = end_subject;
2769    md->start_offset = start_offset;
2770  md->moptions = options;  md->moptions = options;
2771  md->poptions = re->options;  md->poptions = re->options;
2772    
2773    /* If the BSR option is not set at match time, copy what was set
2774    at compile time. */
2775    
2776    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2777      {
2778      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2779        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2780    #ifdef BSR_ANYCRLF
2781      else md->moptions |= PCRE_BSR_ANYCRLF;
2782    #endif
2783      }
2784    
2785  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
2786  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
2787    
# Line 2170  switch ((((options & PCRE_NEWLINE_BITS) Line 2789  switch ((((options & PCRE_NEWLINE_BITS)
2789           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2790    {    {
2791    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2792    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2793    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2794    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2795         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2796    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2797      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2798    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
2799    }    }
2800    
2801  if (newline < 0)  if (newline == -2)
2802      {
2803      md->nltype = NLTYPE_ANYCRLF;
2804      }
2805    else if (newline < 0)
2806    {    {
2807    md->nltype = NLTYPE_ANY;    md->nltype = NLTYPE_ANY;
2808    }    }
# Line 2228  if (md->tables == NULL) md->tables = _pc Line 2852  if (md->tables == NULL) md->tables = _pc
2852  used in a loop when finding where to start. */  used in a loop when finding where to start. */
2853    
2854  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
2855  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
2856  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
2857    
2858  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2239  studied, there may be a bitmap of possib Line 2863  studied, there may be a bitmap of possib
2863    
2864  if (!anchored)  if (!anchored)
2865    {    {
2866    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
2867      {      {
2868      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
2869      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 2247  if (!anchored) Line 2871  if (!anchored)
2871      }      }
2872    else    else
2873      {      {
2874      if (startline && study != NULL &&      if (!startline && study != NULL &&
2875           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
2876        start_bits = study->start_bits;        start_bits = study->start_bits;
2877      }      }
2878    }    }
# Line 2256  if (!anchored) Line 2880  if (!anchored)
2880  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
2881  character" set. */  character" set. */
2882    
2883  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
2884    {    {
2885    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
2886    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 2264  if ((re->options & PCRE_REQCHSET) != 0) Line 2888  if ((re->options & PCRE_REQCHSET) != 0)
2888    }    }
2889    
2890  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2891  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2892  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2893    
2894  for (;;)  for (;;)
2895    {    {
# Line 2276  for (;;) Line 2899  for (;;)
2899      {      {
2900      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2901    
2902      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2903      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2904      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2905      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2906    
2907      if (firstline)      if (firstline)
2908        {        {
2909        const uschar *t = current_subject;        USPTR t = current_subject;
2910    #ifdef SUPPORT_UTF8
2911          if (utf8)
2912            {
2913            while (t < md->end_subject && !IS_NEWLINE(t))
2914              {
2915              t++;
2916              while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2917              }
2918            }
2919          else
2920    #endif
2921        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2922        end_subject = t;        end_subject = t;
2923        }        }
2924    
2925      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2926        starting point is not found. However, there is an option that disables
2927        these, for testing and for ensuring that all callouts do actually occur. */
2928    
2929        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2930        {        {
2931        if (first_byte_caseless)        /* Advance to a known first byte. */
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2932    
2933      /* Or to just after a linebreak for a multiline match if possible */        if (first_byte >= 0)
2934            {
2935            if (first_byte_caseless)
2936              while (current_subject < end_subject &&
2937                     lcc[*current_subject] != first_byte)
2938                current_subject++;
2939            else
2940              while (current_subject < end_subject &&
2941                     *current_subject != first_byte)
2942                current_subject++;
2943            }
2944    
2945      else if (startline)        /* Or to just after a linebreak for a multiline match if possible */
2946        {  
2947        if (current_subject > md->start_subject + start_offset)        else if (startline)
2948          {          {
2949          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (current_subject > md->start_subject + start_offset)
2950            current_subject++;            {
2951    #ifdef SUPPORT_UTF8
2952              if (utf8)
2953                {
2954                while (current_subject < end_subject &&
2955                       !WAS_NEWLINE(current_subject))
2956                  {
2957                  current_subject++;
2958                  while(current_subject < end_subject &&
2959                        (*current_subject & 0xc0) == 0x80)
2960                    current_subject++;
2961                  }
2962                }
2963              else
2964    #endif
2965              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2966                current_subject++;
2967    
2968          /* If we have just passed a CR and the newline option is ANY, and we            /* If we have just passed a CR and the newline option is ANY or
2969          are now at a LF, advance the match position by one more character. */            ANYCRLF, and we are now at a LF, advance the match position by one
2970              more character. */
2971    
2972          if (current_subject[-1] == '\r' &&            if (current_subject[-1] == CHAR_CR &&
2973               md->nltype == NLTYPE_ANY &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2974               current_subject < end_subject &&                 current_subject < end_subject &&
2975               *current_subject == '\n')                 *current_subject == CHAR_NL)
2976            current_subject++;              current_subject++;
2977              }
2978          }          }
       }  
2979    
2980      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2981    
2982      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
2983          {          {
2984          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2985          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2986            else break;            register unsigned int c = *current_subject;
2987              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2988                else break;
2989              }
2990          }          }
2991        }        }
2992    
2993      /* Restore fudged end_subject */      /* Restore fudged end_subject */
2994    
2995      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
   
   /* If req_byte is set, we know that that character must appear in the subject  
   for the match to succeed. If the first character is set, req_byte must be  
   later in the subject; otherwise the test starts at the match point. This  
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested.  
   */  
   
   if (req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
2996    
2997      /* We don't need to repeat the search if we haven't yet reached the      /* The following two optimizations are disabled for partial matching or if
2998      place we found it at last time. */      disabling is explicitly requested (and of course, by the test above, this
2999        code is not obeyed when restarting after a partial match). */
3000    
3001      if (p > req_byte_ptr)      if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3002            (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3003        {        {
3004        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3005          {        is a lower bound; no actual string of that length may actually match the
3006          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3007            {        bytes to avoid spending too much time in this optimization. */
3008            register int pp = *p++;  
3009            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3010            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3011          }          return PCRE_ERROR_NOMATCH;
3012        else  
3013          /* If req_byte is set, we know that that character must appear in the
3014          subject for the match to succeed. If the first character is set, req_byte
3015          must be later in the subject; otherwise the test starts at the match
3016          point. This optimization can save a huge amount of work in patterns with
3017          nested unlimited repeats that aren't going to match. Writing separate
3018          code for cased/caseless versions makes it go faster, as does using an
3019          autoincrement and backing off on a match.
3020    
3021          HOWEVER: when the subject string is very, very long, searching to its end
3022          can take a long time, and give bad performance on quite ordinary
3023          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3024          string... so we don't do this when the string is sufficiently long. */
3025    
3026          if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3027          {          {
3028          while (p < end_subject)          register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3029    
3030            /* We don't need to repeat the search if we haven't yet reached the
3031            place we found it at last time. */
3032    
3033            if (p > req_byte_ptr)
3034            {            {
3035            if (*p++ == req_byte) { p--; break; }            if (req_byte_caseless)
3036            }              {
3037          }              while (p < end_subject)
3038                  {
3039                  register int pp = *p++;
3040                  if (pp == req_byte || pp == req_byte2) { p--; break; }
3041                  }
3042                }
3043              else
3044                {
3045                while (p < end_subject)
3046                  {
3047                  if (*p++ == req_byte) { p--; break; }
3048                  }
3049                }
3050    
3051        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3052        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3053    
3054        if (p >= end_subject) break;            if (p >= end_subject) break;
3055    
3056        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3057        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3058        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3059    
3060        req_byte_ptr = p;            req_byte_ptr = p;
3061              }
3062            }
3063        }        }
3064      }      }   /* End of optimizations that are done when not restarting */
3065    
3066    /* OK, now we can do the business */    /* OK, now we can do the business */
3067    
3068      md->start_used_ptr = current_subject;
3069    
3070    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3071      md,                                /* fixed match data */      md,                                /* fixed match data */
3072      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2425  for (;;) Line 3097  for (;;)
3097      }      }
3098    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3099    
3100    /* If we have just passed a CR and the newline option is CRLF or ANY, and we    /* If we have just passed a CR and we are now at a LF, and the pattern does
3101    are now at a LF, advance the match position by one more character. */    not contain any explicit matches for \r or \n, and the newline option is CRLF
3102      or ANY or ANYCRLF, advance the match position by one more character. */
3103    if (current_subject[-1] == '\r' &&  
3104         (md->nltype == NLTYPE_ANY || md->nllen == 2) &&    if (current_subject[-1] == CHAR_CR &&
3105         current_subject < end_subject &&        current_subject < end_subject &&
3106         *current_subject == '\n')        *current_subject == CHAR_NL &&
3107          (re->flags & PCRE_HASCRORLF) == 0 &&
3108            (md->nltype == NLTYPE_ANY ||
3109             md->nltype == NLTYPE_ANYCRLF ||
3110             md->nllen == 2))
3111      current_subject++;      current_subject++;
3112    
3113    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.145  
changed lines
  Added in v.510

  ViewVC Help
Powered by ViewVC 1.1.5