/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 1425 by ph10, Tue Dec 31 17:44:40 2013 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2013 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 37  POSSIBILITY OF SUCH DAMAGE. Line 38  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
39  */  */
40    
   
41  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
42  alternative matching function that uses a DFA algorithm. This is NOT Perl-  alternative matching function that uses a sort of DFA algorithm (not a true
43  compatible, but it has advantages in certain applications. */  FSM). This is NOT Perl-compatible, but it has advantages in certain
44    applications. */
45    
46    
47    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48    the performance of his patterns greatly. I could not use it as it stood, as it
49    was not thread safe, and made assumptions about pattern sizes. Also, it caused
50    test 7 to loop, and test 9 to crash with a segfault.
51    
52    The issue is the check for duplicate states, which is done by a simple linear
53    search up the state list. (Grep for "duplicate" below to find the code.) For
54    many patterns, there will never be many states active at one time, so a simple
55    linear search is fine. In patterns that have many active states, it might be a
56    bottleneck. The suggested code used an indexing scheme to remember which states
57    had previously been used for each character, and avoided the linear search when
58    it knew there was no chance of a duplicate. This was implemented when adding
59    states to the state lists.
60    
61    I wrote some thread-safe, not-limited code to try something similar at the time
62    of checking for duplicates (instead of when adding states), using index vectors
63    on the stack. It did give a 13% improvement with one specially constructed
64    pattern for certain subject strings, but on other strings and on many of the
65    simpler patterns in the test suite it did worse. The major problem, I think,
66    was the extra time to initialize the index. This had to be done for each call
67    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68    only once - I suspect this was the cause of the problems with the tests.)
69    
70    Overall, I concluded that the gains in some cases did not outweigh the losses
71    in others, so I abandoned this code. */
72    
73    
74    
75    #ifdef HAVE_CONFIG_H
76    #include "config.h"
77    #endif
78    
79    #define NLBLOCK md             /* Block containing newline information */
80    #define PSSTART start_subject  /* Field containing processed string start */
81    #define PSEND   end_subject    /* Field containing processed string end */
82    
83  #include "pcre_internal.h"  #include "pcre_internal.h"
84    
# Line 51  compatible, but it has advantages in cer Line 88  compatible, but it has advantages in cer
88  #define SP "                   "  #define SP "                   "
89    
90    
   
91  /*************************************************  /*************************************************
92  *      Code parameters and static tables         *  *      Code parameters and static tables         *
93  *************************************************/  *************************************************/
94    
95  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96  into others, under special conditions. A gap of 10 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
97  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
98    never stored, so we push them well clear of the normal opcodes. */
99  #define OP_PROP_EXTRA    (EXTRACT_BASIC_MAX+1)  
100  #define OP_EXTUNI_EXTRA  (EXTRACT_BASIC_MAX+11)  #define OP_PROP_EXTRA       300
101    #define OP_EXTUNI_EXTRA     320
102    #define OP_ANYNL_EXTRA      340
103    #define OP_HSPACE_EXTRA     360
104    #define OP_VSPACE_EXTRA     380
105    
106    
107  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
108  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
109  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
110  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111  small value. */  small value. Non-zero values in the table are the offsets from the opcode where
112    the character is to be found. ***NOTE*** If the start of this table is
113    modified, the three tables that follow must also be modified. */
114    
115  static uschar coptable[] = {  static const pcre_uint8 coptable[] = {
116    0,                             /* End                                    */    0,                             /* End                                    */
117    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118    0, 0,                          /* Any, Anybyte                           */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0,                          /* \P, \p                                 */
121      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122      0,                             /* \X                                     */
123      0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
124    1,                             /* Char                                   */    1,                             /* Char                                   */
125    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
126    1,                             /* not                                    */    1,                             /* not                                    */
127      1,                             /* noti                                   */
128    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
129    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131      1+IMM2_SIZE,                   /* exact                                  */
132      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135      1+IMM2_SIZE,                   /* exact I                                */
136      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
138    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140      1+IMM2_SIZE,                   /* NOT exact                              */
141      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144      1+IMM2_SIZE,                   /* NOT exact I                            */
145      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
147    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149      1+IMM2_SIZE,                   /* Type exact                             */
150      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
152    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
154      0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
155    0,                             /* CLASS                                  */    0,                             /* CLASS                                  */
156    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
157    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
158    0,                             /* REF                                    */    0,                             /* REF                                    */
159      0,                             /* REFI                                   */
160      0,                             /* DNREF                                  */
161      0,                             /* DNREFI                                 */
162    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
163    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
164    0,                             /* Alt                                    */    0,                             /* Alt                                    */
165    0,                             /* Ket                                    */    0,                             /* Ket                                    */
166    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
167    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
168      0,                             /* KetRpos                                */
169      0,                             /* Reverse                                */
170    0,                             /* Assert                                 */    0,                             /* Assert                                 */
171    0,                             /* Assert not                             */    0,                             /* Assert not                             */
172    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
173    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
174      0, 0,                          /* ONCE, ONCE_NC                          */
175      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
176      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
177      0, 0,                          /* CREF, DNCREF                           */
178      0, 0,                          /* RREF, DNRREF                           */
179      0,                             /* DEF                                    */
180      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
181      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
182      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
183      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
184      0, 0                           /* CLOSE, SKIPZERO  */
185    };
186    
187    /* This table identifies those opcodes that inspect a character. It is used to
188    remember the fact that a character could have been inspected when the end of
189    the subject is reached. ***NOTE*** If the start of this table is modified, the
190    two tables that follow must also be modified. */
191    
192    static const pcre_uint8 poptable[] = {
193      0,                             /* End                                    */
194      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
195      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
196      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
197      1, 1,                          /* \P, \p                                 */
198      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
199      1,                             /* \X                                     */
200      0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
201      1,                             /* Char                                   */
202      1,                             /* Chari                                  */
203      1,                             /* not                                    */
204      1,                             /* noti                                   */
205      /* Positive single-char repeats                                          */
206      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
207      1, 1, 1,                       /* upto, minupto, exact                   */
208      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
209      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
210      1, 1, 1,                       /* upto I, minupto I, exact I             */
211      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
212      /* Negative single-char repeats - only for chars < 256                   */
213      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
214      1, 1, 1,                       /* NOT upto, minupto, exact               */
215      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
216      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
217      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
218      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
219      /* Positive type repeats                                                 */
220      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
221      1, 1, 1,                       /* Type upto, minupto, exact              */
222      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
223      /* Character class & ref repeats                                         */
224      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
225      1, 1,                          /* CRRANGE, CRMINRANGE                    */
226      1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
227      1,                             /* CLASS                                  */
228      1,                             /* NCLASS                                 */
229      1,                             /* XCLASS - variable length               */
230      0,                             /* REF                                    */
231      0,                             /* REFI                                   */
232      0,                             /* DNREF                                  */
233      0,                             /* DNREFI                                 */
234      0,                             /* RECURSE                                */
235      0,                             /* CALLOUT                                */
236      0,                             /* Alt                                    */
237      0,                             /* Ket                                    */
238      0,                             /* KetRmax                                */
239      0,                             /* KetRmin                                */
240      0,                             /* KetRpos                                */
241    0,                             /* Reverse                                */    0,                             /* Reverse                                */
242    0,                             /* Once                                   */    0,                             /* Assert                                 */
243    0,                             /* COND                                   */    0,                             /* Assert not                             */
244    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
245    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0,                             /* Assert behind not                      */
246    0,                             /* BRANUMBER                              */    0, 0,                          /* ONCE, ONCE_NC                          */
247    0                              /* BRA                                    */    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
248      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
249      0, 0,                          /* CREF, DNCREF                           */
250      0, 0,                          /* RREF, DNRREF                           */
251      0,                             /* DEF                                    */
252      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
253      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
254      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
255      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
256      0, 0                           /* CLOSE, SKIPZERO                        */
257  };  };
258    
259  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
260  and \w */  and \w */
261    
262  static uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
263    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
264    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
265    ctype_space, ctype_space,    ctype_space, ctype_space,
266    ctype_word,  ctype_word,    ctype_word,  ctype_word,
267    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
268  };  };
269    
270  static uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
271    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
272    ctype_digit, 0,    ctype_digit, 0,
273    ctype_space, 0,    ctype_space, 0,
274    ctype_word,  0,    ctype_word,  0,
275    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
276  };  };
277    
278    
# Line 142  these structures in, is a vector of ints Line 284  these structures in, is a vector of ints
284  typedef struct stateblock {  typedef struct stateblock {
285    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
286    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
287    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
288  } stateblock;  } stateblock;
289    
290  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
291    
292    
293  #ifdef DEBUG  #ifdef PCRE_DEBUG
294  /*************************************************  /*************************************************
295  *             Print character string             *  *             Print character string             *
296  *************************************************/  *************************************************/
# Line 165  Returns:       nothing Line 306  Returns:       nothing
306  */  */
307    
308  static void  static void
309  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
310  {  {
311  int c;  pcre_uint32 c;
312  while (length-- > 0)  while (length-- > 0)
313    {    {
314    if (isprint(c = *(p++)))    if (isprint(c = *(p++)))
315      fprintf(f, "%c", c);      fprintf(f, "%c", c);
316    else    else
317      fprintf(f, "\\x%02x", c);      fprintf(f, "\\x{%02x}", c);
318    }    }
319  }  }
320  #endif  #endif
# Line 198  Arguments: Line 339  Arguments:
339    offsetcount       size of same    offsetcount       size of same
340    workspace         vector of workspace    workspace         vector of workspace
341    wscount           size of same    wscount           size of same
   ims               the current ims flags  
342    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
343    
344  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
345                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
346                       -1 => failed to match                       -1 => failed to match
347                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
348    
# Line 215  for the current character, one for the f Line 354  for the current character, one for the f
354      { \      { \
355      next_active_state->offset = (x); \      next_active_state->offset = (x); \
356      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
357      next_active_state++; \      next_active_state++; \
358      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
359      } \      } \
# Line 226  for the current character, one for the f Line 364  for the current character, one for the f
364      { \      { \
365      next_active_state->offset = (x); \      next_active_state->offset = (x); \
366      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
367      next_active_state->data   = (z); \      next_active_state->data   = (z); \
368      next_active_state++; \      next_active_state++; \
369      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 238  for the current character, one for the f Line 375  for the current character, one for the f
375      { \      { \
376      next_new_state->offset = (x); \      next_new_state->offset = (x); \
377      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
378      next_new_state++; \      next_new_state++; \
379      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
380      } \      } \
# Line 249  for the current character, one for the f Line 385  for the current character, one for the f
385      { \      { \
386      next_new_state->offset = (x); \      next_new_state->offset = (x); \
387      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
388      next_new_state->data   = (z); \      next_new_state->data   = (z); \
389      next_new_state++; \      next_new_state++; \
390      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
391          (x), (y), (z), __LINE__)); \
392      } \      } \
393    else return PCRE_ERROR_DFA_WSSIZE    else return PCRE_ERROR_DFA_WSSIZE
394    
# Line 261  for the current character, one for the f Line 397  for the current character, one for the f
397  static int  static int
398  internal_dfa_exec(  internal_dfa_exec(
399    dfa_match_data *md,    dfa_match_data *md,
400    const uschar *this_start_code,    const pcre_uchar *this_start_code,
401    const uschar *current_subject,    const pcre_uchar *current_subject,
402    int start_offset,    int start_offset,
403    int *offsets,    int *offsets,
404    int offsetcount,    int offsetcount,
405    int *workspace,    int *workspace,
406    int wscount,    int wscount,
407    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
408  {  {
409  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
410  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
411    
412  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
413  const uschar *ptr;  const pcre_uchar *ptr;
414  const uschar *end_code;  const pcre_uchar *end_code, *first_op;
415    
416    dfa_recursion_info new_recursive;
417    
418  int active_count, new_count, match_count;  int active_count, new_count, match_count;
419    
420  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
421  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
422    
423  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
424  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
425  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
426    
427  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
428  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
429    #else
430    BOOL utf = FALSE;
431  #endif  #endif
432    
433    BOOL reset_could_continue = FALSE;
434    
435  rlevel++;  rlevel++;
436  offsetcount &= (-2);  offsetcount &= (-2);
437    
# Line 300  wscount = (wscount - (wscount % (INTS_PE Line 440  wscount = (wscount - (wscount % (INTS_PE
440            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
441    
442  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
443    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
444    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
445    
446  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
447  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 313  active_states = (stateblock *)(workspace Line 453  active_states = (stateblock *)(workspace
453  next_new_state = new_states = active_states + wscount;  next_new_state = new_states = active_states + wscount;
454  new_count = 0;  new_count = 0;
455    
456    first_op = this_start_code + 1 + LINK_SIZE +
457      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
458        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
459        ? IMM2_SIZE:0);
460    
461  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
462  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
463  makes it possible to use this function recursively, when we want to stop at a  makes it possible to use this function recursively, when we want to stop at a
# Line 322  If the first opcode in the first alterna Line 467  If the first opcode in the first alterna
467  a backward assertion. In that case, we have to find out the maximum amount to  a backward assertion. In that case, we have to find out the maximum amount to
468  move back, and set up each alternative appropriately. */  move back, and set up each alternative appropriately. */
469    
470  if (this_start_code[1+LINK_SIZE] == OP_REVERSE)  if (*first_op == OP_REVERSE)
471    {    {
472    int max_back = 0;    int max_back = 0;
473    int gone_back;    int gone_back;
# Line 339  if (this_start_code[1+LINK_SIZE] == OP_R Line 484  if (this_start_code[1+LINK_SIZE] == OP_R
484    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
485    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
486    
487  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
488    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
489    
490    if (utf8)    if (utf)
491      {      {
492      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
493        {        {
494        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
495        current_subject--;        current_subject--;
496        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
497        }        }
498      }      }
499    else    else
# Line 360  if (this_start_code[1+LINK_SIZE] == OP_R Line 503  if (this_start_code[1+LINK_SIZE] == OP_R
503    
504      {      {
505      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
506        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
507      current_subject -= gone_back;      current_subject -= gone_back;
508      }      }
509    
510      /* Save the earliest consulted character */
511    
512      if (current_subject < md->start_used_ptr)
513        md->start_used_ptr = current_subject;
514    
515    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
516    
517    end_code = this_start_code;    end_code = this_start_code;
# Line 372  if (this_start_code[1+LINK_SIZE] == OP_R Line 520  if (this_start_code[1+LINK_SIZE] == OP_R
520      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
521      if (back <= gone_back)      if (back <= gone_back)
522        {        {
523        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
524        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
525        }        }
526      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 404  else Line 552  else
552    
553    else    else
554      {      {
555        int length = 1 + LINK_SIZE +
556          ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
557            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
558            ? IMM2_SIZE:0);
559      do      do
560        {        {
561        ADD_NEW(end_code - start_code + 1 + LINK_SIZE, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
562        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
563          length = 1 + LINK_SIZE;
564        }        }
565      while (*end_code == OP_ALT);      while (*end_code == OP_ALT);
566      }      }
# Line 415  else Line 568  else
568    
569  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
570    
571  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
572    
573  /* Loop for scanning the subject */  /* Loop for scanning the subject */
574    
# Line 423  ptr = current_subject; Line 576  ptr = current_subject;
576  for (;;)  for (;;)
577    {    {
578    int i, j;    int i, j;
579    int c, d, clen, dlen;    int clen, dlen;
580      pcre_uint32 c, d;
581      int forced_fail = 0;
582      BOOL partial_newline = FALSE;
583      BOOL could_continue = reset_could_continue;
584      reset_could_continue = FALSE;
585    
586    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
587    new state list. */    new state list. */
# Line 437  for (;;) Line 595  for (;;)
595    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
596    workspace[1] = active_count;    workspace[1] = active_count;
597    
598  #ifdef DEBUG  #ifdef PCRE_DEBUG
599    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
600    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
601    printf("\"\n");    printf("\"\n");
602    
603    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 459  for (;;) Line 617  for (;;)
617    
618    if (ptr < end_subject)    if (ptr < end_subject)
619      {      {
620      clen = 1;      clen = 1;        /* Number of data items in the character */
621  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
622      if (utf8) { GETCHARLEN(c, ptr, clen); } else      GETCHARLENTEST(c, ptr, clen);
623  #endif  /* SUPPORT_UTF8 */  #else
624      c = *ptr;      c = *ptr;
625    #endif  /* SUPPORT_UTF */
626      }      }
627    else    else
628      {      {
629      clen = 0;    /* At end subject */      clen = 0;        /* This indicates the end of the subject */
630      c = -1;      c = NOTACHAR;    /* This value should never actually be used */
631      }      }
632    
633    /* Scan up the active states and act on each one. The result of an action    /* Scan up the active states and act on each one. The result of an action
# Line 479  for (;;) Line 638  for (;;)
638    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
639      {      {
640      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
641      const uschar *code;      BOOL caseless = FALSE;
642        const pcre_uchar *code;
643      int state_offset = current_state->offset;      int state_offset = current_state->offset;
644      int count, codevalue;      int codevalue, rrc;
645      int chartype, script;      int count;
646    
647  #ifdef DEBUG  #ifdef PCRE_DEBUG
648      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
649      if (c < 0) printf("-1\n");      if (clen == 0) printf("EOL\n");
650        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
651          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
652  #endif  #endif
653    
     /* This variable is referred to implicity in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
654      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
655      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
656      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
657        state, arrange for it to be passed on. */
658    
659      if (state_offset < 0)      if (state_offset < 0)
660        {        {
# Line 506  for (;;) Line 663  for (;;)
663          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
664          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
665            current_state->data - 1);            current_state->data - 1);
666            if (could_continue) reset_could_continue = TRUE;
667          continue;          continue;
668          }          }
669        else        else
# Line 514  for (;;) Line 672  for (;;)
672          }          }
673        }        }
674    
675      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
676        See the note at the head of this module about the possibility of improving
677        performance here. */
678    
679      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
680        {        {
# Line 530  for (;;) Line 690  for (;;)
690    
691      code = start_code + state_offset;      code = start_code + state_offset;
692      codevalue = *code;      codevalue = *code;
693      if (codevalue >= OP_BRA) codevalue = OP_BRA; /* All brackets are equal */  
694        /* If this opcode inspects a character, but we are at the end of the
695        subject, remember the fact for use when testing for a partial match. */
696    
697        if (clen == 0 && poptable[codevalue] != 0)
698          could_continue = TRUE;
699    
700      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
701      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
# Line 538  for (;;) Line 703  for (;;)
703      permitted.      permitted.
704    
705      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
706      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long because
707      Unfortunately, we have to take special action to deal with  \P, \p, and      the values are small. We have to take special action to deal with  \P, \p,
708      \X in this case. To keep the other cases fast, convert these ones to new      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
709      opcodes. */      these ones to new opcodes. */
710    
711      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
712        {        {
713        dlen = 1;        dlen = 1;
714  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
715        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
716  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
717        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
718        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
719          {          {
720          if (d == OP_ANYBYTE) return PCRE_ERROR_DFA_UITEM;          switch(d)
721          if (d >= OP_NOTPROP)            {
722            codevalue += (d == OP_EXTUNI)? OP_EXTUNI_EXTRA : OP_PROP_EXTRA;            case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
723              case OP_NOTPROP:
724              case OP_PROP: codevalue += OP_PROP_EXTRA; break;
725              case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
726              case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
727              case OP_NOT_HSPACE:
728              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
729              case OP_NOT_VSPACE:
730              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
731              default: break;
732              }
733          }          }
734        }        }
735      else      else
736        {        {
737        dlen = 0;         /* Not strictly necessary, but compilers moan */        dlen = 0;         /* Not strictly necessary, but compilers moan */
738        d = -1;           /* if these variables are not set. */        d = NOTACHAR;     /* if these variables are not set. */
739        }        }
740    
741    
# Line 568  for (;;) Line 743  for (;;)
743    
744      switch (codevalue)      switch (codevalue)
745        {        {
746    /* ========================================================================== */
747          /* These cases are never obeyed. This is a fudge that causes a compile-
748          time error if the vectors coptable or poptable, which are indexed by
749          opcode, are not the correct length. It seems to be the only way to do
750          such a check at compile time, as the sizeof() operator does not work
751          in the C preprocessor. */
752    
753          case OP_TABLE_LENGTH:
754          case OP_TABLE_LENGTH +
755            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
756             (sizeof(poptable) == OP_TABLE_LENGTH)):
757          break;
758    
759  /* ========================================================================== */  /* ========================================================================== */
760        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
761        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
762        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
763          subpattern, because the possessive subpattern repeats are always handled
764          using recursive calls. Thus, it never adds any new states.
765    
766          At the end of the (sub)pattern, unless we have an empty string and
767          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
768          start of the subject, save the match data, shifting up all previous
769        matches so we always have the longest first. */        matches so we always have the longest first. */
770    
771        case OP_KET:        case OP_KET:
772        case OP_KETRMIN:        case OP_KETRMIN:
773        case OP_KETRMAX:        case OP_KETRMAX:
774          case OP_KETRPOS:
775        if (code != end_code)        if (code != end_code)
776          {          {
777          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 586  for (;;) Line 780  for (;;)
780            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
781            }            }
782          }          }
783        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
784          {          {
785          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
786            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
787              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
788          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
789          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
790          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
791            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
792            offsets[0] = current_subject - start_subject;                match_count = 0;
793            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
794            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
795              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
796            }              {
797          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
798            {              offsets[1] = (int)(ptr - start_subject);
799            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
800              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], (char *)current_subject));
801              match_count, rlevel*2-2, SP));              }
802            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
803                {
804                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
805                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
806                  match_count, rlevel*2-2, SP));
807                return match_count;
808                }
809            }            }
810          }          }
811        break;        break;
# Line 617  for (;;) Line 817  for (;;)
817        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
818        case OP_ALT:        case OP_ALT:
819        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
820        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
821        break;        break;
822    
823        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
824        case OP_BRA:        case OP_BRA:
825          case OP_SBRA:
826        do        do
827          {          {
828          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
829          code += GET(code, 1);          code += GET(code, 1);
830          }          }
831        while (*code == OP_ALT);        while (*code == OP_ALT);
832        break;        break;
833    
834        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
835          case OP_CBRA:
836          case OP_SCBRA:
837          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
838          code += GET(code, 1);
839          while (*code == OP_ALT)
840            {
841            ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
842            code += GET(code, 1);
843            }
844          break;
845    
846          /*-----------------------------------------------------------------*/
847        case OP_BRAZERO:        case OP_BRAZERO:
848        case OP_BRAMINZERO:        case OP_BRAMINZERO:
849        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
850        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
851        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
852        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
853        break;        break;
854    
855        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
856        case OP_BRANUMBER:        case OP_SKIPZERO:
857        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);        code += 1 + GET(code, 2);
858          while (*code == OP_ALT) code += GET(code, 1);
859          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
860        break;        break;
861    
862        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
863        case OP_CIRC:        case OP_CIRC:
864        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE))  
865          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
866        break;        break;
867    
868        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
869        case OP_EOD:        case OP_CIRCM:
870        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
871              (ptr != end_subject && WAS_NEWLINE(ptr)))
872            { ADD_ACTIVE(state_offset + 1, 0); }
873        break;        break;
874    
875        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
876        case OP_OPT:        case OP_EOD:
877        ims = code[1];        if (ptr >= end_subject)
878        ADD_ACTIVE(state_offset + 2, 0);          {
879            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
880              could_continue = TRUE;
881            else { ADD_ACTIVE(state_offset + 1, 0); }
882            }
883        break;        break;
884    
885        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 681  for (;;) Line 901  for (;;)
901    
902        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
903        case OP_ANY:        case OP_ANY:
904        if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0))        if (clen > 0 && !IS_NEWLINE(ptr))
905            {
906            if (ptr + 1 >= md->end_subject &&
907                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
908                NLBLOCK->nltype == NLTYPE_FIXED &&
909                NLBLOCK->nllen == 2 &&
910                c == NLBLOCK->nl[0])
911              {
912              could_continue = partial_newline = TRUE;
913              }
914            else
915              {
916              ADD_NEW(state_offset + 1, 0);
917              }
918            }
919          break;
920    
921          /*-----------------------------------------------------------------*/
922          case OP_ALLANY:
923          if (clen > 0)
924          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
925        break;        break;
926    
927        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
928        case OP_EODN:        case OP_EODN:
929        if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
930            could_continue = TRUE;
931          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
932          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
933        break;        break;
934    
# Line 695  for (;;) Line 936  for (;;)
936        case OP_DOLL:        case OP_DOLL:
937        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
938          {          {
939          if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
940                                  (ims & PCRE_MULTILINE) != 0)))            could_continue = TRUE;
941            else if (clen == 0 ||
942                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
943                   (ptr == end_subject - md->nllen)
944                ))
945              { ADD_ACTIVE(state_offset + 1, 0); }
946            else if (ptr + 1 >= md->end_subject &&
947                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
948                     NLBLOCK->nltype == NLTYPE_FIXED &&
949                     NLBLOCK->nllen == 2 &&
950                     c == NLBLOCK->nl[0])
951              {
952              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
953                {
954                reset_could_continue = TRUE;
955                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
956                }
957              else could_continue = partial_newline = TRUE;
958              }
959            }
960          break;
961    
962          /*-----------------------------------------------------------------*/
963          case OP_DOLLM:
964          if ((md->moptions & PCRE_NOTEOL) == 0)
965            {
966            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
967              could_continue = TRUE;
968            else if (clen == 0 ||
969                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
970            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
971            else if (ptr + 1 >= md->end_subject &&
972                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
973                     NLBLOCK->nltype == NLTYPE_FIXED &&
974                     NLBLOCK->nllen == 2 &&
975                     c == NLBLOCK->nl[0])
976              {
977              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
978                {
979                reset_could_continue = TRUE;
980                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
981                }
982              else could_continue = partial_newline = TRUE;
983              }
984          }          }
985        else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0)        else if (IS_NEWLINE(ptr))
986          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
987        break;        break;
988    
# Line 730  for (;;) Line 1013  for (;;)
1013    
1014          if (ptr > start_subject)          if (ptr > start_subject)
1015            {            {
1016            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
1017  #ifdef SUPPORT_UTF8            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1018            if (utf8) BACKCHAR(temp);  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1019              if (utf) { BACKCHAR(temp); }
1020  #endif  #endif
1021            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
1022    #ifdef SUPPORT_UCP
1023              if ((md->poptions & PCRE_UCP) != 0)
1024                {
1025                if (d == '_') left_word = TRUE; else
1026                  {
1027                  int cat = UCD_CATEGORY(d);
1028                  left_word = (cat == ucp_L || cat == ucp_N);
1029                  }
1030                }
1031              else
1032    #endif
1033            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1034            }            }
1035          else left_word = 0;          else left_word = FALSE;
1036    
1037          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
1038            else right_word = 0;            {
1039    #ifdef SUPPORT_UCP
1040              if ((md->poptions & PCRE_UCP) != 0)
1041                {
1042                if (c == '_') right_word = TRUE; else
1043                  {
1044                  int cat = UCD_CATEGORY(c);
1045                  right_word = (cat == ucp_L || cat == ucp_N);
1046                  }
1047                }
1048              else
1049    #endif
1050              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1051              }
1052            else right_word = FALSE;
1053    
1054          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1055            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 748  for (;;) Line 1057  for (;;)
1057        break;        break;
1058    
1059    
 #ifdef SUPPORT_UCP  
   
1060        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1061        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
1062        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
1063        */        */
1064    
1065    #ifdef SUPPORT_UCP
1066        case OP_PROP:        case OP_PROP:
1067        case OP_NOTPROP:        case OP_NOTPROP:
1068        if (clen > 0)        if (clen > 0)
1069          {          {
1070          BOOL OK;          BOOL OK;
1071          int category = _pcre_ucp_findprop(c, &chartype, &script);          const pcre_uint32 *cp;
1072            const ucd_record * prop = GET_UCD(c);
1073          switch(code[1])          switch(code[1])
1074            {            {
1075            case PT_ANY:            case PT_ANY:
# Line 768  for (;;) Line 1077  for (;;)
1077            break;            break;
1078    
1079            case PT_LAMP:            case PT_LAMP:
1080            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1081                   prop->chartype == ucp_Lt;
1082            break;            break;
1083    
1084            case PT_GC:            case PT_GC:
1085            OK = category == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1086            break;            break;
1087    
1088            case PT_PC:            case PT_PC:
1089            OK = chartype == code[2];            OK = prop->chartype == code[2];
1090            break;            break;
1091    
1092            case PT_SC:            case PT_SC:
1093            OK = script == code[2];            OK = prop->script == code[2];
1094              break;
1095    
1096              /* These are specials for combination cases. */
1097    
1098              case PT_ALNUM:
1099              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1100                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1101              break;
1102    
1103              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1104              which means that Perl space and POSIX space are now identical. PCRE
1105              was changed at release 8.34. */
1106    
1107              case PT_SPACE:    /* Perl space */
1108              case PT_PXSPACE:  /* POSIX space */
1109              switch(c)
1110                {
1111                HSPACE_CASES:
1112                VSPACE_CASES:
1113                OK = TRUE;
1114                break;
1115    
1116                default:
1117                OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1118                break;
1119                }
1120              break;
1121    
1122              case PT_WORD:
1123              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1124                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1125                   c == CHAR_UNDERSCORE;
1126              break;
1127    
1128              case PT_CLIST:
1129              cp = PRIV(ucd_caseless_sets) + code[2];
1130              for (;;)
1131                {
1132                if (c < *cp) { OK = FALSE; break; }
1133                if (c == *cp++) { OK = TRUE; break; }
1134                }
1135              break;
1136    
1137              case PT_UCNC:
1138              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1139                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1140                   c >= 0xe000;
1141            break;            break;
1142    
1143            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 800  for (;;) Line 1157  for (;;)
1157  /* ========================================================================== */  /* ========================================================================== */
1158        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
1159        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
1160        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1161        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1162    
1163        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1164        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
1165          case OP_TYPEPOSPLUS:
1166        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1167        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1168        if (clen > 0)        if (clen > 0)
1169          {          {
1170          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1171                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1172                NLBLOCK->nltype == NLTYPE_FIXED &&
1173                NLBLOCK->nllen == 2 &&
1174                c == NLBLOCK->nl[0])
1175              {
1176              could_continue = partial_newline = TRUE;
1177              }
1178            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1179              (c < 256 &&              (c < 256 &&
1180                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1181                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1182            {            {
1183              if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1184                {
1185                active_count--;            /* Remove non-match possibility */
1186                next_active_state--;
1187                }
1188            count++;            count++;
1189            ADD_NEW(state_offset, count);            ADD_NEW(state_offset, count);
1190            }            }
# Line 823  for (;;) Line 1194  for (;;)
1194        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1195        case OP_TYPEQUERY:        case OP_TYPEQUERY:
1196        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
1197          case OP_TYPEPOSQUERY:
1198        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1199        if (clen > 0)        if (clen > 0)
1200          {          {
1201          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1202                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1203                NLBLOCK->nltype == NLTYPE_FIXED &&
1204                NLBLOCK->nllen == 2 &&
1205                c == NLBLOCK->nl[0])
1206              {
1207              could_continue = partial_newline = TRUE;
1208              }
1209            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1210              (c < 256 &&              (c < 256 &&
1211                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1212                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1213            {            {
1214              if (codevalue == OP_TYPEPOSQUERY)
1215                {
1216                active_count--;            /* Remove non-match possibility */
1217                next_active_state--;
1218                }
1219            ADD_NEW(state_offset + 2, 0);            ADD_NEW(state_offset + 2, 0);
1220            }            }
1221          }          }
# Line 839  for (;;) Line 1224  for (;;)
1224        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1225        case OP_TYPESTAR:        case OP_TYPESTAR:
1226        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
1227          case OP_TYPEPOSSTAR:
1228        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1229        if (clen > 0)        if (clen > 0)
1230          {          {
1231          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1232                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1233                NLBLOCK->nltype == NLTYPE_FIXED &&
1234                NLBLOCK->nllen == 2 &&
1235                c == NLBLOCK->nl[0])
1236              {
1237              could_continue = partial_newline = TRUE;
1238              }
1239            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1240              (c < 256 &&              (c < 256 &&
1241                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1242                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1243            {            {
1244              if (codevalue == OP_TYPEPOSSTAR)
1245                {
1246                active_count--;            /* Remove non-match possibility */
1247                next_active_state--;
1248                }
1249            ADD_NEW(state_offset, 0);            ADD_NEW(state_offset, 0);
1250            }            }
1251          }          }
# Line 854  for (;;) Line 1253  for (;;)
1253    
1254        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1255        case OP_TYPEEXACT:        case OP_TYPEEXACT:
1256          count = current_state->count;  /* Number already matched */
1257          if (clen > 0)
1258            {
1259            if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1260                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1261                NLBLOCK->nltype == NLTYPE_FIXED &&
1262                NLBLOCK->nllen == 2 &&
1263                c == NLBLOCK->nl[0])
1264              {
1265              could_continue = partial_newline = TRUE;
1266              }
1267            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1268                (c < 256 &&
1269                  (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1270                  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1271              {
1272              if (++count >= (int)GET2(code, 1))
1273                { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1274              else
1275                { ADD_NEW(state_offset, count); }
1276              }
1277            }
1278          break;
1279    
1280          /*-----------------------------------------------------------------*/
1281        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1282        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1283        if (codevalue != OP_TYPEEXACT)        case OP_TYPEPOSUPTO:
1284          { ADD_ACTIVE(state_offset + 4, 0); }        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1285        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1286        if (clen > 0)        if (clen > 0)
1287          {          {
1288          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1289                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1290                NLBLOCK->nltype == NLTYPE_FIXED &&
1291                NLBLOCK->nllen == 2 &&
1292                c == NLBLOCK->nl[0])
1293              {
1294              could_continue = partial_newline = TRUE;
1295              }
1296            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1297              (c < 256 &&              (c < 256 &&
1298                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1299                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1300            {            {
1301            if (++count >= GET2(code, 1))            if (codevalue == OP_TYPEPOSUPTO)
1302              { ADD_NEW(state_offset + 4, 0); }              {
1303                active_count--;           /* Remove non-match possibility */
1304                next_active_state--;
1305                }
1306              if (++count >= (int)GET2(code, 1))
1307                { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1308            else            else
1309              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1310            }            }
# Line 876  for (;;) Line 1313  for (;;)
1313    
1314  /* ========================================================================== */  /* ========================================================================== */
1315        /* These are virtual opcodes that are used when something like        /* These are virtual opcodes that are used when something like
1316        OP_TYPEPLUS has OP_PROP, OP_NOTPROP, or OP_EXTUNI as its argument. It        OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1317        keeps the code above fast for the other cases. The argument is in the        argument. It keeps the code above fast for the other cases. The argument
1318        d variable. */        is in the d variable. */
1319    
1320    #ifdef SUPPORT_UCP
1321        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
1322        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1323          case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1324        count = current_state->count;           /* Already matched */        count = current_state->count;           /* Already matched */
1325        if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1326        if (clen > 0)        if (clen > 0)
1327          {          {
1328          BOOL OK;          BOOL OK;
1329          int category = _pcre_ucp_findprop(c, &chartype, &script);          const pcre_uint32 *cp;
1330            const ucd_record * prop = GET_UCD(c);
1331          switch(code[2])          switch(code[2])
1332            {            {
1333            case PT_ANY:            case PT_ANY:
# Line 895  for (;;) Line 1335  for (;;)
1335            break;            break;
1336    
1337            case PT_LAMP:            case PT_LAMP:
1338            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1339                prop->chartype == ucp_Lt;
1340            break;            break;
1341    
1342            case PT_GC:            case PT_GC:
1343            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1344            break;            break;
1345    
1346            case PT_PC:            case PT_PC:
1347            OK = chartype == code[3];            OK = prop->chartype == code[3];
1348            break;            break;
1349    
1350            case PT_SC:            case PT_SC:
1351            OK = script == code[3];            OK = prop->script == code[3];
1352              break;
1353    
1354              /* These are specials for combination cases. */
1355    
1356              case PT_ALNUM:
1357              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1358                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1359              break;
1360    
1361              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1362              which means that Perl space and POSIX space are now identical. PCRE
1363              was changed at release 8.34. */
1364    
1365              case PT_SPACE:    /* Perl space */
1366              case PT_PXSPACE:  /* POSIX space */
1367              switch(c)
1368                {
1369                HSPACE_CASES:
1370                VSPACE_CASES:
1371                OK = TRUE;
1372                break;
1373    
1374                default:
1375                OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1376                break;
1377                }
1378              break;
1379    
1380              case PT_WORD:
1381              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1382                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1383                   c == CHAR_UNDERSCORE;
1384              break;
1385    
1386              case PT_CLIST:
1387              cp = PRIV(ucd_caseless_sets) + code[3];
1388              for (;;)
1389                {
1390                if (c < *cp) { OK = FALSE; break; }
1391                if (c == *cp++) { OK = TRUE; break; }
1392                }
1393              break;
1394    
1395              case PT_UCNC:
1396              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1397                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1398                   c >= 0xe000;
1399            break;            break;
1400    
1401            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 917  for (;;) Line 1405  for (;;)
1405            break;            break;
1406            }            }
1407    
1408          if (OK == (d == OP_PROP)) { count++; ADD_NEW(state_offset, count); }          if (OK == (d == OP_PROP))
1409              {
1410              if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1411                {
1412                active_count--;           /* Remove non-match possibility */
1413                next_active_state--;
1414                }
1415              count++;
1416              ADD_NEW(state_offset, count);
1417              }
1418          }          }
1419        break;        break;
1420    
1421        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1422        case OP_EXTUNI_EXTRA + OP_TYPEPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1423        case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1424          case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1425        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1426        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1427        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1428          {          {
1429          const uschar *nptr = ptr + clen;          int lgb, rgb;
1430            const pcre_uchar *nptr = ptr + clen;
1431          int ncount = 0;          int ncount = 0;
1432            if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1433              {
1434              active_count--;           /* Remove non-match possibility */
1435              next_active_state--;
1436              }
1437            lgb = UCD_GRAPHBREAK(c);
1438          while (nptr < end_subject)          while (nptr < end_subject)
1439            {            {
1440            int nd;            dlen = 1;
1441            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1442            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1443            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1444            ncount++;            ncount++;
1445            nptr += ndlen;            lgb = rgb;
1446              nptr += dlen;
1447            }            }
1448          count++;          count++;
1449          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1450          }          }
1451        break;        break;
1452    #endif
1453    
1454          /*-----------------------------------------------------------------*/
1455          case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1456          case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1457          case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1458          count = current_state->count;  /* Already matched */
1459          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1460          if (clen > 0)
1461            {
1462            int ncount = 0;
1463            switch (c)
1464              {
1465              case CHAR_VT:
1466              case CHAR_FF:
1467              case CHAR_NEL:
1468    #ifndef EBCDIC
1469              case 0x2028:
1470              case 0x2029:
1471    #endif  /* Not EBCDIC */
1472              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1473              goto ANYNL01;
1474    
1475              case CHAR_CR:
1476              if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
1477              /* Fall through */
1478    
1479              ANYNL01:
1480              case CHAR_LF:
1481              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1482                {
1483                active_count--;           /* Remove non-match possibility */
1484                next_active_state--;
1485                }
1486              count++;
1487              ADD_NEW_DATA(-state_offset, count, ncount);
1488              break;
1489    
1490              default:
1491              break;
1492              }
1493            }
1494          break;
1495    
1496          /*-----------------------------------------------------------------*/
1497          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1498          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1499          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1500          count = current_state->count;  /* Already matched */
1501          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1502          if (clen > 0)
1503            {
1504            BOOL OK;
1505            switch (c)
1506              {
1507              VSPACE_CASES:
1508              OK = TRUE;
1509              break;
1510    
1511              default:
1512              OK = FALSE;
1513              break;
1514              }
1515    
1516            if (OK == (d == OP_VSPACE))
1517              {
1518              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1519                {
1520                active_count--;           /* Remove non-match possibility */
1521                next_active_state--;
1522                }
1523              count++;
1524              ADD_NEW_DATA(-state_offset, count, 0);
1525              }
1526            }
1527          break;
1528    
1529          /*-----------------------------------------------------------------*/
1530          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1531          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1532          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1533          count = current_state->count;  /* Already matched */
1534          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1535          if (clen > 0)
1536            {
1537            BOOL OK;
1538            switch (c)
1539              {
1540              HSPACE_CASES:
1541              OK = TRUE;
1542              break;
1543    
1544              default:
1545              OK = FALSE;
1546              break;
1547              }
1548    
1549            if (OK == (d == OP_HSPACE))
1550              {
1551              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1552                {
1553                active_count--;           /* Remove non-match possibility */
1554                next_active_state--;
1555                }
1556              count++;
1557              ADD_NEW_DATA(-state_offset, count, 0);
1558              }
1559            }
1560          break;
1561    
1562        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1563    #ifdef SUPPORT_UCP
1564        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
1565        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1566          case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1567        count = 4;        count = 4;
1568        goto QS1;        goto QS1;
1569    
1570        case OP_PROP_EXTRA + OP_TYPESTAR:        case OP_PROP_EXTRA + OP_TYPESTAR:
1571        case OP_PROP_EXTRA + OP_TYPEMINSTAR:        case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1572          case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1573        count = 0;        count = 0;
1574    
1575        QS1:        QS1:
# Line 960  for (;;) Line 1578  for (;;)
1578        if (clen > 0)        if (clen > 0)
1579          {          {
1580          BOOL OK;          BOOL OK;
1581          int category = _pcre_ucp_findprop(c, &chartype, &script);          const pcre_uint32 *cp;
1582            const ucd_record * prop = GET_UCD(c);
1583          switch(code[2])          switch(code[2])
1584            {            {
1585            case PT_ANY:            case PT_ANY:
# Line 968  for (;;) Line 1587  for (;;)
1587            break;            break;
1588    
1589            case PT_LAMP:            case PT_LAMP:
1590            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1591                prop->chartype == ucp_Lt;
1592            break;            break;
1593    
1594            case PT_GC:            case PT_GC:
1595            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1596            break;            break;
1597    
1598            case PT_PC:            case PT_PC:
1599            OK = chartype == code[3];            OK = prop->chartype == code[3];
1600            break;            break;
1601    
1602            case PT_SC:            case PT_SC:
1603            OK = script == code[3];            OK = prop->script == code[3];
1604            break;            break;
1605    
1606            /* Should never occur, but keep compilers from grumbling. */            /* These are specials for combination cases. */
1607    
1608            default:            case PT_ALNUM:
1609            OK = codevalue != OP_PROP;            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1610                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1611              break;
1612    
1613              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1614              which means that Perl space and POSIX space are now identical. PCRE
1615              was changed at release 8.34. */
1616    
1617              case PT_SPACE:    /* Perl space */
1618              case PT_PXSPACE:  /* POSIX space */
1619              switch(c)
1620                {
1621                HSPACE_CASES:
1622                VSPACE_CASES:
1623                OK = TRUE;
1624                break;
1625    
1626                default:
1627                OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1628                break;
1629                }
1630              break;
1631    
1632              case PT_WORD:
1633              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1634                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1635                   c == CHAR_UNDERSCORE;
1636              break;
1637    
1638              case PT_CLIST:
1639              cp = PRIV(ucd_caseless_sets) + code[3];
1640              for (;;)
1641                {
1642                if (c < *cp) { OK = FALSE; break; }
1643                if (c == *cp++) { OK = TRUE; break; }
1644                }
1645              break;
1646    
1647              case PT_UCNC:
1648              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1649                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1650                   c >= 0xe000;
1651              break;
1652    
1653              /* Should never occur, but keep compilers from grumbling. */
1654    
1655              default:
1656              OK = codevalue != OP_PROP;
1657            break;            break;
1658            }            }
1659    
1660          if (OK == (d == OP_PROP)) { ADD_NEW(state_offset + count, 0); }          if (OK == (d == OP_PROP))
1661              {
1662              if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1663                  codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1664                {
1665                active_count--;           /* Remove non-match possibility */
1666                next_active_state--;
1667                }
1668              ADD_NEW(state_offset + count, 0);
1669              }
1670          }          }
1671        break;        break;
1672    
1673        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1674        case OP_EXTUNI_EXTRA + OP_TYPEQUERY:        case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1675        case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:        case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1676          case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1677        count = 2;        count = 2;
1678        goto QS2;        goto QS2;
1679    
1680        case OP_EXTUNI_EXTRA + OP_TYPESTAR:        case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1681        case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:        case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1682          case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1683        count = 0;        count = 0;
1684    
1685        QS2:        QS2:
1686    
1687        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1688        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1689          {          {
1690          const uschar *nptr = ptr + clen;          int lgb, rgb;
1691            const pcre_uchar *nptr = ptr + clen;
1692          int ncount = 0;          int ncount = 0;
1693            if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1694                codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1695              {
1696              active_count--;           /* Remove non-match possibility */
1697              next_active_state--;
1698              }
1699            lgb = UCD_GRAPHBREAK(c);
1700          while (nptr < end_subject)          while (nptr < end_subject)
1701            {            {
1702            int nd;            dlen = 1;
1703            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1704            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1705            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1706            ncount++;            ncount++;
1707            nptr += ndlen;            lgb = rgb;
1708              nptr += dlen;
1709            }            }
1710          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1711          }          }
1712        break;        break;
1713    #endif
1714    
1715          /*-----------------------------------------------------------------*/
1716          case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1717          case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1718          case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1719          count = 2;
1720          goto QS3;
1721    
1722          case OP_ANYNL_EXTRA + OP_TYPESTAR:
1723          case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1724          case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1725          count = 0;
1726    
1727          QS3:
1728          ADD_ACTIVE(state_offset + 2, 0);
1729          if (clen > 0)
1730            {
1731            int ncount = 0;
1732            switch (c)
1733              {
1734              case CHAR_VT:
1735              case CHAR_FF:
1736              case CHAR_NEL:
1737    #ifndef EBCDIC
1738              case 0x2028:
1739              case 0x2029:
1740    #endif  /* Not EBCDIC */
1741              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1742              goto ANYNL02;
1743    
1744              case CHAR_CR:
1745              if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
1746              /* Fall through */
1747    
1748              ANYNL02:
1749              case CHAR_LF:
1750              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1751                  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1752                {
1753                active_count--;           /* Remove non-match possibility */
1754                next_active_state--;
1755                }
1756              ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1757              break;
1758    
1759              default:
1760              break;
1761              }
1762            }
1763          break;
1764    
1765          /*-----------------------------------------------------------------*/
1766          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1767          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1768          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1769          count = 2;
1770          goto QS4;
1771    
1772          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1773          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1774          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1775          count = 0;
1776    
1777          QS4:
1778          ADD_ACTIVE(state_offset + 2, 0);
1779          if (clen > 0)
1780            {
1781            BOOL OK;
1782            switch (c)
1783              {
1784              VSPACE_CASES:
1785              OK = TRUE;
1786              break;
1787    
1788              default:
1789              OK = FALSE;
1790              break;
1791              }
1792            if (OK == (d == OP_VSPACE))
1793              {
1794              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1795                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1796                {
1797                active_count--;           /* Remove non-match possibility */
1798                next_active_state--;
1799                }
1800              ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1801              }
1802            }
1803          break;
1804    
1805          /*-----------------------------------------------------------------*/
1806          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1807          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1808          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1809          count = 2;
1810          goto QS5;
1811    
1812          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1813          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1814          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1815          count = 0;
1816    
1817          QS5:
1818          ADD_ACTIVE(state_offset + 2, 0);
1819          if (clen > 0)
1820            {
1821            BOOL OK;
1822            switch (c)
1823              {
1824              HSPACE_CASES:
1825              OK = TRUE;
1826              break;
1827    
1828              default:
1829              OK = FALSE;
1830              break;
1831              }
1832    
1833            if (OK == (d == OP_HSPACE))
1834              {
1835              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1836                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1837                {
1838                active_count--;           /* Remove non-match possibility */
1839                next_active_state--;
1840                }
1841              ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1842              }
1843            }
1844          break;
1845    
1846        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1847    #ifdef SUPPORT_UCP
1848        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_PROP_EXTRA + OP_TYPEEXACT:
1849        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_PROP_EXTRA + OP_TYPEUPTO:
1850        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1851          case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1852        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1853          { ADD_ACTIVE(state_offset + 6, 0); }          { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1854        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1855        if (clen > 0)        if (clen > 0)
1856          {          {
1857          BOOL OK;          BOOL OK;
1858          int category = _pcre_ucp_findprop(c, &chartype, &script);          const pcre_uint32 *cp;
1859          switch(code[4])          const ucd_record * prop = GET_UCD(c);
1860            switch(code[1 + IMM2_SIZE + 1])
1861            {            {
1862            case PT_ANY:            case PT_ANY:
1863            OK = TRUE;            OK = TRUE;
1864            break;            break;
1865    
1866            case PT_LAMP:            case PT_LAMP:
1867            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1868                prop->chartype == ucp_Lt;
1869            break;            break;
1870    
1871            case PT_GC:            case PT_GC:
1872            OK = category == code[5];            OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1873            break;            break;
1874    
1875            case PT_PC:            case PT_PC:
1876            OK = chartype == code[5];            OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1877            break;            break;
1878    
1879            case PT_SC:            case PT_SC:
1880            OK = script == code[5];            OK = prop->script == code[1 + IMM2_SIZE + 2];
1881              break;
1882    
1883              /* These are specials for combination cases. */
1884    
1885              case PT_ALNUM:
1886              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1887                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1888              break;
1889    
1890              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1891              which means that Perl space and POSIX space are now identical. PCRE
1892              was changed at release 8.34. */
1893    
1894              case PT_SPACE:    /* Perl space */
1895              case PT_PXSPACE:  /* POSIX space */
1896              switch(c)
1897                {
1898                HSPACE_CASES:
1899                VSPACE_CASES:
1900                OK = TRUE;
1901                break;
1902    
1903                default:
1904                OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1905                break;
1906                }
1907              break;
1908    
1909              case PT_WORD:
1910              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1911                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1912                   c == CHAR_UNDERSCORE;
1913              break;
1914    
1915              case PT_CLIST:
1916              cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1917              for (;;)
1918                {
1919                if (c < *cp) { OK = FALSE; break; }
1920                if (c == *cp++) { OK = TRUE; break; }
1921                }
1922              break;
1923    
1924              case PT_UCNC:
1925              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1926                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1927                   c >= 0xe000;
1928            break;            break;
1929    
1930            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1066  for (;;) Line 1936  for (;;)
1936    
1937          if (OK == (d == OP_PROP))          if (OK == (d == OP_PROP))
1938            {            {
1939            if (++count >= GET2(code, 1))            if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1940              { ADD_NEW(state_offset + 6, 0); }              {
1941                active_count--;           /* Remove non-match possibility */
1942                next_active_state--;
1943                }
1944              if (++count >= (int)GET2(code, 1))
1945                { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1946            else            else
1947              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1948            }            }
# Line 1078  for (;;) Line 1953  for (;;)
1953        case OP_EXTUNI_EXTRA + OP_TYPEEXACT:        case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1954        case OP_EXTUNI_EXTRA + OP_TYPEUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1955        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1956          case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1957        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1958          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1959        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1960        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1961          {          {
1962          const uschar *nptr = ptr + clen;          int lgb, rgb;
1963            const pcre_uchar *nptr = ptr + clen;
1964          int ncount = 0;          int ncount = 0;
1965            if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1966              {
1967              active_count--;           /* Remove non-match possibility */
1968              next_active_state--;
1969              }
1970            lgb = UCD_GRAPHBREAK(c);
1971          while (nptr < end_subject)          while (nptr < end_subject)
1972            {            {
1973            int nd;            dlen = 1;
1974            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1975            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1976            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1977            ncount++;            ncount++;
1978            nptr += ndlen;            lgb = rgb;
1979              nptr += dlen;
1980            }            }
1981          if (++count >= GET2(code, 1))          if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1982            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              reset_could_continue = TRUE;
1983            if (++count >= (int)GET2(code, 1))
1984              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1985          else          else
1986            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1987          }          }
1988        break;        break;
1989    #endif
1990    
1991          /*-----------------------------------------------------------------*/
1992          case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1993          case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1994          case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1995          case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1996          if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1997            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1998          count = current_state->count;  /* Number already matched */
1999          if (clen > 0)
2000            {
2001            int ncount = 0;
2002            switch (c)
2003              {
2004              case CHAR_VT:
2005              case CHAR_FF:
2006              case CHAR_NEL:
2007    #ifndef EBCDIC
2008              case 0x2028:
2009              case 0x2029:
2010    #endif  /* Not EBCDIC */
2011              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2012              goto ANYNL03;
2013    
2014              case CHAR_CR:
2015              if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
2016              /* Fall through */
2017    
2018              ANYNL03:
2019              case CHAR_LF:
2020              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2021                {
2022                active_count--;           /* Remove non-match possibility */
2023                next_active_state--;
2024                }
2025              if (++count >= (int)GET2(code, 1))
2026                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2027              else
2028                { ADD_NEW_DATA(-state_offset, count, ncount); }
2029              break;
2030    
2031              default:
2032              break;
2033              }
2034            }
2035          break;
2036    
2037          /*-----------------------------------------------------------------*/
2038          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2039          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2040          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2041          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2042          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2043            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2044          count = current_state->count;  /* Number already matched */
2045          if (clen > 0)
2046            {
2047            BOOL OK;
2048            switch (c)
2049              {
2050              VSPACE_CASES:
2051              OK = TRUE;
2052              break;
2053    
2054              default:
2055              OK = FALSE;
2056              }
2057    
2058            if (OK == (d == OP_VSPACE))
2059              {
2060              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2061                {
2062                active_count--;           /* Remove non-match possibility */
2063                next_active_state--;
2064                }
2065              if (++count >= (int)GET2(code, 1))
2066                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2067              else
2068                { ADD_NEW_DATA(-state_offset, count, 0); }
2069              }
2070            }
2071          break;
2072    
2073          /*-----------------------------------------------------------------*/
2074          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2075          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2076          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2077          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2078          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2079            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2080          count = current_state->count;  /* Number already matched */
2081          if (clen > 0)
2082            {
2083            BOOL OK;
2084            switch (c)
2085              {
2086              HSPACE_CASES:
2087              OK = TRUE;
2088              break;
2089    
2090              default:
2091              OK = FALSE;
2092              break;
2093              }
2094    
2095            if (OK == (d == OP_HSPACE))
2096              {
2097              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2098                {
2099                active_count--;           /* Remove non-match possibility */
2100                next_active_state--;
2101                }
2102              if (++count >= (int)GET2(code, 1))
2103                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2104              else
2105                { ADD_NEW_DATA(-state_offset, count, 0); }
2106              }
2107            }
2108          break;
2109    
2110  /* ========================================================================== */  /* ========================================================================== */
2111        /* These opcodes are followed by a character that is usually compared        /* These opcodes are followed by a character that is usually compared
# Line 1113  for (;;) Line 2119  for (;;)
2119        break;        break;
2120    
2121        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2122        case OP_CHARNC:        case OP_CHARI:
2123        if (clen == 0) break;        if (clen == 0) break;
2124    
2125  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2126        if (utf8)        if (utf)
2127          {          {
2128          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2129            {            {
2130            int othercase;            unsigned int othercase;
2131            if (c < 128) othercase = fcc[c]; else            if (c < 128)
2132                othercase = fcc[c];
2133            /* If we have Unicode property support, we can use it to test the            else
2134            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2135                other case of the character. */
2136  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2137            othercase = _pcre_ucp_othercase(c);              othercase = UCD_OTHERCASE(c);
2138  #else  #else
2139            othercase = -1;              othercase = NOTACHAR;
2140  #endif  #endif
2141    
2142            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2143            }            }
2144          }          }
2145        else        else
2146  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2147          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2148          {          {
2149          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2150              { ADD_NEW(state_offset + 2, 0); }
2151          }          }
2152        break;        break;
2153    
# Line 1153  for (;;) Line 2159  for (;;)
2159        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2160    
2161        case OP_EXTUNI:        case OP_EXTUNI:
2162        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
2163          {          {
2164          const uschar *nptr = ptr + clen;          int lgb, rgb;
2165            const pcre_uchar *nptr = ptr + clen;
2166          int ncount = 0;          int ncount = 0;
2167            lgb = UCD_GRAPHBREAK(c);
2168          while (nptr < end_subject)          while (nptr < end_subject)
2169            {            {
2170            int nclen = 1;            dlen = 1;
2171            GETCHARLEN(c, nptr, nclen);            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2172            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            rgb = UCD_GRAPHBREAK(d);
2173              if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2174            ncount++;            ncount++;
2175            nptr += nclen;            lgb = rgb;
2176              nptr += dlen;
2177            }            }
2178            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2179                reset_could_continue = TRUE;
2180          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2181          }          }
2182        break;        break;
2183  #endif  #endif
2184    
2185        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2186        /* Match a negated single character. This is only used for one-byte        /* This is a tricky like EXTUNI because it too can match more than one
2187        characters, that is, we know that d < 256. The character we are        character (when CR is followed by LF). In this case, set up a negative
2188        checking (c) can be multibyte. */        state to wait for one character to pass before continuing. */
2189    
2190          case OP_ANYNL:
2191          if (clen > 0) switch(c)
2192            {
2193            case CHAR_VT:
2194            case CHAR_FF:
2195            case CHAR_NEL:
2196    #ifndef EBCDIC
2197            case 0x2028:
2198            case 0x2029:
2199    #endif  /* Not EBCDIC */
2200            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2201    
2202            case CHAR_LF:
2203            ADD_NEW(state_offset + 1, 0);
2204            break;
2205    
2206            case CHAR_CR:
2207            if (ptr + 1 >= end_subject)
2208              {
2209              ADD_NEW(state_offset + 1, 0);
2210              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2211                reset_could_continue = TRUE;
2212              }
2213            else if (ptr[1] == CHAR_LF)
2214              {
2215              ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2216              }
2217            else
2218              {
2219              ADD_NEW(state_offset + 1, 0);
2220              }
2221            break;
2222            }
2223          break;
2224    
2225          /*-----------------------------------------------------------------*/
2226          case OP_NOT_VSPACE:
2227          if (clen > 0) switch(c)
2228            {
2229            VSPACE_CASES:
2230            break;
2231    
2232            default:
2233            ADD_NEW(state_offset + 1, 0);
2234            break;
2235            }
2236          break;
2237    
2238          /*-----------------------------------------------------------------*/
2239          case OP_VSPACE:
2240          if (clen > 0) switch(c)
2241            {
2242            VSPACE_CASES:
2243            ADD_NEW(state_offset + 1, 0);
2244            break;
2245    
2246            default:
2247            break;
2248            }
2249          break;
2250    
2251          /*-----------------------------------------------------------------*/
2252          case OP_NOT_HSPACE:
2253          if (clen > 0) switch(c)
2254            {
2255            HSPACE_CASES:
2256            break;
2257    
2258            default:
2259            ADD_NEW(state_offset + 1, 0);
2260            break;
2261            }
2262          break;
2263    
2264          /*-----------------------------------------------------------------*/
2265          case OP_HSPACE:
2266          if (clen > 0) switch(c)
2267            {
2268            HSPACE_CASES:
2269            ADD_NEW(state_offset + 1, 0);
2270            break;
2271    
2272            default:
2273            break;
2274            }
2275          break;
2276    
2277          /*-----------------------------------------------------------------*/
2278          /* Match a negated single character casefully. */
2279    
2280        case OP_NOT:        case OP_NOT:
2281          if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2282          break;
2283    
2284          /*-----------------------------------------------------------------*/
2285          /* Match a negated single character caselessly. */
2286    
2287          case OP_NOTI:
2288        if (clen > 0)        if (clen > 0)
2289          {          {
2290          int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;          unsigned int otherd;
2291          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }  #ifdef SUPPORT_UTF
2292            if (utf && d >= 128)
2293              {
2294    #ifdef SUPPORT_UCP
2295              otherd = UCD_OTHERCASE(d);
2296    #endif  /* SUPPORT_UCP */
2297              }
2298            else
2299    #endif  /* SUPPORT_UTF */
2300            otherd = TABLE_GET(d, fcc, d);
2301            if (c != d && c != otherd)
2302              { ADD_NEW(state_offset + dlen + 1, 0); }
2303          }          }
2304        break;        break;
2305    
2306        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2307          case OP_PLUSI:
2308          case OP_MINPLUSI:
2309          case OP_POSPLUSI:
2310          case OP_NOTPLUSI:
2311          case OP_NOTMINPLUSI:
2312          case OP_NOTPOSPLUSI:
2313          caseless = TRUE;
2314          codevalue -= OP_STARI - OP_STAR;
2315    
2316          /* Fall through */
2317        case OP_PLUS:        case OP_PLUS:
2318        case OP_MINPLUS:        case OP_MINPLUS:
2319          case OP_POSPLUS:
2320        case OP_NOTPLUS:        case OP_NOTPLUS:
2321        case OP_NOTMINPLUS:        case OP_NOTMINPLUS:
2322          case OP_NOTPOSPLUS:
2323        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
2324        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2325        if (clen > 0)        if (clen > 0)
2326          {          {
2327          int otherd = -1;          pcre_uint32 otherd = NOTACHAR;
2328          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2329            {            {
2330  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2331            if (utf8 && d >= 128)            if (utf && d >= 128)
2332              {              {
2333  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2334              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2335  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2336              }              }
2337            else            else
2338  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2339            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2340            }            }
2341          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2342            { count++; ADD_NEW(state_offset, count); }            {
2343              if (count > 0 &&
2344                  (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2345                {
2346                active_count--;             /* Remove non-match possibility */
2347                next_active_state--;
2348                }
2349              count++;
2350              ADD_NEW(state_offset, count);
2351              }
2352          }          }
2353        break;        break;
2354    
2355        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2356          case OP_QUERYI:
2357          case OP_MINQUERYI:
2358          case OP_POSQUERYI:
2359          case OP_NOTQUERYI:
2360          case OP_NOTMINQUERYI:
2361          case OP_NOTPOSQUERYI:
2362          caseless = TRUE;
2363          codevalue -= OP_STARI - OP_STAR;
2364          /* Fall through */
2365        case OP_QUERY:        case OP_QUERY:
2366        case OP_MINQUERY:        case OP_MINQUERY:
2367          case OP_POSQUERY:
2368        case OP_NOTQUERY:        case OP_NOTQUERY:
2369        case OP_NOTMINQUERY:        case OP_NOTMINQUERY:
2370          case OP_NOTPOSQUERY:
2371        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2372        if (clen > 0)        if (clen > 0)
2373          {          {
2374          int otherd = -1;          pcre_uint32 otherd = NOTACHAR;
2375          if ((ims && PCRE_CASELESS) != 0)          if (caseless)
2376            {            {
2377  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2378            if (utf8 && d >= 128)            if (utf && d >= 128)
2379              {              {
2380  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2381              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2382  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2383              }              }
2384            else            else
2385  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2386            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2387            }            }
2388          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2389            { ADD_NEW(state_offset + dlen + 1, 0); }            {
2390              if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2391                {
2392                active_count--;            /* Remove non-match possibility */
2393                next_active_state--;
2394                }
2395              ADD_NEW(state_offset + dlen + 1, 0);
2396              }
2397          }          }
2398        break;        break;
2399    
2400        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2401          case OP_STARI:
2402          case OP_MINSTARI:
2403          case OP_POSSTARI:
2404          case OP_NOTSTARI:
2405          case OP_NOTMINSTARI:
2406          case OP_NOTPOSSTARI:
2407          caseless = TRUE;
2408          codevalue -= OP_STARI - OP_STAR;
2409          /* Fall through */
2410        case OP_STAR:        case OP_STAR:
2411        case OP_MINSTAR:        case OP_MINSTAR:
2412          case OP_POSSTAR:
2413        case OP_NOTSTAR:        case OP_NOTSTAR:
2414        case OP_NOTMINSTAR:        case OP_NOTMINSTAR:
2415          case OP_NOTPOSSTAR:
2416        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2417        if (clen > 0)        if (clen > 0)
2418          {          {
2419          int otherd = -1;          pcre_uint32 otherd = NOTACHAR;
2420          if ((ims && PCRE_CASELESS) != 0)          if (caseless)
2421            {            {
2422  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2423            if (utf8 && d >= 128)            if (utf && d >= 128)
2424              {              {
2425  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2426              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2427  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2428              }              }
2429            else            else
2430  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2431            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2432            }            }
2433          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2434            { ADD_NEW(state_offset, 0); }            {
2435              if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2436                {
2437                active_count--;            /* Remove non-match possibility */
2438                next_active_state--;
2439                }
2440              ADD_NEW(state_offset, 0);
2441              }
2442          }          }
2443        break;        break;
2444    
2445        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2446          case OP_EXACTI:
2447          case OP_NOTEXACTI:
2448          caseless = TRUE;
2449          codevalue -= OP_STARI - OP_STAR;
2450          /* Fall through */
2451        case OP_EXACT:        case OP_EXACT:
2452          case OP_NOTEXACT:
2453          count = current_state->count;  /* Number already matched */
2454          if (clen > 0)
2455            {
2456            pcre_uint32 otherd = NOTACHAR;
2457            if (caseless)
2458              {
2459    #ifdef SUPPORT_UTF
2460              if (utf && d >= 128)
2461                {
2462    #ifdef SUPPORT_UCP
2463                otherd = UCD_OTHERCASE(d);
2464    #endif  /* SUPPORT_UCP */
2465                }
2466              else
2467    #endif  /* SUPPORT_UTF */
2468              otherd = TABLE_GET(d, fcc, d);
2469              }
2470            if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2471              {
2472              if (++count >= (int)GET2(code, 1))
2473                { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2474              else
2475                { ADD_NEW(state_offset, count); }
2476              }
2477            }
2478          break;
2479    
2480          /*-----------------------------------------------------------------*/
2481          case OP_UPTOI:
2482          case OP_MINUPTOI:
2483          case OP_POSUPTOI:
2484          case OP_NOTUPTOI:
2485          case OP_NOTMINUPTOI:
2486          case OP_NOTPOSUPTOI:
2487          caseless = TRUE;
2488          codevalue -= OP_STARI - OP_STAR;
2489          /* Fall through */
2490        case OP_UPTO:        case OP_UPTO:
2491        case OP_MINUPTO:        case OP_MINUPTO:
2492        case OP_NOTEXACT:        case OP_POSUPTO:
2493        case OP_NOTUPTO:        case OP_NOTUPTO:
2494        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2495        if (codevalue != OP_EXACT && codevalue != OP_NOTEXACT)        case OP_NOTPOSUPTO:
2496          { ADD_ACTIVE(state_offset + dlen + 3, 0); }        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2497        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2498        if (clen > 0)        if (clen > 0)
2499          {          {
2500          int otherd = -1;          pcre_uint32 otherd = NOTACHAR;
2501          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2502            {            {
2503  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2504            if (utf8 && d >= 128)            if (utf && d >= 128)
2505              {              {
2506  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2507              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2508  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2509              }              }
2510            else            else
2511  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2512            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2513            }            }
2514          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2515            {            {
2516            if (++count >= GET2(code, 1))            if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2517              { ADD_NEW(state_offset + dlen + 3, 0); }              {
2518                active_count--;             /* Remove non-match possibility */
2519                next_active_state--;
2520                }
2521              if (++count >= (int)GET2(code, 1))
2522                { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2523            else            else
2524              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2525            }            }
# Line 1311  for (;;) Line 2536  for (;;)
2536          {          {
2537          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2538          int next_state_offset;          int next_state_offset;
2539          const uschar *ecode;          const pcre_uchar *ecode;
2540    
2541          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2542          can set isinclass from it. */          can set isinclass from it. */
2543    
2544          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2545            {            {
2546            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2547            if (clen > 0)            if (clen > 0)
2548              {              {
2549              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2550                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2551              }              }
2552            }            }
2553    
# Line 1333  for (;;) Line 2558  for (;;)
2558          else          else
2559           {           {
2560           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2561           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2562           }           }
2563    
2564          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2565          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2566          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2567    
2568          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2569    
2570          switch (*ecode)          switch (*ecode)
2571            {            {
2572            case OP_CRSTAR:            case OP_CRSTAR:
2573            case OP_CRMINSTAR:            case OP_CRMINSTAR:
2574              case OP_CRPOSSTAR:
2575            ADD_ACTIVE(next_state_offset + 1, 0);            ADD_ACTIVE(next_state_offset + 1, 0);
2576            if (isinclass) { ADD_NEW(state_offset, 0); }            if (isinclass)
2577                {
2578                if (*ecode == OP_CRPOSSTAR)
2579                  {
2580                  active_count--;           /* Remove non-match possibility */
2581                  next_active_state--;
2582                  }
2583                ADD_NEW(state_offset, 0);
2584                }
2585            break;            break;
2586    
2587            case OP_CRPLUS:            case OP_CRPLUS:
2588            case OP_CRMINPLUS:            case OP_CRMINPLUS:
2589              case OP_CRPOSPLUS:
2590            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2591            if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }            if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2592            if (isinclass) { count++; ADD_NEW(state_offset, count); }            if (isinclass)
2593                {
2594                if (count > 0 && *ecode == OP_CRPOSPLUS)
2595                  {
2596                  active_count--;           /* Remove non-match possibility */
2597                  next_active_state--;
2598                  }
2599                count++;
2600                ADD_NEW(state_offset, count);
2601                }
2602            break;            break;
2603    
2604            case OP_CRQUERY:            case OP_CRQUERY:
2605            case OP_CRMINQUERY:            case OP_CRMINQUERY:
2606              case OP_CRPOSQUERY:
2607            ADD_ACTIVE(next_state_offset + 1, 0);            ADD_ACTIVE(next_state_offset + 1, 0);
2608            if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }            if (isinclass)
2609                {
2610                if (*ecode == OP_CRPOSQUERY)
2611                  {
2612                  active_count--;           /* Remove non-match possibility */
2613                  next_active_state--;
2614                  }
2615                ADD_NEW(next_state_offset + 1, 0);
2616                }
2617            break;            break;
2618    
2619            case OP_CRRANGE:            case OP_CRRANGE:
2620            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2621              case OP_CRPOSRANGE:
2622            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2623            if (count >= GET2(ecode, 1))            if (count >= (int)GET2(ecode, 1))
2624              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2625            if (isinclass)            if (isinclass)
2626              {              {
2627              if (++count >= GET2(ecode, 3))              int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2628                { ADD_NEW(next_state_offset + 5, 0); }              if (*ecode == OP_CRPOSRANGE)
2629                  {
2630                  active_count--;           /* Remove non-match possibility */
2631                  next_active_state--;
2632                  }
2633                if (++count >= max && max != 0)   /* Max 0 => no limit */
2634                  { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2635              else              else
2636                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2637              }              }
# Line 1386  for (;;) Line 2646  for (;;)
2646    
2647  /* ========================================================================== */  /* ========================================================================== */
2648        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2649        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2650          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2651          though the other "backtracking verbs" are not supported. */
2652    
2653          case OP_FAIL:
2654          forced_fail++;    /* Count FAILs for multiple states */
2655          break;
2656    
2657        case OP_ASSERT:        case OP_ASSERT:
2658        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1396  for (;;) Line 2662  for (;;)
2662          int rc;          int rc;
2663          int local_offsets[2];          int local_offsets[2];
2664          int local_workspace[1000];          int local_workspace[1000];
2665          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2666    
2667          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2668    
# Line 1404  for (;;) Line 2670  for (;;)
2670            md,                                   /* static match data */            md,                                   /* static match data */
2671            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2672            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2673            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2674            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2675            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2676            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2677            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2678            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2679    
2680            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2681          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2682              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2683          }          }
2684        break;        break;
2685    
2686        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2687        case OP_COND:        case OP_COND:
2688          case OP_SCOND:
2689          {          {
2690          int local_offsets[1000];          int local_offsets[1000];
2691          int local_workspace[1000];          int local_workspace[1000];
2692          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2693            int condcode;
2694    
2695          /* The only supported version of OP_CREF is for the value 0xffff, which          /* Because of the way auto-callout works during compile, a callout item
2696          means "test if in a recursion". */          is inserted between OP_COND and an assertion condition. This does not
2697            happen for the other conditions. */
2698    
2699          if (condcode == OP_CREF)          if (code[LINK_SIZE+1] == OP_CALLOUT)
2700            {            {
2701            int value = GET2(code, LINK_SIZE+2);            rrc = 0;
2702            if (value != 0xffff) return PCRE_ERROR_DFA_UCOND;            if (PUBL(callout) != NULL)
2703            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }              {
2704              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              PUBL(callout_block) cb;
2705                cb.version          = 1;   /* Version 1 of the callout block */
2706                cb.callout_number   = code[LINK_SIZE+2];
2707                cb.offset_vector    = offsets;
2708    #if defined COMPILE_PCRE8
2709                cb.subject          = (PCRE_SPTR)start_subject;
2710    #elif defined COMPILE_PCRE16
2711                cb.subject          = (PCRE_SPTR16)start_subject;
2712    #elif defined COMPILE_PCRE32
2713                cb.subject          = (PCRE_SPTR32)start_subject;
2714    #endif
2715                cb.subject_length   = (int)(end_subject - start_subject);
2716                cb.start_match      = (int)(current_subject - start_subject);
2717                cb.current_position = (int)(ptr - start_subject);
2718                cb.pattern_position = GET(code, LINK_SIZE + 3);
2719                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2720                cb.capture_top      = 1;
2721                cb.capture_last     = -1;
2722                cb.callout_data     = md->callout_data;
2723                cb.mark             = NULL;   /* No (*MARK) support */
2724                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2725                }
2726              if (rrc > 0) break;                      /* Fail this thread */
2727              code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2728              }
2729    
2730            condcode = code[LINK_SIZE+1];
2731    
2732            /* Back reference conditions and duplicate named recursion conditions
2733            are not supported */
2734    
2735            if (condcode == OP_CREF || condcode == OP_DNCREF ||
2736                condcode == OP_DNRREF)
2737              return PCRE_ERROR_DFA_UCOND;
2738    
2739            /* The DEFINE condition is always false */
2740    
2741            if (condcode == OP_DEF)
2742              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2743    
2744            /* The only supported version of OP_RREF is for the value RREF_ANY,
2745            which means "test if in any recursion". We can't test for specifically
2746            recursed groups. */
2747    
2748            else if (condcode == OP_RREF)
2749              {
2750              int value = GET2(code, LINK_SIZE + 2);
2751              if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2752              if (md->recursive != NULL)
2753                { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2754              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2755            }            }
2756    
2757          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1441  for (;;) Line 2759  for (;;)
2759          else          else
2760            {            {
2761            int rc;            int rc;
2762            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2763            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2764    
2765            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2766    
# Line 1450  for (;;) Line 2768  for (;;)
2768              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2769              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2770              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2771              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2772              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2773              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2774              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2775              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2776              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2777    
2778              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2779            if ((rc >= 0) ==            if ((rc >= 0) ==
2780                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2781              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2782            else            else
2783              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2784            }            }
2785          }          }
2786        break;        break;
# Line 1471  for (;;) Line 2788  for (;;)
2788        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2789        case OP_RECURSE:        case OP_RECURSE:
2790          {          {
2791            dfa_recursion_info *ri;
2792          int local_offsets[1000];          int local_offsets[1000];
2793          int local_workspace[1000];          int local_workspace[1000];
2794            const pcre_uchar *callpat = start_code + GET(code, 1);
2795            int recno = (callpat == md->start_code)? 0 :
2796              GET2(callpat, 1 + LINK_SIZE);
2797          int rc;          int rc;
2798    
2799          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2800            recursing + 1));  
2801            /* Check for repeating a recursion without advancing the subject
2802            pointer. This should catch convoluted mutual recursions. (Some simple
2803            cases are caught at compile time.) */
2804    
2805            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2806              if (recno == ri->group_num && ptr == ri->subject_position)
2807                return PCRE_ERROR_RECURSELOOP;
2808    
2809            /* Remember this recursion and where we started it so as to
2810            catch infinite loops. */
2811    
2812            new_recursive.group_num = recno;
2813            new_recursive.subject_position = ptr;
2814            new_recursive.prevrec = md->recursive;
2815            md->recursive = &new_recursive;
2816    
2817          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2818            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2819            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2820            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2821            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2822            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2823            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2824            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2825            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2826            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
2827            rlevel,                               /* function recursion level */  
2828            recursing + 1);                       /* regex recurse level */          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2829    
2830          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2831            recursing + 1, rc));            rc));
2832    
2833          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2834    
# Line 1506  for (;;) Line 2842  for (;;)
2842            {            {
2843            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2844              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2845              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2846              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2847                if (utf)
2848                  {
2849                  const pcre_uchar *p = start_subject + local_offsets[rc];
2850                  const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2851                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2852                  }
2853    #endif
2854              if (charcount > 0)              if (charcount > 0)
2855                {                {
2856                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 1525  for (;;) Line 2866  for (;;)
2866        break;        break;
2867    
2868        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2869          case OP_BRAPOS:
2870          case OP_SBRAPOS:
2871          case OP_CBRAPOS:
2872          case OP_SCBRAPOS:
2873          case OP_BRAPOSZERO:
2874            {
2875            int charcount, matched_count;
2876            const pcre_uchar *local_ptr = ptr;
2877            BOOL allow_zero;
2878    
2879            if (codevalue == OP_BRAPOSZERO)
2880              {
2881              allow_zero = TRUE;
2882              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2883              }
2884            else allow_zero = FALSE;
2885    
2886            /* Loop to match the subpattern as many times as possible as if it were
2887            a complete pattern. */
2888    
2889            for (matched_count = 0;; matched_count++)
2890              {
2891              int local_offsets[2];
2892              int local_workspace[1000];
2893    
2894              int rc = internal_dfa_exec(
2895                md,                                   /* fixed match data */
2896                code,                                 /* this subexpression's code */
2897                local_ptr,                            /* where we currently are */
2898                (int)(ptr - start_subject),           /* start offset */
2899                local_offsets,                        /* offset vector */
2900                sizeof(local_offsets)/sizeof(int),    /* size of same */
2901                local_workspace,                      /* workspace vector */
2902                sizeof(local_workspace)/sizeof(int),  /* size of same */
2903                rlevel);                              /* function recursion level */
2904    
2905              /* Failed to match */
2906    
2907              if (rc < 0)
2908                {
2909                if (rc != PCRE_ERROR_NOMATCH) return rc;
2910                break;
2911                }
2912    
2913              /* Matched: break the loop if zero characters matched. */
2914    
2915              charcount = local_offsets[1] - local_offsets[0];
2916              if (charcount == 0) break;
2917              local_ptr += charcount;    /* Advance temporary position ptr */
2918              }
2919    
2920            /* At this point we have matched the subpattern matched_count
2921            times, and local_ptr is pointing to the character after the end of the
2922            last match. */
2923    
2924            if (matched_count > 0 || allow_zero)
2925              {
2926              const pcre_uchar *end_subpattern = code;
2927              int next_state_offset;
2928    
2929              do { end_subpattern += GET(end_subpattern, 1); }
2930                while (*end_subpattern == OP_ALT);
2931              next_state_offset =
2932                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2933    
2934              /* Optimization: if there are no more active states, and there
2935              are no new states yet set up, then skip over the subject string
2936              right here, to save looping. Otherwise, set up the new state to swing
2937              into action when the end of the matched substring is reached. */
2938    
2939              if (i + 1 >= active_count && new_count == 0)
2940                {
2941                ptr = local_ptr;
2942                clen = 0;
2943                ADD_NEW(next_state_offset, 0);
2944                }
2945              else
2946                {
2947                const pcre_uchar *p = ptr;
2948                const pcre_uchar *pp = local_ptr;
2949                charcount = (int)(pp - p);
2950    #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2951                if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2952    #endif
2953                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2954                }
2955              }
2956            }
2957          break;
2958    
2959          /*-----------------------------------------------------------------*/
2960        case OP_ONCE:        case OP_ONCE:
2961          case OP_ONCE_NC:
2962          {          {
2963          int local_offsets[2];          int local_offsets[2];
2964          int local_workspace[1000];          int local_workspace[1000];
# Line 1534  for (;;) Line 2967  for (;;)
2967            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2968            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2969            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2970            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2971            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2972            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2973            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2974            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2975            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2976    
2977          if (rc >= 0)          if (rc >= 0)
2978            {            {
2979            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2980            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2981            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2982    
2983            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2984              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2985            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2986                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2987    
2988            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2989            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 1559  for (;;) Line 2991  for (;;)
2991    
2992            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2993                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2994              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2995    
2996            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2997            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 1574  for (;;) Line 3006  for (;;)
3006            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
3007            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
3008            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
3009            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
3010    
3011            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
3012              {              {
# Line 1597  for (;;) Line 3029  for (;;)
3029              }              }
3030            else            else
3031              {              {
3032              const uschar *p = start_subject + local_offsets[0];  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3033              const uschar *pp = start_subject + local_offsets[1];              if (utf)
3034              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;                {
3035                  const pcre_uchar *p = start_subject + local_offsets[0];
3036                  const pcre_uchar *pp = start_subject + local_offsets[1];
3037                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
3038                  }
3039    #endif
3040              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
3041              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
3042                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
3043              }              }
   
3044            }            }
3045          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
3046          }          }
# Line 1615  for (;;) Line 3051  for (;;)
3051        /* Handle callouts */        /* Handle callouts */
3052    
3053        case OP_CALLOUT:        case OP_CALLOUT:
3054        if (pcre_callout != NULL)        rrc = 0;
3055          if (PUBL(callout) != NULL)
3056          {          {
3057          int rrc;          PUBL(callout_block) cb;
         pcre_callout_block cb;  
3058          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
3059          cb.callout_number   = code[1];          cb.callout_number   = code[1];
3060          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
3061    #if defined COMPILE_PCRE8
3062          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
3063          cb.subject_length   = end_subject - start_subject;  #elif defined COMPILE_PCRE16
3064          cb.start_match      = current_subject - start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
3065          cb.current_position = ptr - start_subject;  #elif defined COMPILE_PCRE32
3066            cb.subject          = (PCRE_SPTR32)start_subject;
3067    #endif
3068            cb.subject_length   = (int)(end_subject - start_subject);
3069            cb.start_match      = (int)(current_subject - start_subject);
3070            cb.current_position = (int)(ptr - start_subject);
3071          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
3072          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
3073          cb.capture_top      = 1;          cb.capture_top      = 1;
3074          cb.capture_last     = -1;          cb.capture_last     = -1;
3075          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
3076          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
3077          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3078          }          }
3079          if (rrc == 0)
3080            { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3081        break;        break;
3082    
3083    
# Line 1649  for (;;) Line 3093  for (;;)
3093    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
3094    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
3095    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
3096    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
3097    
3098      The "forced_fail" variable counts the number of (*F) encountered for the
3099      character. If it is equal to the original active_count (saved in
3100      workspace[1]) it means that (*F) was found on every active state. In this
3101      case we don't want to give a partial match.
3102    
3103      The "could_continue" variable is true if a state could have continued but
3104      for the fact that the end of the subject was reached. */
3105    
3106    if (new_count <= 0)    if (new_count <= 0)
3107      {      {
3108      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
3109          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on, and */
3110          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
3111          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
3112          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3113        {          ||                                           /* or... */
3114        if (offsetcount >= 2)          ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3115          {           match_count < 0)                            /* no matches */
3116          offsets[0] = current_subject - start_subject;          ) &&                                         /* And... */
3117          offsets[1] = end_subject - start_subject;          (
3118          }          partial_newline ||                           /* Either partial NL */
3119              (                                          /* or ... */
3120              ptr >= end_subject &&                /* End of subject and */
3121              ptr > md->start_used_ptr)            /* Inspected non-empty string */
3122              )
3123            )
3124        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
       }  
   
3125      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3126        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3127        rlevel*2-2, SP));        rlevel*2-2, SP));
3128      return match_count;      break;        /* In effect, "return", but see the comment below */
3129      }      }
3130    
3131    /* One or more states are active for the next character. */    /* One or more states are active for the next character. */
# Line 1678  for (;;) Line 3133  for (;;)
3133    ptr += clen;    /* Advance to next subject character */    ptr += clen;    /* Advance to next subject character */
3134    }               /* Loop to move along the subject string */    }               /* Loop to move along the subject string */
3135    
3136  /* Control never gets here, but we must keep the compiler happy. */  /* Control gets here from "break" a few lines above. We do it this way because
3137    if we use "return" above, we have compiler trouble. Some compilers warn if
3138    there's nothing here because they think the function doesn't return a value. On
3139    the other hand, if we put a dummy statement here, some more clever compilers
3140    complain that it can't be reached. Sigh. */
3141    
3142  DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n"  return match_count;
   "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP));  
 return PCRE_ERROR_NOMATCH;  
3143  }  }
3144    
3145    
# Line 1698  is not anchored. Line 3155  is not anchored.
3155    
3156  Arguments:  Arguments:
3157    argument_re     points to the compiled expression    argument_re     points to the compiled expression
3158    extra_data      points to extra data or is NULL (not currently used)    extra_data      points to extra data or is NULL
3159    subject         points to the subject string    subject         points to the subject string
3160    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
3161    start_offset    where to start in the subject string    start_offset    where to start in the subject string
# Line 1714  Returns:          > 0 => number of match Line 3171  Returns:          > 0 => number of match
3171                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3172  */  */
3173    
3174  PCRE_DATA_SCOPE int  #if defined COMPILE_PCRE8
3175    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3176  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3177    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3178    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3179    #elif defined COMPILE_PCRE16
3180    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3181    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3182      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3183      int offsetcount, int *workspace, int wscount)
3184    #elif defined COMPILE_PCRE32
3185    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3186    pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3187      PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3188      int offsetcount, int *workspace, int wscount)
3189    #endif
3190  {  {
3191  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3192  dfa_match_data match_block;  dfa_match_data match_block;
3193  BOOL utf8, anchored, startline, firstline;  dfa_match_data *md = &match_block;
3194  const uschar *current_subject, *end_subject, *lcc;  BOOL utf, anchored, startline, firstline;
3195    const pcre_uchar *current_subject, *end_subject;
 pcre_study_data internal_study;  
3196  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3197    
3198  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3199  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3200  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3201  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3202  int first_byte = -1;  pcre_uchar first_char = 0;
3203  int req_byte = -1;  pcre_uchar first_char2 = 0;
3204  int req_byte2 = -1;  pcre_uchar req_char = 0;
3205    pcre_uchar req_char2 = 0;
3206    int newline;
3207    
3208  /* Plausibility checks */  /* Plausibility checks */
3209    
# Line 1743  if (re == NULL || subject == NULL || wor Line 3212  if (re == NULL || subject == NULL || wor
3212     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3213  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3214  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3215    if (length < 0) return PCRE_ERROR_BADLENGTH;
3216    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3217    
3218  /* We need to find the pointer to any study data before we test for byte  /* Check that the first field in the block is the magic number. If it is not,
3219  flipping, so we scan the extra_data block first. This may set two fields in the  return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3220  match block, so we must initialize them beforehand. However, the other fields  REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3221  in the match block must not be set until after the byte flipping. */  means that the pattern is likely compiled with different endianness. */
3222    
3223    if (re->magic_number != MAGIC_NUMBER)
3224      return re->magic_number == REVERSED_MAGIC_NUMBER?
3225        PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3226    if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3227    
3228  match_block.tables = re->tables;  /* If restarting after a partial match, do some sanity checks on the contents
3229  match_block.callout_data = NULL;  of the workspace. */
3230    
3231    if ((options & PCRE_DFA_RESTART) != 0)
3232      {
3233      if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3234        workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3235          return PCRE_ERROR_DFA_BADRESTART;
3236      }
3237    
3238    /* Set up study, callout, and table data */
3239    
3240    md->tables = re->tables;
3241    md->callout_data = NULL;
3242    
3243  if (extra_data != NULL)  if (extra_data != NULL)
3244    {    {
# Line 1761  if (extra_data != NULL) Line 3249  if (extra_data != NULL)
3249    if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)    if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3250      return PCRE_ERROR_DFA_UMLIMIT;      return PCRE_ERROR_DFA_UMLIMIT;
3251    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3252      match_block.callout_data = extra_data->callout_data;      md->callout_data = extra_data->callout_data;
3253    if ((flags & PCRE_EXTRA_TABLES) != 0)    if ((flags & PCRE_EXTRA_TABLES) != 0)
3254      match_block.tables = extra_data->tables;      md->tables = extra_data->tables;
   }  
   
 /* Check that the first field in the block is the magic number. If it is not,  
 test for a regex that was compiled on a host of opposite endianness. If this is  
 the case, flipped values are put in internal_re and internal_study if there was  
 study data too. */  
   
 if (re->magic_number != MAGIC_NUMBER)  
   {  
   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);  
   if (re == NULL) return PCRE_ERROR_BADMAGIC;  
   if (study != NULL) study = &internal_study;  
3255    }    }
3256    
3257  /* Set some local values */  /* Set some local values */
3258    
3259  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3260  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3261  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3262    
3263  utf8 = (re->options & PCRE_UTF8) != 0;  #ifdef SUPPORT_UTF
3264    /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3265    utf = (re->options & PCRE_UTF8) != 0;
3266    #else
3267    utf = FALSE;
3268    #endif
3269    
3270  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3271    (re->options & PCRE_ANCHORED) != 0;    (re->options & PCRE_ANCHORED) != 0;
3272    
3273  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3274    
3275  match_block.start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3276      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3277  match_block.start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3278  match_block.end_subject = end_subject;  md->end_subject = end_subject;
3279  match_block.moptions = options;  md->start_offset = start_offset;
3280  match_block.poptions = re->options;  md->moptions = options;
3281    md->poptions = re->options;
3282    
3283    /* If the BSR option is not set at match time, copy what was set
3284    at compile time. */
3285    
3286    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3287      {
3288      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3289        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3290    #ifdef BSR_ANYCRLF
3291      else md->moptions |= PCRE_BSR_ANYCRLF;
3292    #endif
3293      }
3294    
3295    /* Handle different types of newline. The three bits give eight cases. If
3296    nothing is set at run time, whatever was used at compile time applies. */
3297    
3298    switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3299             PCRE_NEWLINE_BITS)
3300      {
3301      case 0: newline = NEWLINE; break;   /* Compile-time default */
3302      case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3303      case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3304      case PCRE_NEWLINE_CR+
3305           PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3306      case PCRE_NEWLINE_ANY: newline = -1; break;
3307      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3308      default: return PCRE_ERROR_BADNEWLINE;
3309      }
3310    
3311    if (newline == -2)
3312      {
3313      md->nltype = NLTYPE_ANYCRLF;
3314      }
3315    else if (newline < 0)
3316      {
3317      md->nltype = NLTYPE_ANY;
3318      }
3319    else
3320      {
3321      md->nltype = NLTYPE_FIXED;
3322      if (newline > 255)
3323        {
3324        md->nllen = 2;
3325        md->nl[0] = (newline >> 8) & 255;
3326        md->nl[1] = newline & 255;
3327        }
3328      else
3329        {
3330        md->nllen = 1;
3331        md->nl[0] = newline;
3332        }
3333      }
3334    
3335  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3336  back the character offset. */  back the character offset. */
3337    
3338  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3339  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3340    {    {
3341    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3342      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3343    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3344      {      {
3345      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3346        {        {
3347        tb &= 0xc0;        offsets[0] = erroroffset;
3348        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3349        }        }
3350    #if defined COMPILE_PCRE8
3351        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3352          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3353    #elif defined COMPILE_PCRE16
3354        return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3355          PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3356    #elif defined COMPILE_PCRE32
3357        return PCRE_ERROR_BADUTF32;
3358    #endif
3359      }      }
3360    #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3361      if (start_offset > 0 && start_offset < length &&
3362            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3363        return P