/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 150 by ph10, Tue Apr 17 08:22:40 2007 UTC | revision 979 by ph10, Sun Jun 17 19:08:41 2012 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 37  POSSIBILITY OF SUCH DAMAGE. Line 38  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
39  */  */
40    
   
41  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
42  alternative matching function that uses a sort of DFA algorithm (not a true  alternative matching function that uses a sort of DFA algorithm (not a true
43  FSM). This is NOT Perl- compatible, but it has advantages in certain  FSM). This is NOT Perl-compatible, but it has advantages in certain
44  applications. */  applications. */
45    
46    
47    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48    the performance of his patterns greatly. I could not use it as it stood, as it
49    was not thread safe, and made assumptions about pattern sizes. Also, it caused
50    test 7 to loop, and test 9 to crash with a segfault.
51    
52    The issue is the check for duplicate states, which is done by a simple linear
53    search up the state list. (Grep for "duplicate" below to find the code.) For
54    many patterns, there will never be many states active at one time, so a simple
55    linear search is fine. In patterns that have many active states, it might be a
56    bottleneck. The suggested code used an indexing scheme to remember which states
57    had previously been used for each character, and avoided the linear search when
58    it knew there was no chance of a duplicate. This was implemented when adding
59    states to the state lists.
60    
61    I wrote some thread-safe, not-limited code to try something similar at the time
62    of checking for duplicates (instead of when adding states), using index vectors
63    on the stack. It did give a 13% improvement with one specially constructed
64    pattern for certain subject strings, but on other strings and on many of the
65    simpler patterns in the test suite it did worse. The major problem, I think,
66    was the extra time to initialize the index. This had to be done for each call
67    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68    only once - I suspect this was the cause of the problems with the tests.)
69    
70    Overall, I concluded that the gains in some cases did not outweigh the losses
71    in others, so I abandoned this code. */
72    
73    
74    
75    #ifdef HAVE_CONFIG_H
76    #include "config.h"
77    #endif
78    
79  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
80  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
81  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 56  applications. */ Line 88  applications. */
88  #define SP "                   "  #define SP "                   "
89    
90    
   
91  /*************************************************  /*************************************************
92  *      Code parameters and static tables         *  *      Code parameters and static tables         *
93  *************************************************/  *************************************************/
94    
95  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
97  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
98    never stored, so we push them well clear of the normal opcodes. */
99    
100  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
101  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
102  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
103    #define OP_HSPACE_EXTRA     360
104    #define OP_VSPACE_EXTRA     380
105    
106    
107  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
108  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
109  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
110  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111  small value. */  small value. Non-zero values in the table are the offsets from the opcode where
112    the character is to be found. ***NOTE*** If the start of this table is
113    modified, the three tables that follow must also be modified. */
114    
115  static uschar coptable[] = {  static const pcre_uint8 coptable[] = {
116    0,                             /* End                                    */    0,                             /* End                                    */
117    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118    0, 0,                          /* Any, Anybyte                           */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0,                          /* \P, \p                                 */
121      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122      0,                             /* \X                                     */
123      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
124    1,                             /* Char                                   */    1,                             /* Char                                   */
125    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
126    1,                             /* not                                    */    1,                             /* not                                    */
127      1,                             /* noti                                   */
128    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
129    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1+IMM2_SIZE,                   /* exact                                  */
132      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135      1+IMM2_SIZE,                   /* exact I                                */
136      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
138    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1+IMM2_SIZE,                   /* NOT exact                              */
141      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144      1+IMM2_SIZE,                   /* NOT exact I                            */
145      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
147    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149    1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */    1+IMM2_SIZE,                   /* Type exact                             */
150      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
152    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 104  static uschar coptable[] = { Line 155  static uschar coptable[] = {
155    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
156    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
157    0,                             /* REF                                    */    0,                             /* REF                                    */
158      0,                             /* REFI                                   */
159    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
160    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
161    0,                             /* Alt                                    */    0,                             /* Alt                                    */
162    0,                             /* Ket                                    */    0,                             /* Ket                                    */
163    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
164    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
165      0,                             /* KetRpos                                */
166      0,                             /* Reverse                                */
167    0,                             /* Assert                                 */    0,                             /* Assert                                 */
168    0,                             /* Assert not                             */    0,                             /* Assert not                             */
169    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
170    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
171      0, 0,                          /* ONCE, ONCE_NC                          */
172      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
173      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
174      0, 0,                          /* CREF, NCREF                            */
175      0, 0,                          /* RREF, NRREF                            */
176      0,                             /* DEF                                    */
177      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
178      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
179      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
180      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
181      0, 0                           /* CLOSE, SKIPZERO                        */
182    };
183    
184    /* This table identifies those opcodes that inspect a character. It is used to
185    remember the fact that a character could have been inspected when the end of
186    the subject is reached. ***NOTE*** If the start of this table is modified, the
187    two tables that follow must also be modified. */
188    
189    static const pcre_uint8 poptable[] = {
190      0,                             /* End                                    */
191      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
192      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
193      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
194      1, 1,                          /* \P, \p                                 */
195      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
196      1,                             /* \X                                     */
197      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
198      1,                             /* Char                                   */
199      1,                             /* Chari                                  */
200      1,                             /* not                                    */
201      1,                             /* noti                                   */
202      /* Positive single-char repeats                                          */
203      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
204      1, 1, 1,                       /* upto, minupto, exact                   */
205      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
206      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
207      1, 1, 1,                       /* upto I, minupto I, exact I             */
208      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
209      /* Negative single-char repeats - only for chars < 256                   */
210      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
211      1, 1, 1,                       /* NOT upto, minupto, exact               */
212      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
213      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
214      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
215      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
216      /* Positive type repeats                                                 */
217      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
218      1, 1, 1,                       /* Type upto, minupto, exact              */
219      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
220      /* Character class & ref repeats                                         */
221      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
222      1, 1,                          /* CRRANGE, CRMINRANGE                    */
223      1,                             /* CLASS                                  */
224      1,                             /* NCLASS                                 */
225      1,                             /* XCLASS - variable length               */
226      0,                             /* REF                                    */
227      0,                             /* REFI                                   */
228      0,                             /* RECURSE                                */
229      0,                             /* CALLOUT                                */
230      0,                             /* Alt                                    */
231      0,                             /* Ket                                    */
232      0,                             /* KetRmax                                */
233      0,                             /* KetRmin                                */
234      0,                             /* KetRpos                                */
235    0,                             /* Reverse                                */    0,                             /* Reverse                                */
236    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0,                             /* Assert                                 */
237    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0,                             /* Assert not                             */
238    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
239    0,                             /* RREF                                   */    0,                             /* Assert behind not                      */
240      0, 0,                          /* ONCE, ONCE_NC                          */
241      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
242      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
243      0, 0,                          /* CREF, NCREF                            */
244      0, 0,                          /* RREF, NRREF                            */
245    0,                             /* DEF                                    */    0,                             /* DEF                                    */
246    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
247      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
248      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
249      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
250      0, 0                           /* CLOSE, SKIPZERO                        */
251  };  };
252    
253  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
254  and \w */  and \w */
255    
256  static uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
257    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
258    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
259    ctype_space, ctype_space,    ctype_space, ctype_space,
260    ctype_word,  ctype_word,    ctype_word,  ctype_word,
261    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
262  };  };
263    
264  static uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
265    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
266    ctype_digit, 0,    ctype_digit, 0,
267    ctype_space, 0,    ctype_space, 0,
268    ctype_word,  0,    ctype_word,  0,
269    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
270  };  };
271    
272    
# Line 151  these structures in, is a vector of ints Line 278  these structures in, is a vector of ints
278  typedef struct stateblock {  typedef struct stateblock {
279    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
280    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
281    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
282  } stateblock;  } stateblock;
283    
284  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
285    
286    
287  #ifdef DEBUG  #ifdef PCRE_DEBUG
288  /*************************************************  /*************************************************
289  *             Print character string             *  *             Print character string             *
290  *************************************************/  *************************************************/
# Line 174  Returns:       nothing Line 300  Returns:       nothing
300  */  */
301    
302  static void  static void
303  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
304  {  {
305  int c;  int c;
306  while (length-- > 0)  while (length-- > 0)
# Line 207  Arguments: Line 333  Arguments:
333    offsetcount       size of same    offsetcount       size of same
334    workspace         vector of workspace    workspace         vector of workspace
335    wscount           size of same    wscount           size of same
   ims               the current ims flags  
336    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
337    
338  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
339                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
340                       -1 => failed to match                       -1 => failed to match
341                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
342    
# Line 224  for the current character, one for the f Line 348  for the current character, one for the f
348      { \      { \
349      next_active_state->offset = (x); \      next_active_state->offset = (x); \
350      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
351      next_active_state++; \      next_active_state++; \
352      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
353      } \      } \
# Line 235  for the current character, one for the f Line 358  for the current character, one for the f
358      { \      { \
359      next_active_state->offset = (x); \      next_active_state->offset = (x); \
360      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
361      next_active_state->data   = (z); \      next_active_state->data   = (z); \
362      next_active_state++; \      next_active_state++; \
363      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 247  for the current character, one for the f Line 369  for the current character, one for the f
369      { \      { \
370      next_new_state->offset = (x); \      next_new_state->offset = (x); \
371      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
372      next_new_state++; \      next_new_state++; \
373      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
374      } \      } \
# Line 258  for the current character, one for the f Line 379  for the current character, one for the f
379      { \      { \
380      next_new_state->offset = (x); \      next_new_state->offset = (x); \
381      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
382      next_new_state->data   = (z); \      next_new_state->data   = (z); \
383      next_new_state++; \      next_new_state++; \
384      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
385          (x), (y), (z), __LINE__)); \
386      } \      } \
387    else return PCRE_ERROR_DFA_WSSIZE    else return PCRE_ERROR_DFA_WSSIZE
388    
# Line 270  for the current character, one for the f Line 391  for the current character, one for the f
391  static int  static int
392  internal_dfa_exec(  internal_dfa_exec(
393    dfa_match_data *md,    dfa_match_data *md,
394    const uschar *this_start_code,    const pcre_uchar *this_start_code,
395    const uschar *current_subject,    const pcre_uchar *current_subject,
396    int start_offset,    int start_offset,
397    int *offsets,    int *offsets,
398    int offsetcount,    int offsetcount,
399    int *workspace,    int *workspace,
400    int wscount,    int wscount,
401    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
402  {  {
403  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
404  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
405    
406  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
407  const uschar *ptr;  const pcre_uchar *ptr;
408  const uschar *end_code, *first_op;  const pcre_uchar *end_code, *first_op;
409    
410    dfa_recursion_info new_recursive;
411    
412  int active_count, new_count, match_count;  int active_count, new_count, match_count;
413    
414  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
415  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
416    
417  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
418  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
419  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
420    
421  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
422  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423  #else  #else
424  BOOL utf8 = FALSE;  BOOL utf = FALSE;
425  #endif  #endif
426    
427    BOOL reset_could_continue = FALSE;
428    
429  rlevel++;  rlevel++;
430  offsetcount &= (-2);  offsetcount &= (-2);
431    
# Line 311  wscount = (wscount - (wscount % (INTS_PE Line 434  wscount = (wscount - (wscount % (INTS_PE
434            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
435    
436  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
437    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
438    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439    
440  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
441  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 325  next_new_state = new_states = active_sta Line 448  next_new_state = new_states = active_sta
448  new_count = 0;  new_count = 0;
449    
450  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
451    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453        ? IMM2_SIZE:0);
454    
455  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 353  if (*first_op == OP_REVERSE) Line 478  if (*first_op == OP_REVERSE)
478    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
479    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
480    
481  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
482    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
483    
484    if (utf8)    if (utf)
485      {      {
486      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
487        {        {
488        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
489        current_subject--;        current_subject--;
490        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
491        }        }
492      }      }
493    else    else
# Line 374  if (*first_op == OP_REVERSE) Line 497  if (*first_op == OP_REVERSE)
497    
498      {      {
499      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
500        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
501      current_subject -= gone_back;      current_subject -= gone_back;
502      }      }
503    
504      /* Save the earliest consulted character */
505    
506      if (current_subject < md->start_used_ptr)
507        md->start_used_ptr = current_subject;
508    
509    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
510    
511    end_code = this_start_code;    end_code = this_start_code;
# Line 386  if (*first_op == OP_REVERSE) Line 514  if (*first_op == OP_REVERSE)
514      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
515      if (back <= gone_back)      if (back <= gone_back)
516        {        {
517        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
519        }        }
520      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 419  else Line 547  else
547    else    else
548      {      {
549      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
550        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552            ? IMM2_SIZE:0);
553      do      do
554        {        {
555        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
556        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
557        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
558        }        }
# Line 432  else Line 562  else
562    
563  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
564    
565  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
566    
567  /* Loop for scanning the subject */  /* Loop for scanning the subject */
568    
# Line 442  for (;;) Line 572  for (;;)
572    int i, j;    int i, j;
573    int clen, dlen;    int clen, dlen;
574    unsigned int c, d;    unsigned int c, d;
575      int forced_fail = 0;
576      BOOL partial_newline = FALSE;
577      BOOL could_continue = reset_could_continue;
578      reset_could_continue = FALSE;
579    
580    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
581    new state list. */    new state list. */
# Line 455  for (;;) Line 589  for (;;)
589    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
590    workspace[1] = active_count;    workspace[1] = active_count;
591    
592  #ifdef DEBUG  #ifdef PCRE_DEBUG
593    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
595    printf("\"\n");    printf("\"\n");
596    
597    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 477  for (;;) Line 611  for (;;)
611    
612    if (ptr < end_subject)    if (ptr < end_subject)
613      {      {
614      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of data items in the character */
615  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
616      if (utf8) { GETCHARLEN(c, ptr, clen); } else      if (utf) { GETCHARLEN(c, ptr, clen); } else
617  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
618      c = *ptr;      c = *ptr;
619      }      }
620    else    else
# Line 497  for (;;) Line 631  for (;;)
631    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
632      {      {
633      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
634      const uschar *code;      BOOL caseless = FALSE;
635        const pcre_uchar *code;
636      int state_offset = current_state->offset;      int state_offset = current_state->offset;
637      int count, codevalue;      int count, codevalue, rrc;
     int chartype, script;  
638    
639  #ifdef DEBUG  #ifdef PCRE_DEBUG
640      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
641      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
642        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
643          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
644  #endif  #endif
645    
     /* This variable is referred to implicitly in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
646      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
647      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
648      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
649        state, arrange for it to be passed on. */
650    
651      if (state_offset < 0)      if (state_offset < 0)
652        {        {
# Line 524  for (;;) Line 655  for (;;)
655          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
656          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
657            current_state->data - 1);            current_state->data - 1);
658            if (could_continue) reset_could_continue = TRUE;
659          continue;          continue;
660          }          }
661        else        else
# Line 532  for (;;) Line 664  for (;;)
664          }          }
665        }        }
666    
667      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
668        See the note at the head of this module about the possibility of improving
669        performance here. */
670    
671      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
672        {        {
# Line 549  for (;;) Line 683  for (;;)
683      code = start_code + state_offset;      code = start_code + state_offset;
684      codevalue = *code;      codevalue = *code;
685    
686        /* If this opcode inspects a character, but we are at the end of the
687        subject, remember the fact for use when testing for a partial match. */
688    
689        if (clen == 0 && poptable[codevalue] != 0)
690          could_continue = TRUE;
691    
692      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
693      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
694      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
695      permitted.      permitted.
696    
697      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
698      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long because
699      Unfortunately, we have to take special action to deal with  \P, \p, and      the values are small. We have to take special action to deal with  \P, \p,
700      \X in this case. To keep the other cases fast, convert these ones to new      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
701      opcodes. */      these ones to new opcodes. */
702    
703      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
704        {        {
705        dlen = 1;        dlen = 1;
706  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
707        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
708  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
709        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
710        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
711          {          {
# Line 576  for (;;) Line 716  for (;;)
716            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
717            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
718            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
719              case OP_NOT_HSPACE:
720              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
721              case OP_NOT_VSPACE:
722              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
723            default: break;            default: break;
724            }            }
725          }          }
# Line 591  for (;;) Line 735  for (;;)
735    
736      switch (codevalue)      switch (codevalue)
737        {        {
738    /* ========================================================================== */
739          /* These cases are never obeyed. This is a fudge that causes a compile-
740          time error if the vectors coptable or poptable, which are indexed by
741          opcode, are not the correct length. It seems to be the only way to do
742          such a check at compile time, as the sizeof() operator does not work
743          in the C preprocessor. */
744    
745          case OP_TABLE_LENGTH:
746          case OP_TABLE_LENGTH +
747            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
748             (sizeof(poptable) == OP_TABLE_LENGTH)):
749          break;
750    
751  /* ========================================================================== */  /* ========================================================================== */
752        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
753        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
754        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
755          subpattern, because the possessive subpattern repeats are always handled
756          using recursive calls. Thus, it never adds any new states.
757    
758          At the end of the (sub)pattern, unless we have an empty string and
759          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
760          start of the subject, save the match data, shifting up all previous
761        matches so we always have the longest first. */        matches so we always have the longest first. */
762    
763        case OP_KET:        case OP_KET:
764        case OP_KETRMIN:        case OP_KETRMIN:
765        case OP_KETRMAX:        case OP_KETRMAX:
766          case OP_KETRPOS:
767        if (code != end_code)        if (code != end_code)
768          {          {
769          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 609  for (;;) Line 772  for (;;)
772            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
773            }            }
774          }          }
775        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
776          {          {
777          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
778            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
779              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
780          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
781          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
782          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
783            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
784            offsets[0] = current_subject - start_subject;                match_count = 0;
785            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
786            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
787              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
788            }              {
789          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
790            {              offsets[1] = (int)(ptr - start_subject);
791            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
792              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], (char *)current_subject));
793              match_count, rlevel*2-2, SP));              }
794            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
795                {
796                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
797                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
798                  match_count, rlevel*2-2, SP));
799                return match_count;
800                }
801            }            }
802          }          }
803        break;        break;
# Line 640  for (;;) Line 809  for (;;)
809        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
810        case OP_ALT:        case OP_ALT:
811        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
812        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
813        break;        break;
814    
815        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 648  for (;;) Line 817  for (;;)
817        case OP_SBRA:        case OP_SBRA:
818        do        do
819          {          {
820          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
821          code += GET(code, 1);          code += GET(code, 1);
822          }          }
823        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 657  for (;;) Line 826  for (;;)
826        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
827        case OP_CBRA:        case OP_CBRA:
828        case OP_SCBRA:        case OP_SCBRA:
829        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
830        code += GET(code, 1);        code += GET(code, 1);
831        while (*code == OP_ALT)        while (*code == OP_ALT)
832          {          {
833          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
834          code += GET(code, 1);          code += GET(code, 1);
835          }          }
836        break;        break;
# Line 672  for (;;) Line 841  for (;;)
841        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
842        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
843        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
844        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
845          break;
846    
847          /*-----------------------------------------------------------------*/
848          case OP_SKIPZERO:
849          code += 1 + GET(code, 2);
850          while (*code == OP_ALT) code += GET(code, 1);
851          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
852        break;        break;
853    
854        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
855        case OP_CIRC:        case OP_CIRC:
856        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
857          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
858        break;        break;
859    
860        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
861        case OP_EOD:        case OP_CIRCM:
862        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
863              (ptr != end_subject && WAS_NEWLINE(ptr)))
864            { ADD_ACTIVE(state_offset + 1, 0); }
865        break;        break;
866    
867        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
868        case OP_OPT:        case OP_EOD:
869        ims = code[1];        if (ptr >= end_subject)
870        ADD_ACTIVE(state_offset + 2, 0);          {
871            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
872              could_continue = TRUE;
873            else { ADD_ACTIVE(state_offset + 1, 0); }
874            }
875        break;        break;
876    
877        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 714  for (;;) Line 893  for (;;)
893    
894        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
895        case OP_ANY:        case OP_ANY:
896        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
897            {
898            if (ptr + 1 >= md->end_subject &&
899                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
900                NLBLOCK->nltype == NLTYPE_FIXED &&
901                NLBLOCK->nllen == 2 &&
902                c == NLBLOCK->nl[0])
903              {
904              could_continue = partial_newline = TRUE;
905              }
906            else
907              {
908              ADD_NEW(state_offset + 1, 0);
909              }
910            }
911          break;
912    
913          /*-----------------------------------------------------------------*/
914          case OP_ALLANY:
915          if (clen > 0)
916          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
917        break;        break;
918    
919        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
920        case OP_EODN:        case OP_EODN:
921        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
922            could_continue = TRUE;
923          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
924          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
925        break;        break;
926    
# Line 728  for (;;) Line 928  for (;;)
928        case OP_DOLL:        case OP_DOLL:
929        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
930          {          {
931          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
932              (IS_NEWLINE(ptr) &&            could_continue = TRUE;
933                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)          else if (clen == 0 ||
934                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
935                   (ptr == end_subject - md->nllen)
936              ))              ))
937            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
938            else if (ptr + 1 >= md->end_subject &&
939                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
940                     NLBLOCK->nltype == NLTYPE_FIXED &&
941                     NLBLOCK->nllen == 2 &&
942                     c == NLBLOCK->nl[0])
943              {
944              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
945                {
946                reset_could_continue = TRUE;
947                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
948                }
949              else could_continue = partial_newline = TRUE;
950              }
951            }
952          break;
953    
954          /*-----------------------------------------------------------------*/
955          case OP_DOLLM:
956          if ((md->moptions & PCRE_NOTEOL) == 0)
957            {
958            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
959              could_continue = TRUE;
960            else if (clen == 0 ||
961                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
962              { ADD_ACTIVE(state_offset + 1, 0); }
963            else if (ptr + 1 >= md->end_subject &&
964                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
965                     NLBLOCK->nltype == NLTYPE_FIXED &&
966                     NLBLOCK->nllen == 2 &&
967                     c == NLBLOCK->nl[0])
968              {
969              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
970                {
971                reset_could_continue = TRUE;
972                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
973                }
974              else could_continue = partial_newline = TRUE;
975              }
976          }          }
977        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        else if (IS_NEWLINE(ptr))
978          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
979        break;        break;
980    
# Line 765  for (;;) Line 1005  for (;;)
1005    
1006          if (ptr > start_subject)          if (ptr > start_subject)
1007            {            {
1008            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
1009  #ifdef SUPPORT_UTF8            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1010            if (utf8) BACKCHAR(temp);  #ifdef SUPPORT_UTF
1011              if (utf) { BACKCHAR(temp); }
1012  #endif  #endif
1013            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
1014    #ifdef SUPPORT_UCP
1015              if ((md->poptions & PCRE_UCP) != 0)
1016                {
1017                if (d == '_') left_word = TRUE; else
1018                  {
1019                  int cat = UCD_CATEGORY(d);
1020                  left_word = (cat == ucp_L || cat == ucp_N);
1021                  }
1022                }
1023              else
1024    #endif
1025            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1026            }            }
1027          else left_word = 0;          else left_word = FALSE;
1028    
1029          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
1030            else right_word = 0;            {
1031    #ifdef SUPPORT_UCP
1032              if ((md->poptions & PCRE_UCP) != 0)
1033                {
1034                if (c == '_') right_word = TRUE; else
1035                  {
1036                  int cat = UCD_CATEGORY(c);
1037                  right_word = (cat == ucp_L || cat == ucp_N);
1038                  }
1039                }
1040              else
1041    #endif
1042              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1043              }
1044            else right_word = FALSE;
1045    
1046          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1047            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 783  for (;;) Line 1049  for (;;)
1049        break;        break;
1050    
1051    
 #ifdef SUPPORT_UCP  
   
1052        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1053        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
1054        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
1055        */        */
1056    
1057    #ifdef SUPPORT_UCP
1058        case OP_PROP:        case OP_PROP:
1059        case OP_NOTPROP:        case OP_NOTPROP:
1060        if (clen > 0)        if (clen > 0)
1061          {          {
1062          BOOL OK;          BOOL OK;
1063          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1064          switch(code[1])          switch(code[1])
1065            {            {
1066            case PT_ANY:            case PT_ANY:
# Line 803  for (;;) Line 1068  for (;;)
1068            break;            break;
1069    
1070            case PT_LAMP:            case PT_LAMP:
1071            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1072                   prop->chartype == ucp_Lt;
1073            break;            break;
1074    
1075            case PT_GC:            case PT_GC:
1076            OK = category == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1077            break;            break;
1078    
1079            case PT_PC:            case PT_PC:
1080            OK = chartype == code[2];            OK = prop->chartype == code[2];
1081            break;            break;
1082    
1083            case PT_SC:            case PT_SC:
1084            OK = script == code[2];            OK = prop->script == code[2];
1085              break;
1086    
1087              /* These are specials for combination cases. */
1088    
1089              case PT_ALNUM:
1090              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1091                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1092              break;
1093    
1094              case PT_SPACE:    /* Perl space */
1095              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1096                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1097              break;
1098    
1099              case PT_PXSPACE:  /* POSIX space */
1100              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1101                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1102                   c == CHAR_FF || c == CHAR_CR;
1103              break;
1104    
1105              case PT_WORD:
1106              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1107                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1108                   c == CHAR_UNDERSCORE;
1109            break;            break;
1110    
1111            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 835  for (;;) Line 1125  for (;;)
1125  /* ========================================================================== */  /* ========================================================================== */
1126        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
1127        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
1128        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1129        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1130    
1131        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1132        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 845  for (;;) Line 1135  for (;;)
1135        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1136        if (clen > 0)        if (clen > 0)
1137          {          {
1138          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1139                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1140                NLBLOCK->nltype == NLTYPE_FIXED &&
1141                NLBLOCK->nllen == 2 &&
1142                c == NLBLOCK->nl[0])
1143              {
1144              could_continue = partial_newline = TRUE;
1145              }
1146            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1147              (c < 256 &&              (c < 256 &&
1148                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1149                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1150            {            {
1151            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 871  for (;;) Line 1166  for (;;)
1166        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1167        if (clen > 0)        if (clen > 0)
1168          {          {
1169          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1170                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1171                NLBLOCK->nltype == NLTYPE_FIXED &&
1172                NLBLOCK->nllen == 2 &&
1173                c == NLBLOCK->nl[0])
1174              {
1175              could_continue = partial_newline = TRUE;
1176              }
1177            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1178              (c < 256 &&              (c < 256 &&
1179                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1180                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1181            {            {
1182            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 896  for (;;) Line 1196  for (;;)
1196        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1197        if (clen > 0)        if (clen > 0)
1198          {          {
1199          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1200                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1201                NLBLOCK->nltype == NLTYPE_FIXED &&
1202                NLBLOCK->nllen == 2 &&
1203                c == NLBLOCK->nl[0])
1204              {
1205              could_continue = partial_newline = TRUE;
1206              }
1207            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208              (c < 256 &&              (c < 256 &&
1209                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1210                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211            {            {
1212            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 919  for (;;) Line 1224  for (;;)
1224        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1225        if (clen > 0)        if (clen > 0)
1226          {          {
1227          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1228                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1229                NLBLOCK->nltype == NLTYPE_FIXED &&
1230                NLBLOCK->nllen == 2 &&
1231                c == NLBLOCK->nl[0])
1232              {
1233              could_continue = partial_newline = TRUE;
1234              }
1235            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1236              (c < 256 &&              (c < 256 &&
1237                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1238                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1239            {            {
1240            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1241              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1242            else            else
1243              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1244            }            }
# Line 939  for (;;) Line 1249  for (;;)
1249        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1250        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1251        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1252        ADD_ACTIVE(state_offset + 4, 0);        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1253        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1254        if (clen > 0)        if (clen > 0)
1255          {          {
1256          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1257                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1258                NLBLOCK->nltype == NLTYPE_FIXED &&
1259                NLBLOCK->nllen == 2 &&
1260                c == NLBLOCK->nl[0])
1261              {
1262              could_continue = partial_newline = TRUE;
1263              }
1264            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265              (c < 256 &&              (c < 256 &&
1266                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1267                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268            {            {
1269            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 957  for (;;) Line 1272  for (;;)
1272              next_active_state--;              next_active_state--;
1273              }              }
1274            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1275              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1276            else            else
1277              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1278            }            }
# Line 970  for (;;) Line 1285  for (;;)
1285        argument. It keeps the code above fast for the other cases. The argument        argument. It keeps the code above fast for the other cases. The argument
1286        is in the d variable. */        is in the d variable. */
1287    
1288    #ifdef SUPPORT_UCP
1289        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
1290        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1291        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
# Line 978  for (;;) Line 1294  for (;;)
1294        if (clen > 0)        if (clen > 0)
1295          {          {
1296          BOOL OK;          BOOL OK;
1297          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1298          switch(code[2])          switch(code[2])
1299            {            {
1300            case PT_ANY:            case PT_ANY:
# Line 986  for (;;) Line 1302  for (;;)
1302            break;            break;
1303    
1304            case PT_LAMP:            case PT_LAMP:
1305            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1306                prop->chartype == ucp_Lt;
1307            break;            break;
1308    
1309            case PT_GC:            case PT_GC:
1310            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1311            break;            break;
1312    
1313            case PT_PC:            case PT_PC:
1314            OK = chartype == code[3];            OK = prop->chartype == code[3];
1315            break;            break;
1316    
1317            case PT_SC:            case PT_SC:
1318            OK = script == code[3];            OK = prop->script == code[3];
1319              break;
1320    
1321              /* These are specials for combination cases. */
1322    
1323              case PT_ALNUM:
1324              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1325                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1326              break;
1327    
1328              case PT_SPACE:    /* Perl space */
1329              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1330                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1331              break;
1332    
1333              case PT_PXSPACE:  /* POSIX space */
1334              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1335                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1336                   c == CHAR_FF || c == CHAR_CR;
1337              break;
1338    
1339              case PT_WORD:
1340              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1341                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1342                   c == CHAR_UNDERSCORE;
1343            break;            break;
1344    
1345            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1027  for (;;) Line 1368  for (;;)
1368        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1369        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1370        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1371        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1372          {          {
1373          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1374          int ncount = 0;          int ncount = 0;
1375          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1376            {            {
# Line 1041  for (;;) Line 1382  for (;;)
1382            int nd;            int nd;
1383            int ndlen = 1;            int ndlen = 1;
1384            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1385            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1386            ncount++;            ncount++;
1387            nptr += ndlen;            nptr += ndlen;
1388            }            }
# Line 1049  for (;;) Line 1390  for (;;)
1390          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1391          }          }
1392        break;        break;
1393    #endif
1394    
1395        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1396        case OP_ANYNL_EXTRA + OP_TYPEPLUS:        case OP_ANYNL_EXTRA + OP_TYPEPLUS:
# Line 1061  for (;;) Line 1403  for (;;)
1403          int ncount = 0;          int ncount = 0;
1404          switch (c)          switch (c)
1405            {            {
1406              case 0x000b:
1407              case 0x000c:
1408              case 0x0085:
1409              case 0x2028:
1410              case 0x2029:
1411              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1412              goto ANYNL01;
1413    
1414            case 0x000d:            case 0x000d:
1415            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1416            /* Fall through */            /* Fall through */
1417    
1418              ANYNL01:
1419              case 0x000a:
1420              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1421                {
1422                active_count--;           /* Remove non-match possibility */
1423                next_active_state--;
1424                }
1425              count++;
1426              ADD_NEW_DATA(-state_offset, count, ncount);
1427              break;
1428    
1429              default:
1430              break;
1431              }
1432            }
1433          break;
1434    
1435          /*-----------------------------------------------------------------*/
1436          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1437          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1438          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1439          count = current_state->count;  /* Already matched */
1440          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1441          if (clen > 0)
1442            {
1443            BOOL OK;
1444            switch (c)
1445              {
1446            case 0x000a:            case 0x000a:
1447            case 0x000b:            case 0x000b:
1448            case 0x000c:            case 0x000c:
1449              case 0x000d:
1450            case 0x0085:            case 0x0085:
1451            case 0x2028:            case 0x2028:
1452            case 0x2029:            case 0x2029:
1453            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            OK = TRUE;
1454              break;
1455    
1456              default:
1457              OK = FALSE;
1458              break;
1459              }
1460    
1461            if (OK == (d == OP_VSPACE))
1462              {
1463              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1464              {              {
1465              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1466              next_active_state--;              next_active_state--;
1467              }              }
1468            count++;            count++;
1469            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, 0);
1470              }
1471            }
1472          break;
1473    
1474          /*-----------------------------------------------------------------*/
1475          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1476          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1477          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1478          count = current_state->count;  /* Already matched */
1479          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480          if (clen > 0)
1481            {
1482            BOOL OK;
1483            switch (c)
1484              {
1485              case 0x09:      /* HT */
1486              case 0x20:      /* SPACE */
1487              case 0xa0:      /* NBSP */
1488              case 0x1680:    /* OGHAM SPACE MARK */
1489              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1490              case 0x2000:    /* EN QUAD */
1491              case 0x2001:    /* EM QUAD */
1492              case 0x2002:    /* EN SPACE */
1493              case 0x2003:    /* EM SPACE */
1494              case 0x2004:    /* THREE-PER-EM SPACE */
1495              case 0x2005:    /* FOUR-PER-EM SPACE */
1496              case 0x2006:    /* SIX-PER-EM SPACE */
1497              case 0x2007:    /* FIGURE SPACE */
1498              case 0x2008:    /* PUNCTUATION SPACE */
1499              case 0x2009:    /* THIN SPACE */
1500              case 0x200A:    /* HAIR SPACE */
1501              case 0x202f:    /* NARROW NO-BREAK SPACE */
1502              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1503              case 0x3000:    /* IDEOGRAPHIC SPACE */
1504              OK = TRUE;
1505            break;            break;
1506    
1507            default:            default:
1508              OK = FALSE;
1509            break;            break;
1510            }            }
1511    
1512            if (OK == (d == OP_HSPACE))
1513              {
1514              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1515                {
1516                active_count--;           /* Remove non-match possibility */
1517                next_active_state--;
1518                }
1519              count++;
1520              ADD_NEW_DATA(-state_offset, count, 0);
1521              }
1522          }          }
1523        break;        break;
1524    
1525        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1526    #ifdef SUPPORT_UCP
1527        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
1528        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1529        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
# Line 1102  for (;;) Line 1541  for (;;)
1541        if (clen > 0)        if (clen > 0)
1542          {          {
1543          BOOL OK;          BOOL OK;
1544          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1545          switch(code[2])          switch(code[2])
1546            {            {
1547            case PT_ANY:            case PT_ANY:
# Line 1110  for (;;) Line 1549  for (;;)
1549            break;            break;
1550    
1551            case PT_LAMP:            case PT_LAMP:
1552            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1553                prop->chartype == ucp_Lt;
1554            break;            break;
1555    
1556            case PT_GC:            case PT_GC:
1557            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1558            break;            break;
1559    
1560            case PT_PC:            case PT_PC:
1561            OK = chartype == code[3];            OK = prop->chartype == code[3];
1562            break;            break;
1563    
1564            case PT_SC:            case PT_SC:
1565            OK = script == code[3];            OK = prop->script == code[3];
1566              break;
1567    
1568              /* These are specials for combination cases. */
1569    
1570              case PT_ALNUM:
1571              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1572                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1573              break;
1574    
1575              case PT_SPACE:    /* Perl space */
1576              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1577                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1578              break;
1579    
1580              case PT_PXSPACE:  /* POSIX space */
1581              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1582                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1583                   c == CHAR_FF || c == CHAR_CR;
1584              break;
1585    
1586              case PT_WORD:
1587              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1588                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1589                   c == CHAR_UNDERSCORE;
1590            break;            break;
1591    
1592            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1160  for (;;) Line 1624  for (;;)
1624        QS2:        QS2:
1625    
1626        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1627        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1628          {          {
1629          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1630          int ncount = 0;          int ncount = 0;
1631          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1632              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
# Line 1175  for (;;) Line 1639  for (;;)
1639            int nd;            int nd;
1640            int ndlen = 1;            int ndlen = 1;
1641            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1642            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1643            ncount++;            ncount++;
1644            nptr += ndlen;            nptr += ndlen;
1645            }            }
1646          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1647          }          }
1648        break;        break;
1649    #endif
1650    
1651        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1652        case OP_ANYNL_EXTRA + OP_TYPEQUERY:        case OP_ANYNL_EXTRA + OP_TYPEQUERY:
# Line 1202  for (;;) Line 1667  for (;;)
1667          int ncount = 0;          int ncount = 0;
1668          switch (c)          switch (c)
1669            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1670            case 0x000b:            case 0x000b:
1671            case 0x000c:            case 0x000c:
1672            case 0x0085:            case 0x0085:
1673            case 0x2028:            case 0x2028:
1674            case 0x2029:            case 0x2029:
1675              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1676              goto ANYNL02;
1677    
1678              case 0x000d:
1679              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1680              /* Fall through */
1681    
1682              ANYNL02:
1683              case 0x000a:
1684            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1685                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1686              {              {
# Line 1219  for (;;) Line 1689  for (;;)
1689              }              }
1690            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1691            break;            break;
1692    
1693            default:            default:
1694            break;            break;
1695            }            }
# Line 1226  for (;;) Line 1697  for (;;)
1697        break;        break;
1698    
1699        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1700        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1701        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1702        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1703        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:        count = 2;
1704        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        goto QS4;
1705          { ADD_ACTIVE(state_offset + 6, 0); }  
1706        count = current_state->count;  /* Number already matched */        case OP_VSPACE_EXTRA + OP_TYPESTAR:
1707          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1708          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1709          count = 0;
1710    
1711          QS4:
1712          ADD_ACTIVE(state_offset + 2, 0);
1713        if (clen > 0)        if (clen > 0)
1714          {          {
1715          BOOL OK;          BOOL OK;
1716          int category = _pcre_ucp_findprop(c, &chartype, &script);          switch (c)
         switch(code[4])  
1717            {            {
1718            case PT_ANY:            case 0x000a:
1719              case 0x000b:
1720              case 0x000c:
1721              case 0x000d:
1722              case 0x0085:
1723              case 0x2028:
1724              case 0x2029:
1725            OK = TRUE;            OK = TRUE;
1726            break;            break;
1727    
1728            case PT_LAMP:            default:
1729            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = FALSE;
1730            break;            break;
1731              }
1732            if (OK == (d == OP_VSPACE))
1733              {
1734              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1735                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1736                {
1737                active_count--;           /* Remove non-match possibility */
1738                next_active_state--;
1739                }
1740              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1741              }
1742            }
1743          break;
1744    
1745            case PT_GC:        /*-----------------------------------------------------------------*/
1746            OK = category == code[5];        case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1747            break;        case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1748          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1749          count = 2;
1750          goto QS5;
1751    
1752          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1753          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1754          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1755          count = 0;
1756    
1757          QS5:
1758          ADD_ACTIVE(state_offset + 2, 0);
1759          if (clen > 0)
1760            {
1761            BOOL OK;
1762            switch (c)
1763              {
1764              case 0x09:      /* HT */
1765              case 0x20:      /* SPACE */
1766              case 0xa0:      /* NBSP */
1767              case 0x1680:    /* OGHAM SPACE MARK */
1768              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1769              case 0x2000:    /* EN QUAD */
1770              case 0x2001:    /* EM QUAD */
1771              case 0x2002:    /* EN SPACE */
1772              case 0x2003:    /* EM SPACE */
1773              case 0x2004:    /* THREE-PER-EM SPACE */
1774              case 0x2005:    /* FOUR-PER-EM SPACE */
1775              case 0x2006:    /* SIX-PER-EM SPACE */
1776              case 0x2007:    /* FIGURE SPACE */
1777              case 0x2008:    /* PUNCTUATION SPACE */
1778              case 0x2009:    /* THIN SPACE */
1779              case 0x200A:    /* HAIR SPACE */
1780              case 0x202f:    /* NARROW NO-BREAK SPACE */
1781              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1782              case 0x3000:    /* IDEOGRAPHIC SPACE */
1783              OK = TRUE;
1784              break;
1785    
1786              default:
1787              OK = FALSE;
1788              break;
1789              }
1790    
1791            if (OK == (d == OP_HSPACE))
1792              {
1793              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1794                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1795                {
1796                active_count--;           /* Remove non-match possibility */
1797                next_active_state--;
1798                }
1799              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1800              }
1801            }
1802          break;
1803    
1804          /*-----------------------------------------------------------------*/
1805    #ifdef SUPPORT_UCP
1806          case OP_PROP_EXTRA + OP_TYPEEXACT:
1807          case OP_PROP_EXTRA + OP_TYPEUPTO:
1808          case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1809          case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1810          if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1811            { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1812          count = current_state->count;  /* Number already matched */
1813          if (clen > 0)
1814            {
1815            BOOL OK;
1816            const ucd_record * prop = GET_UCD(c);
1817            switch(code[1 + IMM2_SIZE + 1])
1818              {
1819              case PT_ANY:
1820              OK = TRUE;
1821              break;
1822    
1823              case PT_LAMP:
1824              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1825                prop->chartype == ucp_Lt;
1826              break;
1827    
1828              case PT_GC:
1829              OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1830              break;
1831    
1832            case PT_PC:            case PT_PC:
1833            OK = chartype == code[5];            OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1834            break;            break;
1835    
1836            case PT_SC:            case PT_SC:
1837            OK = script == code[5];            OK = prop->script == code[1 + IMM2_SIZE + 2];
1838              break;
1839    
1840              /* These are specials for combination cases. */
1841    
1842              case PT_ALNUM:
1843              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1844                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1845              break;
1846    
1847              case PT_SPACE:    /* Perl space */
1848              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1849                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1850              break;
1851    
1852              case PT_PXSPACE:  /* POSIX space */
1853              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1854                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1855                   c == CHAR_FF || c == CHAR_CR;
1856              break;
1857    
1858              case PT_WORD:
1859              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1860                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1861                   c == CHAR_UNDERSCORE;
1862            break;            break;
1863    
1864            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1274  for (;;) Line 1876  for (;;)
1876              next_active_state--;              next_active_state--;
1877              }              }
1878            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1879              { ADD_NEW(state_offset + 6, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1880            else            else
1881              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1882            }            }
# Line 1287  for (;;) Line 1889  for (;;)
1889        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1890        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1891        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1892          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1893        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1894        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1895          {          {
1896          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1897          int ncount = 0;          int ncount = 0;
1898          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1899            {            {
# Line 1303  for (;;) Line 1905  for (;;)
1905            int nd;            int nd;
1906            int ndlen = 1;            int ndlen = 1;
1907            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1908            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1909            ncount++;            ncount++;
1910            nptr += ndlen;            nptr += ndlen;
1911            }            }
1912            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1913                reset_could_continue = TRUE;
1914          if (++count >= GET2(code, 1))          if (++count >= GET2(code, 1))
1915            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1916          else          else
1917            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1918          }          }
1919        break;        break;
1920    #endif
1921    
1922        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1923        case OP_ANYNL_EXTRA + OP_TYPEEXACT:        case OP_ANYNL_EXTRA + OP_TYPEEXACT:
# Line 1320  for (;;) Line 1925  for (;;)
1925        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1926        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1927        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1928          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1929        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1930        if (clen > 0)        if (clen > 0)
1931          {          {
1932          int ncount = 0;          int ncount = 0;
1933          switch (c)          switch (c)
1934            {            {
1935              case 0x000b:
1936              case 0x000c:
1937              case 0x0085:
1938              case 0x2028:
1939              case 0x2029:
1940              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1941              goto ANYNL03;
1942    
1943            case 0x000d:            case 0x000d:
1944            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1945            /* Fall through */            /* Fall through */
1946    
1947              ANYNL03:
1948              case 0x000a:
1949              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1950                {
1951                active_count--;           /* Remove non-match possibility */
1952                next_active_state--;
1953                }
1954              if (++count >= GET2(code, 1))
1955                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1956              else
1957                { ADD_NEW_DATA(-state_offset, count, ncount); }
1958              break;
1959    
1960              default:
1961              break;
1962              }
1963            }
1964          break;
1965    
1966          /*-----------------------------------------------------------------*/
1967          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1968          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1969          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1970          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1971          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1972            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1973          count = current_state->count;  /* Number already matched */
1974          if (clen > 0)
1975            {
1976            BOOL OK;
1977            switch (c)
1978              {
1979            case 0x000a:            case 0x000a:
1980            case 0x000b:            case 0x000b:
1981            case 0x000c:            case 0x000c:
1982              case 0x000d:
1983            case 0x0085:            case 0x0085:
1984            case 0x2028:            case 0x2028:
1985            case 0x2029:            case 0x2029:
1986            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            OK = TRUE;
1987              break;
1988    
1989              default:
1990              OK = FALSE;
1991              }
1992    
1993            if (OK == (d == OP_VSPACE))
1994              {
1995              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1996              {              {
1997              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1998              next_active_state--;              next_active_state--;
1999              }              }
2000            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2001              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2002            else            else
2003              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, 0); }
2004              }
2005            }
2006          break;
2007    
2008          /*-----------------------------------------------------------------*/
2009          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2010          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2011          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2012          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2013          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2014            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2015          count = current_state->count;  /* Number already matched */
2016          if (clen > 0)
2017            {
2018            BOOL OK;
2019            switch (c)
2020              {
2021              case 0x09:      /* HT */
2022              case 0x20:      /* SPACE */
2023              case 0xa0:      /* NBSP */
2024              case 0x1680:    /* OGHAM SPACE MARK */
2025              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2026              case 0x2000:    /* EN QUAD */
2027              case 0x2001:    /* EM QUAD */
2028              case 0x2002:    /* EN SPACE */
2029              case 0x2003:    /* EM SPACE */
2030              case 0x2004:    /* THREE-PER-EM SPACE */
2031              case 0x2005:    /* FOUR-PER-EM SPACE */
2032              case 0x2006:    /* SIX-PER-EM SPACE */
2033              case 0x2007:    /* FIGURE SPACE */
2034              case 0x2008:    /* PUNCTUATION SPACE */
2035              case 0x2009:    /* THIN SPACE */
2036              case 0x200A:    /* HAIR SPACE */
2037              case 0x202f:    /* NARROW NO-BREAK SPACE */
2038              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2039              case 0x3000:    /* IDEOGRAPHIC SPACE */
2040              OK = TRUE;
2041            break;            break;
2042    
2043            default:            default:
2044              OK = FALSE;
2045            break;            break;
2046            }            }
2047    
2048            if (OK == (d == OP_HSPACE))
2049              {
2050              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2051                {
2052                active_count--;           /* Remove non-match possibility */
2053                next_active_state--;
2054                }
2055              if (++count >= GET2(code, 1))
2056                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2057              else
2058                { ADD_NEW_DATA(-state_offset, count, 0); }
2059              }
2060          }          }
2061        break;        break;
2062    
# Line 1364  for (;;) Line 2072  for (;;)
2072        break;        break;
2073    
2074        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2075        case OP_CHARNC:        case OP_CHARI:
2076        if (clen == 0) break;        if (clen == 0) break;
2077    
2078  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2079        if (utf8)        if (utf)
2080          {          {
2081          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2082            {            {
2083            unsigned int othercase;            unsigned int othercase;
2084            if (c < 128) othercase = fcc[c]; else            if (c < 128)
2085                othercase = fcc[c];
2086            /* If we have Unicode property support, we can use it to test the            else
2087            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2088                other case of the character. */
2089  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2090            othercase = _pcre_ucp_othercase(c);              othercase = UCD_OTHERCASE(c);
2091  #else  #else
2092            othercase = NOTACHAR;              othercase = NOTACHAR;
2093  #endif  #endif
2094    
2095            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2096            }            }
2097          }          }
2098        else        else
2099  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2100          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2101          {          {
2102          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2103              { ADD_NEW(state_offset + 2, 0); }
2104          }          }
2105        break;        break;
2106    
# Line 1404  for (;;) Line 2112  for (;;)
2112        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2113    
2114        case OP_EXTUNI:        case OP_EXTUNI:
2115        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2116          {          {
2117          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
2118          int ncount = 0;          int ncount = 0;
2119          while (nptr < end_subject)          while (nptr < end_subject)
2120            {            {
2121            int nclen = 1;            int nclen = 1;
2122            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
2123            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
2124            ncount++;            ncount++;
2125            nptr += nclen;            nptr += nclen;
2126            }            }
2127            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2128                reset_could_continue = TRUE;
2129          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2130          }          }
2131        break;        break;
# Line 1429  for (;;) Line 2139  for (;;)
2139        case OP_ANYNL:        case OP_ANYNL:
2140        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2141          {          {
         case 0x000a:  
2142          case 0x000b:          case 0x000b:
2143          case 0x000c:          case 0x000c:
2144          case 0x0085:          case 0x0085:
2145          case 0x2028:          case 0x2028:
2146          case 0x2029:          case 0x2029:
2147            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2148    
2149            case 0x000a:
2150          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2151          break;          break;
2152    
2153          case 0x000d:          case 0x000d:
2154          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 >= end_subject)
2155              {
2156              ADD_NEW(state_offset + 1, 0);
2157              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2158                reset_could_continue = TRUE;
2159              }
2160            else if (ptr[1] == 0x0a)
2161            {            {
2162            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2163            }            }
# Line 1451  for (;;) Line 2170  for (;;)
2170        break;        break;
2171    
2172        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2173        /* Match a negated single character. This is only used for one-byte        case OP_NOT_VSPACE:
2174        characters, that is, we know that d < 256. The character we are        if (clen > 0) switch(c)
2175        checking (c) can be multibyte. */          {
2176            case 0x000a:
2177            case 0x000b:
2178            case 0x000c:
2179            case 0x000d:
2180            case 0x0085:
2181            case 0x2028:
2182            case 0x2029:
2183            break;
2184    
2185            default:
2186            ADD_NEW(state_offset + 1, 0);
2187            break;
2188            }
2189          break;
2190    
2191          /*-----------------------------------------------------------------*/
2192          case OP_VSPACE:
2193          if (clen > 0) switch(c)
2194            {
2195            case 0x000a:
2196            case 0x000b:
2197            case 0x000c:
2198            case 0x000d:
2199            case 0x0085:
2200            case 0x2028:
2201            case 0x2029:
2202            ADD_NEW(state_offset + 1, 0);
2203            break;
2204    
2205            default: break;
2206            }
2207          break;
2208    
2209          /*-----------------------------------------------------------------*/
2210          case OP_NOT_HSPACE:
2211          if (clen > 0) switch(c)
2212            {
2213            case 0x09:      /* HT */
2214            case 0x20:      /* SPACE */
2215            case 0xa0:      /* NBSP */
2216            case 0x1680:    /* OGHAM SPACE MARK */
2217            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2218            case 0x2000:    /* EN QUAD */
2219            case 0x2001:    /* EM QUAD */
2220            case 0x2002:    /* EN SPACE */
2221            case 0x2003:    /* EM SPACE */
2222            case 0x2004:    /* THREE-PER-EM SPACE */
2223            case 0x2005:    /* FOUR-PER-EM SPACE */
2224            case 0x2006:    /* SIX-PER-EM SPACE */
2225            case 0x2007:    /* FIGURE SPACE */
2226            case 0x2008:    /* PUNCTUATION SPACE */
2227            case 0x2009:    /* THIN SPACE */
2228            case 0x200A:    /* HAIR SPACE */
2229            case 0x202f:    /* NARROW NO-BREAK SPACE */
2230            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2231            case 0x3000:    /* IDEOGRAPHIC SPACE */
2232            break;
2233    
2234            default:
2235            ADD_NEW(state_offset + 1, 0);
2236            break;
2237            }
2238          break;
2239    
2240          /*-----------------------------------------------------------------*/
2241          case OP_HSPACE:
2242          if (clen > 0) switch(c)
2243            {
2244            case 0x09:      /* HT */
2245            case 0x20:      /* SPACE */
2246            case 0xa0:      /* NBSP */
2247            case 0x1680:    /* OGHAM SPACE MARK */
2248            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2249            case 0x2000:    /* EN QUAD */
2250            case 0x2001:    /* EM QUAD */
2251            case 0x2002:    /* EN SPACE */
2252            case 0x2003:    /* EM SPACE */
2253            case 0x2004:    /* THREE-PER-EM SPACE */
2254            case 0x2005:    /* FOUR-PER-EM SPACE */
2255            case 0x2006:    /* SIX-PER-EM SPACE */
2256            case 0x2007:    /* FIGURE SPACE */
2257            case 0x2008:    /* PUNCTUATION SPACE */
2258            case 0x2009:    /* THIN SPACE */
2259            case 0x200A:    /* HAIR SPACE */
2260            case 0x202f:    /* NARROW NO-BREAK SPACE */
2261            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2262            case 0x3000:    /* IDEOGRAPHIC SPACE */
2263            ADD_NEW(state_offset + 1, 0);
2264            break;
2265            }
2266          break;
2267    
2268          /*-----------------------------------------------------------------*/
2269          /* Match a negated single character casefully. */
2270    
2271        case OP_NOT:        case OP_NOT:
2272          if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2273          break;
2274    
2275          /*-----------------------------------------------------------------*/
2276          /* Match a negated single character caselessly. */
2277    
2278          case OP_NOTI:
2279        if (clen > 0)        if (clen > 0)
2280          {          {
2281          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;          unsigned int otherd;
2282          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }  #ifdef SUPPORT_UTF
2283            if (utf && d >= 128)
2284              {
2285    #ifdef SUPPORT_UCP
2286              otherd = UCD_OTHERCASE(d);
2287    #endif  /* SUPPORT_UCP */
2288              }
2289            else
2290    #endif  /* SUPPORT_UTF */
2291            otherd = TABLE_GET(d, fcc, d);
2292            if (c != d && c != otherd)
2293              { ADD_NEW(state_offset + dlen + 1, 0); }
2294          }          }
2295        break;        break;
2296    
2297        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2298          case OP_PLUSI:
2299          case OP_MINPLUSI:
2300          case OP_POSPLUSI:
2301          case OP_NOTPLUSI:
2302          case OP_NOTMINPLUSI:
2303          case OP_NOTPOSPLUSI:
2304          caseless = TRUE;
2305          codevalue -= OP_STARI - OP_STAR;
2306    
2307          /* Fall through */
2308        case OP_PLUS:        case OP_PLUS:
2309        case OP_MINPLUS:        case OP_MINPLUS:
2310        case OP_POSPLUS:        case OP_POSPLUS:
# Line 1475  for (;;) Line 2316  for (;;)
2316        if (clen > 0)        if (clen > 0)
2317          {          {
2318          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2319          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2320            {            {
2321  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2322            if (utf8 && d >= 128)            if (utf && d >= 128)
2323              {              {
2324  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2325              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2326  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2327              }              }
2328            else            else
2329  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2330            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2331            }            }
2332          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2333            {            {
# Line 1503  for (;;) Line 2344  for (;;)
2344        break;        break;
2345    
2346        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2347          case OP_QUERYI:
2348          case OP_MINQUERYI:
2349          case OP_POSQUERYI:
2350          case OP_NOTQUERYI:
2351          case OP_NOTMINQUERYI:
2352          case OP_NOTPOSQUERYI:
2353          caseless = TRUE;
2354          codevalue -= OP_STARI - OP_STAR;
2355          /* Fall through */
2356        case OP_QUERY:        case OP_QUERY:
2357        case OP_MINQUERY:        case OP_MINQUERY:
2358        case OP_POSQUERY:        case OP_POSQUERY:
# Line 1513  for (;;) Line 2363  for (;;)
2363        if (clen > 0)        if (clen > 0)
2364          {          {
2365          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2366          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2367            {            {
2368  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2369            if (utf8 && d >= 128)            if (utf && d >= 128)
2370              {              {
2371  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2372              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2373  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2374              }              }
2375            else            else
2376  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2377            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2378            }            }
2379          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2380            {            {
# Line 1539  for (;;) Line 2389  for (;;)
2389        break;        break;
2390    
2391        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2392          case OP_STARI:
2393          case OP_MINSTARI:
2394          case OP_POSSTARI:
2395          case OP_NOTSTARI:
2396          case OP_NOTMINSTARI:
2397          case OP_NOTPOSSTARI:
2398          caseless = TRUE;
2399          codevalue -= OP_STARI - OP_STAR;
2400          /* Fall through */
2401        case OP_STAR:        case OP_STAR:
2402        case OP_MINSTAR:        case OP_MINSTAR:
2403        case OP_POSSTAR:        case OP_POSSTAR:
# Line 1549  for (;;) Line 2408  for (;;)
2408        if (clen > 0)        if (clen > 0)
2409          {          {
2410          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2411          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2412            {            {
2413  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2414            if (utf8 && d >= 128)            if (utf && d >= 128)
2415              {              {
2416  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2417              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2418  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2419              }              }
2420            else            else
2421  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2422            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2423            }            }
2424          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2425            {            {
# Line 1575  for (;;) Line 2434  for (;;)
2434        break;        break;
2435    
2436        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2437          case OP_EXACTI:
2438          case OP_NOTEXACTI:
2439          caseless = TRUE;
2440          codevalue -= OP_STARI - OP_STAR;
2441          /* Fall through */
2442        case OP_EXACT:        case OP_EXACT:
2443        case OP_NOTEXACT:        case OP_NOTEXACT:
2444        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2445        if (clen > 0)        if (clen > 0)
2446          {          {
2447          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2448          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2449            {            {
2450  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2451            if (utf8 && d >= 128)            if (utf && d >= 128)
2452              {              {
2453  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2454              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2455  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2456              }              }
2457            else            else
2458  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2459            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2460            }            }
2461          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2462            {            {
2463            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2464              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2465            else            else
2466              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2467            }            }
# Line 1605  for (;;) Line 2469  for (;;)
2469        break;        break;
2470    
2471        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2472          case OP_UPTOI:
2473          case OP_MINUPTOI:
2474          case OP_POSUPTOI:
2475          case OP_NOTUPTOI:
2476          case OP_NOTMINUPTOI:
2477          case OP_NOTPOSUPTOI:
2478          caseless = TRUE;
2479          codevalue -= OP_STARI - OP_STAR;
2480          /* Fall through */
2481        case OP_UPTO:        case OP_UPTO:
2482        case OP_MINUPTO:        case OP_MINUPTO:
2483        case OP_POSUPTO:        case OP_POSUPTO:
2484        case OP_NOTUPTO:        case OP_NOTUPTO:
2485        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2486        case OP_NOTPOSUPTO:        case OP_NOTPOSUPTO:
2487        ADD_ACTIVE(state_offset + dlen + 3, 0);        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2488        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2489        if (clen > 0)        if (clen > 0)
2490          {          {
2491          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2492          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2493            {            {
2494  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2495            if (utf8 && d >= 128)            if (utf && d >= 128)
2496              {              {
2497  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2498              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2499  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2500              }              }
2501            else            else
2502  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2503            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2504            }            }
2505          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2506            {            {
# Line 1637  for (;;) Line 2510  for (;;)
2510              next_active_state--;              next_active_state--;
2511              }              }
2512            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2513              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2514            else            else
2515              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2516            }            }
# Line 1654  for (;;) Line 2527  for (;;)
2527          {          {
2528          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2529          int next_state_offset;          int next_state_offset;
2530          const uschar *ecode;          const pcre_uchar *ecode;
2531    
2532          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2533          can set isinclass from it. */          can set isinclass from it. */
2534    
2535          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2536            {            {
2537            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2538            if (clen > 0)            if (clen > 0)
2539              {              {
2540              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2541                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2542              }              }
2543            }            }
2544    
# Line 1676  for (;;) Line 2549  for (;;)
2549          else          else
2550           {           {
2551           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2552           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2553           }           }
2554    
2555          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2556          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2557          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2558    
2559          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2560    
2561          switch (*ecode)          switch (*ecode)
2562            {            {
# Line 1710  for (;;) Line 2583  for (;;)
2583            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2584            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2585            if (count >= GET2(ecode, 1))            if (count >= GET2(ecode, 1))
2586              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2587            if (isinclass)            if (isinclass)
2588              {              {
2589              int max = GET2(ecode, 3);              int max = GET2(ecode, 1 + IMM2_SIZE);
2590              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2591                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2592              else              else
2593                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2594              }              }
# Line 1730  for (;;) Line 2603  for (;;)
2603    
2604  /* ========================================================================== */  /* ========================================================================== */
2605        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2606        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2607          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2608          though the other "backtracking verbs" are not supported. */
2609    
2610          case OP_FAIL:
2611          forced_fail++;    /* Count FAILs for multiple states */
2612          break;
2613    
2614        case OP_ASSERT:        case OP_ASSERT:
2615        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1740  for (;;) Line 2619  for (;;)
2619          int rc;          int rc;
2620          int local_offsets[2];          int local_offsets[2];
2621          int local_workspace[1000];          int local_workspace[1000];
2622          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2623    
2624          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2625    
# Line 1748  for (;;) Line 2627  for (;;)
2627            md,                                   /* static match data */            md,                                   /* static match data */
2628            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2629            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2630            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2631            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2632            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2633            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2634            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2635            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2636    
2637            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2638          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2639              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2640          }          }
2641        break;        break;
2642    
# Line 1768  for (;;) Line 2646  for (;;)
2646          {          {
2647          int local_offsets[1000];          int local_offsets[1000];
2648          int local_workspace[1000];          int local_workspace[1000];
2649          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2650            int condcode;
2651    
2652            /* Because of the way auto-callout works during compile, a callout item
2653            is inserted between OP_COND and an assertion condition. This does not
2654            happen for the other conditions. */
2655    
2656            if (code[LINK_SIZE+1] == OP_CALLOUT)
2657              {
2658              rrc = 0;
2659              if (PUBL(callout) != NULL)
2660                {
2661                PUBL(callout_block) cb;
2662                cb.version          = 1;   /* Version 1 of the callout block */
2663                cb.callout_number   = code[LINK_SIZE+2];
2664                cb.offset_vector    = offsets;
2665    #ifdef COMPILE_PCRE8
2666                cb.subject          = (PCRE_SPTR)start_subject;
2667    #else
2668                cb.subject          = (PCRE_SPTR16)start_subject;
2669    #endif
2670                cb.subject_length   = (int)(end_subject - start_subject);
2671                cb.start_match      = (int)(current_subject - start_subject);
2672                cb.current_position = (int)(ptr - start_subject);
2673                cb.pattern_position = GET(code, LINK_SIZE + 3);
2674                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2675                cb.capture_top      = 1;
2676                cb.capture_last     = -1;
2677                cb.callout_data     = md->callout_data;
2678                cb.mark             = NULL;   /* No (*MARK) support */
2679                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2680                }
2681              if (rrc > 0) break;                      /* Fail this thread */
2682              code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2683              }
2684    
2685            condcode = code[LINK_SIZE+1];
2686    
2687          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2688    
2689          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2690              return PCRE_ERROR_DFA_UCOND;
2691    
2692          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2693    
2694          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2695            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2696    
2697          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2698          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2699          recursed groups. */          recursed groups. */
2700    
2701          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2702            {            {
2703            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE + 2);
2704            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2705            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (md->recursive != NULL)
2706              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2707              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2708            }            }
2709    
2710          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1798  for (;;) Line 2712  for (;;)
2712          else          else
2713            {            {
2714            int rc;            int rc;
2715            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2716            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2717    
2718            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2719    
# Line 1807  for (;;) Line 2721  for (;;)
2721              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2722              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2723              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2724              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2725              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2726              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2727              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2728              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2729              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2730    
2731              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2732            if ((rc >= 0) ==            if ((rc >= 0) ==
2733                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2734              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2735            else            else
2736              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2737            }            }
2738          }          }
2739        break;        break;
# Line 1828  for (;;) Line 2741  for (;;)
2741        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2742        case OP_RECURSE:        case OP_RECURSE:
2743          {          {
2744            dfa_recursion_info *ri;
2745          int local_offsets[1000];          int local_offsets[1000];
2746          int local_workspace[1000];          int local_workspace[1000];
2747            const pcre_uchar *callpat = start_code + GET(code, 1);
2748            int recno = (callpat == md->start_code)? 0 :
2749              GET2(callpat, 1 + LINK_SIZE);
2750          int rc;          int rc;
2751    
2752          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2753            recursing + 1));  
2754            /* Check for repeating a recursion without advancing the subject
2755            pointer. This should catch convoluted mutual recursions. (Some simple
2756            cases are caught at compile time.) */
2757    
2758            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2759              if (recno == ri->group_num && ptr == ri->subject_position)
2760                return PCRE_ERROR_RECURSELOOP;
2761    
2762            /* Remember this recursion and where we started it so as to
2763            catch infinite loops. */
2764    
2765            new_recursive.group_num = recno;
2766            new_recursive.subject_position = ptr;
2767            new_recursive.prevrec = md->recursive;
2768            md->recursive = &new_recursive;
2769    
2770          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2771            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2772            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2773            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2774            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2775            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2776            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2777            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2778            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2779            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
2780            rlevel,                               /* function recursion level */  
2781            recursing + 1);                       /* regex recurse level */          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2782    
2783          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2784            recursing + 1, rc));            rc));
2785    
2786          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2787    
# Line 1863  for (;;) Line 2795  for (;;)
2795            {            {
2796            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2797              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2798              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2799              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #ifdef SUPPORT_UTF
2800                if (utf)
2801                  {
2802                  const pcre_uchar *p = start_subject + local_offsets[rc];
2803                  const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2804                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2805                  }
2806    #endif
2807              if (charcount > 0)              if (charcount > 0)
2808                {                {
2809                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 1882  for (;;) Line 2819  for (;;)
2819        break;        break;
2820    
2821        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2822          case OP_BRAPOS:
2823          case OP_SBRAPOS:
2824          case OP_CBRAPOS:
2825          case OP_SCBRAPOS:
2826          case OP_BRAPOSZERO:
2827            {
2828            int charcount, matched_count;
2829            const pcre_uchar *local_ptr = ptr;
2830            BOOL allow_zero;
2831    
2832            if (codevalue == OP_BRAPOSZERO)
2833              {
2834              allow_zero = TRUE;
2835              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2836              }
2837            else allow_zero = FALSE;
2838    
2839            /* Loop to match the subpattern as many times as possible as if it were
2840            a complete pattern. */
2841    
2842            for (matched_count = 0;; matched_count++)
2843              {
2844              int local_offsets[2];
2845              int local_workspace[1000];
2846    
2847              int rc = internal_dfa_exec(
2848                md,                                   /* fixed match data */
2849                code,                                 /* this subexpression's code */
2850                local_ptr,                            /* where we currently are */
2851                (int)(ptr - start_subject),           /* start offset */
2852                local_offsets,                        /* offset vector */
2853                sizeof(local_offsets)/sizeof(int),    /* size of same */
2854                local_workspace,                      /* workspace vector */
2855                sizeof(local_workspace)/sizeof(int),  /* size of same */
2856                rlevel);                              /* function recursion level */
2857    
2858              /* Failed to match */
2859    
2860              if (rc < 0)
2861                {
2862                if (rc != PCRE_ERROR_NOMATCH) return rc;
2863                break;
2864                }
2865    
2866              /* Matched: break the loop if zero characters matched. */
2867    
2868              charcount = local_offsets[1] - local_offsets[0];
2869              if (charcount == 0) break;
2870              local_ptr += charcount;    /* Advance temporary position ptr */
2871              }
2872    
2873            /* At this point we have matched the subpattern matched_count
2874            times, and local_ptr is pointing to the character after the end of the
2875            last match. */
2876    
2877            if (matched_count > 0 || allow_zero)
2878              {
2879              const pcre_uchar *end_subpattern = code;
2880              int next_state_offset;
2881    
2882              do { end_subpattern += GET(end_subpattern, 1); }
2883                while (*end_subpattern == OP_ALT);
2884              next_state_offset =
2885                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2886    
2887              /* Optimization: if there are no more active states, and there
2888              are no new states yet set up, then skip over the subject string
2889              right here, to save looping. Otherwise, set up the new state to swing
2890              into action when the end of the matched substring is reached. */
2891    
2892              if (i + 1 >= active_count && new_count == 0)
2893                {
2894                ptr = local_ptr;
2895                clen = 0;
2896                ADD_NEW(next_state_offset, 0);
2897                }
2898              else
2899                {
2900                const pcre_uchar *p = ptr;
2901                const pcre_uchar *pp = local_ptr;
2902                charcount = (int)(pp - p);
2903    #ifdef SUPPORT_UTF
2904                if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2905    #endif
2906                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2907                }
2908              }
2909            }
2910          break;
2911    
2912          /*-----------------------------------------------------------------*/
2913        case OP_ONCE:        case OP_ONCE:
2914          case OP_ONCE_NC:
2915          {          {
2916          int local_offsets[2];          int local_offsets[2];
2917          int local_workspace[1000];          int local_workspace[1000];
# Line 1891  for (;;) Line 2920  for (;;)
2920            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2921            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2922            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2923            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2924            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2925            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2926            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2927            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2928            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2929    
2930          if (rc >= 0)          if (rc >= 0)
2931            {            {
2932            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2933            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2934            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2935    
2936            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2937              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2938            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2939                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2940    
2941            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2942            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 1916  for (;;) Line 2944  for (;;)
2944    
2945            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2946                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2947              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2948    
2949            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2950            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 1931  for (;;) Line 2959  for (;;)
2959            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2960            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2961            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2962            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2963    
2964            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2965              {              {
# Line 1954  for (;;) Line 2982  for (;;)
2982              }              }
2983            else            else
2984              {              {
2985              const uschar *p = start_subject + local_offsets[0];  #ifdef SUPPORT_UTF
2986              const uschar *pp = start_subject + local_offsets[1];              if (utf)
2987              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;                {
2988                  const pcre_uchar *p = start_subject + local_offsets[0];
2989                  const pcre_uchar *pp = start_subject + local_offsets[1];
2990                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2991                  }
2992    #endif
2993              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2994              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2995                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2996              }              }
   
2997            }            }
2998          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2999          }          }
# Line 1972  for (;;) Line 3004  for (;;)
3004        /* Handle callouts */        /* Handle callouts */
3005    
3006        case OP_CALLOUT:        case OP_CALLOUT:
3007        if (pcre_callout != NULL)        rrc = 0;
3008          if (PUBL(callout) != NULL)
3009          {          {
3010          int rrc;          PUBL(callout_block) cb;
         pcre_callout_block cb;  
3011          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
3012          cb.callout_number   = code[1];          cb.callout_number   = code[1];
3013          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
3014    #ifdef COMPILE_PCRE8
3015          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
3016          cb.subject_length   = end_subject - start_subject;  #else
3017          cb.start_match      = current_subject - start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
3018          cb.current_position = ptr - start_subject;  #endif
3019            cb.subject_length   = (int)(end_subject - start_subject);
3020            cb.start_match      = (int)(current_subject - start_subject);
3021            cb.current_position = (int)(ptr - start_subject);
3022          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
3023          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
3024          cb.capture_top      = 1;          cb.capture_top      = 1;
3025          cb.capture_last     = -1;          cb.capture_last     = -1;
3026          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
3027          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
3028          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3029          }          }
3030          if (rrc == 0)
3031            { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3032        break;        break;
3033    
3034    
# Line 2006  for (;;) Line 3044  for (;;)
3044    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
3045    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
3046    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
3047    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
3048    
3049      The "forced_fail" variable counts the number of (*F) encountered for the
3050      character. If it is equal to the original active_count (saved in
3051      workspace[1]) it means that (*F) was found on every active state. In this
3052      case we don't want to give a partial match.
3053    
3054      The "could_continue" variable is true if a state could have continued but
3055      for the fact that the end of the subject was reached. */
3056    
3057    if (new_count <= 0)    if (new_count <= 0)
3058      {      {
3059      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
3060          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on, and */
3061          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
3062          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
3063          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3064            ||                                           /* or... */
3065            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3066             match_count < 0)                            /* no matches */
3067            ) &&                                         /* And... */
3068            (
3069            partial_newline ||                           /* Either partial NL */
3070              (                                          /* or ... */
3071              ptr >= end_subject &&                /* End of subject and */
3072              ptr > md->start_used_ptr)            /* Inspected non-empty string */
3073              )
3074            )
3075        {        {
3076        if (offsetcount >= 2)        if (offsetcount >= 2)
3077          {          {
3078          offsets[0] = current_subject - start_subject;          offsets[0] = (int)(md->start_used_ptr - start_subject);
3079          offsets[1] = end_subject - start_subject;          offsets[1] = (int)(end_subject - start_subject);
3080          }          }
3081        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
3082        }        }
# Line 2073  Returns:          > 0 => number of match Line 3130  Returns:          > 0 => number of match
3130                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3131  */  */
3132    
3133  PCRE_EXP_DEFN int  #ifdef COMPILE_PCRE8
3134    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3135  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3136    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3137    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3138    #else
3139    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3140    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3141      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3142      int offsetcount, int *workspace, int wscount)
3143    #endif
3144  {  {
3145  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3146  dfa_match_data match_block;  dfa_match_data match_block;
3147  dfa_match_data *md = &match_block;  dfa_match_data *md = &match_block;
3148  BOOL utf8, anchored, startline, firstline;  BOOL utf, anchored, startline, firstline;
3149  const uschar *current_subject, *end_subject, *lcc;  const pcre_uchar *current_subject, *end_subject;
   
 pcre_study_data internal_study;  
3150  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3151    
3152  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3153  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3154  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3155  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3156  int first_byte = -1;  pcre_uchar first_char = 0;
3157  int req_byte = -1;  pcre_uchar first_char2 = 0;
3158  int req_byte2 = -1;  pcre_uchar req_char = 0;
3159    pcre_uchar req_char2 = 0;
3160  int newline;  int newline;
3161    
3162  /* Plausibility checks */  /* Plausibility checks */
# Line 2104  if (re == NULL || subject == NULL || wor Line 3166  if (re == NULL || subject == NULL || wor
3166     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3167  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3168  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3169    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3170    
3171    /* Check that the first field in the block is the magic number. If it is not,
3172    return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3173    REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3174    means that the pattern is likely compiled with different endianness. */
3175    
3176    if (re->magic_number != MAGIC_NUMBER)
3177      return re->magic_number == REVERSED_MAGIC_NUMBER?
3178        PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3179    if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3180    
3181  /* We need to find the pointer to any study data before we test for byte  /* If restarting after a partial match, do some sanity checks on the contents
3182  flipping, so we scan the extra_data block first. This may set two fields in the  of the workspace. */
3183  match block, so we must initialize them beforehand. However, the other fields  
3184  in the match block must not be set until after the byte flipping. */  if ((options & PCRE_DFA_RESTART) != 0)
3185      {
3186      if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3187        workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3188          return PCRE_ERROR_DFA_BADRESTART;
3189      }
3190    
3191    /* Set up study, callout, and table data */
3192    
3193  md->tables = re->tables;  md->tables = re->tables;
3194  md->callout_data = NULL;  md->callout_data = NULL;
# Line 2127  if (extra_data != NULL) Line 3207  if (extra_data != NULL)
3207      md->tables = extra_data->tables;      md->tables = extra_data->tables;
3208    }    }
3209    
 /* Check that the first field in the block is the magic number. If it is not,  
 test for a regex that was compiled on a host of opposite endianness. If this is  
 the case, flipped values are put in internal_re and internal_study if there was  
 study data too. */  
   
 if (re->magic_number != MAGIC_NUMBER)  
   {  
   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);  
   if (re == NULL) return PCRE_ERROR_BADMAGIC;  
   if (study != NULL) study = &internal_study;  
   }  
   
3210  /* Set some local values */  /* Set some local values */
3211    
3212  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3213  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3214  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3215    
3216  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3217  utf8 = (re->options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3218    utf = (re->options & PCRE_UTF8) != 0;
3219  #else  #else
3220  utf8 = FALSE;  utf = FALSE;
3221  #endif  #endif
3222    
3223  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
# Line 2156  anchored = (options & (PCRE_ANCHORED|PCR Line 3225  anchored = (options & (PCRE_ANCHORED|PCR
3225    
3226  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3227    
3228  md->start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3229      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3230  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3231  md->end_subject = end_subject;  md->end_subject = end_subject;
3232    md->start_offset = start_offset;
3233  md->moptions = options;  md->moptions = options;
3234  md->poptions = re->options;  md->poptions = re->options;
3235    
3236    /* If the BSR option is not set at match time, copy what was set
3237    at compile time. */
3238    
3239    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3240      {
3241      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3242        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3243    #ifdef BSR_ANYCRLF
3244      else md->moptions |= PCRE_BSR_ANYCRLF;
3245    #endif
3246      }
3247    
3248  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
3249  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
3250    
# Line 2170  switch ((((options & PCRE_NEWLINE_BITS) Line 3252  switch ((((options & PCRE_NEWLINE_BITS)
3252           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
3253    {    {
3254    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
3255    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3256    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3257    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
3258         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3259    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
3260    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3261    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2206  else Line 3288  else
3288  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3289  back the character offset. */  back the character offset. */
3290    
3291  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3292  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3293    {    {
3294    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3295      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3296    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3297      {      {
3298      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3299        {        {
3300        tb &= 0xc0;        offsets[0] = erroroffset;
3301        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3302        }        }
3303        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3304          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3305      }      }
3306      if (start_offset > 0 && start_offset < length &&
3307            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3308        return PCRE_ERROR_BADUTF8_OFFSET;
3309    }    }
3310  #endif  #endif
3311    
# Line 2227  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3313  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3313  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3314  in other programs later. */  in other programs later. */
3315    
3316  if (md->tables == NULL) md->tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3317    
3318  /* The lower casing table and the "must be at the start of a line" flag are  /* The "must be at the start of a line" flags are used in a loop when finding
3319  used in a loop when finding where to start. */  where to start. */
3320    
3321  lcc = md->tables + lcc_offset;  startline = (re->flags & PCRE_STARTLINE) != 0;
 startline = (re->options & PCRE_STARTLINE) != 0;  
3322  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3323    
3324  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2244  studied, there may be a bitmap of possib Line 3329  studied, there may be a bitmap of possib
3329    
3330  if (!anchored)  if (!anchored)
3331    {    {
3332    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3333      {      {
3334      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3335      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = (pcre_uchar)(re->first_char);
3336        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3337          {
3338          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3339    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3340          if (utf && first_char > 127)
3341            first_char2 = UCD_OTHERCASE(first_char);
3342    #endif
3343          }
3344      }      }
3345    else    else
3346      {      {
3347      if (startline && study != NULL &&      if (!startline && study != NULL &&
3348           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3349        start_bits = study->start_bits;        start_bits = study->start_bits;
3350      }      }
3351    }    }
# Line 2261  if (!anchored) Line 3353  if (!anchored)
3353  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
3354  character" set. */  character" set. */
3355    
3356  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3357    {    {
3358    req_byte = re->req_byte & 255;    has_req_char = TRUE;
3359    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_char = req_char2 = (pcre_uchar)(re->req_char);
3360    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3361        {
3362        req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3363    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3364        if (utf && req_char > 127)
3365          req_char2 = UCD_OTHERCASE(req_char);
3366    #endif
3367        }
3368    }    }
3369    
3370  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
3371  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
3372  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
3373    
3374  for (;;)  for (;;)
3375    {    {
# Line 2279  for (;;) Line 3377  for (;;)
3377    
3378    if ((options & PCRE_DFA_RESTART) == 0)    if ((options & PCRE_DFA_RESTART) == 0)
3379      {      {
3380      const uschar *save_end_subject = end_subject;      const pcre_uchar *save_end_subject = end_subject;
3381    
3382      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
3383      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
3384      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
3385      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
3386    
3387      if (firstline)      if (firstline)
3388        {        {
3389        const uschar *t = current_subject;        PCRE_PUCHAR t = current_subject;
3390    #ifdef SUPPORT_UTF
3391          if (utf)
3392            {
3393            while (t < md->end_subject && !IS_NEWLINE(t))
3394              {
3395              t++;
3396              ACROSSCHAR(t < end_subject, *t, t++);
3397              }
3398            }
3399          else
3400    #endif
3401        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3402        end_subject = t;        end_subject = t;
3403        }        }
3404    
3405      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
3406        starting point is not found. However, there is an option that disables
3407        these, for testing and for ensuring that all callouts do actually occur.
3408        The option can be set in the regex by (*NO_START_OPT) or passed in
3409        match-time options. */
3410    
3411        if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3412        {        {
3413        if (first_byte_caseless)        /* Advance to a known first char. */
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
3414    
3415      /* Or to just after a linebreak for a multiline match if possible */        if (has_first_char)
3416            {
3417            if (first_char != first_char2)
3418              while (current_subject < end_subject &&
3419                  *current_subject != first_char && *current_subject != first_char2)
3420                current_subject++;
3421            else
3422              while (current_subject < end_subject &&
3423                     *current_subject != first_char)
3424                current_subject++;
3425            }
3426    
3427      else if (startline)        /* Or to just after a linebreak for a multiline match if possible */
3428        {  
3429        if (current_subject > md->start_subject + start_offset)        else if (startline)
3430          {          {
3431          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (current_subject > md->start_subject + start_offset)
3432            current_subject++;            {
3433    #ifdef SUPPORT_UTF
3434              if (utf)
3435                {
3436                while (current_subject < end_subject &&
3437                       !WAS_NEWLINE(current_subject))
3438                  {
3439                  current_subject++;
3440                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3441                    current_subject++);
3442                  }
3443                }
3444              else
3445    #endif
3446              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3447                current_subject++;
3448    
3449          /* If we have just passed a CR and the newline option is ANY or            /* If we have just passed a CR and the newline option is ANY or
3450          ANYCRLF, and we are now at a LF, advance the match position by one more            ANYCRLF, and we are now at a LF, advance the match position by one
3451          character. */            more character. */
3452    
3453          if (current_subject[-1] == '\r' &&            if (current_subject[-1] == CHAR_CR &&
3454               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3455               current_subject < end_subject &&                 current_subject < end_subject &&
3456               *current_subject == '\n')                 *current_subject == CHAR_NL)
3457            current_subject++;              current_subject++;
3458              }
3459          }          }
       }  
3460    
3461      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
3462    
3463      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
3464          {          {
3465          register unsigned int c = *current_subject;          while (current_subject < end_subject)
3466          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
3467              register unsigned int c = *current_subject;
3468    #ifndef COMPILE_PCRE8
3469              if (c > 255) c = 255;
3470    #endif
3471              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3472                {
3473                current_subject++;
3474    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3475                /* In non 8-bit mode, the iteration will stop for
3476                characters > 255 at the beginning or not stop at all. */
3477                if (utf)
3478                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3479                    current_subject++);
3480    #endif
3481                }
3482            else break;            else break;
3483              }
3484          }          }
3485        }        }
3486    
3487      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3488    
3489      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
3490    
3491    /* If req_byte is set, we know that that character must appear in the subject      /* The following two optimizations are disabled for partial matching or if
3492    for the match to succeed. If the first character is set, req_byte must be      disabling is explicitly requested (and of course, by the test above, this
3493    later in the subject; otherwise the test starts at the match point. This      code is not obeyed when restarting after a partial match). */
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested.  
   */  
   
   if (req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
   
     /* We don't need to repeat the search if we haven't yet reached the  
     place we found it at last time. */  
3494    
3495      if (p > req_byte_ptr)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3496            (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3497        {        {
3498        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3499          {        is a lower bound; no actual string of that length may actually match the
3500          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3501            {        bytes to avoid spending too much time in this optimization. */
3502            register int pp = *p++;  
3503            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3504            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3505          }          return PCRE_ERROR_NOMATCH;
3506        else  
3507          /* If req_char is set, we know that that character must appear in the
3508          subject for the match to succeed. If the first character is set, req_char
3509          must be later in the subject; otherwise the test starts at the match
3510          point. This optimization can save a huge amount of work in patterns with
3511          nested unlimited repeats that aren't going to match. Writing separate
3512          code for cased/caseless versions makes it go faster, as does using an
3513          autoincrement and backing off on a match.
3514    
3515          HOWEVER: when the subject string is very, very long, searching to its end
3516          can take a long time, and give bad performance on quite ordinary
3517          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3518          string... so we don't do this when the string is sufficiently long. */
3519    
3520          if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3521          {          {
3522          while (p < end_subject)          register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3523    
3524            /* We don't need to repeat the search if we haven't yet reached the
3525            place we found it at last time. */
3526    
3527            if (p > req_char_ptr)
3528            {            {
3529            if (*p++ == req_byte) { p--; break; }            if (req_char != req_char2)
3530            }              {
3531          }              while (p < end_subject)
3532                  {
3533                  register int pp = *p++;
3534                  if (pp == req_char || pp == req_char2) { p--; break; }
3535                  }
3536                }
3537              else
3538                {
3539                while (p < end_subject)
3540                  {
3541                  if (*p++ == req_char) { p--; break; }
3542                  }
3543                }
3544    
3545        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3546        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3547    
3548        if (p >= end_subject) break;            if (p >= end_subject) break;
3549    
3550        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3551        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3552        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3553    
3554        req_byte_ptr = p;            req_char_ptr = p;
3555              }
3556            }
3557        }        }
3558      }      }   /* End of optimizations that are done when not restarting */
3559    
3560    /* OK, now we can do the business */    /* OK, now we can do the business */
3561    
3562      md->start_used_ptr = current_subject;
3563      md->recursive = NULL;
3564    
3565    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3566      md,                                /* fixed match data */      md,                                /* fixed match data */
3567      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2410  for (;;) Line 3571  for (;;)
3571      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3572      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3573      wscount,                           /* size of same */      wscount,                           /* size of same */
3574      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3575    
3576    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3577    on only if not anchored. */    on only if not anchored. */
# Line 2424  for (;;) Line 3583  for (;;)
3583    
3584    if (firstline && IS_NEWLINE(current_subject)) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3585    current_subject++;    current_subject++;
3586    if (utf8)  #ifdef SUPPORT_UTF
3587      if (utf)
3588      {      {
3589      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      ACROSSCHAR(current_subject < end_subject, *current_subject,
3590        current_subject++;        current_subject++);
3591      }      }
3592    #endif
3593    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3594    
3595    /* If we have just passed a CR and the newline option is CRLF or ANY or    /* If we have just passed a CR and we are now at a LF, and the pattern does
3596    ANYCRLF, and we are now at a LF, advance the match position by one more    not contain any explicit matches for \r or \n, and the newline option is CRLF
3597    character. */    or ANY or ANYCRLF, advance the match position by one more character. */
3598    
3599    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
3600         (md->nltype == NLTYPE_ANY ||        current_subject < end_subject &&
3601          md->nltype == NLTYPE_ANYCRLF ||        *current_subject == CHAR_NL &&
3602          md->nllen == 2) &&        (re->flags & PCRE_HASCRORLF) == 0 &&
3603         current_subject < end_subject &&          (md->nltype == NLTYPE_ANY ||
3604         *current_subject == '\n')           md->nltype == NLTYPE_ANYCRLF ||
3605             md->nllen == 2))
3606      current_subject++;      current_subject++;
3607    
3608    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.150