/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | Patch

revision 168 by ph10, Tue May 29 15:18:18 2007 UTC | revision 1015 by ph10, Sun Aug 26 16:07:14 2012 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 37  POSSIBILITY OF SUCH DAMAGE. Line 38  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
39  */  */
40    
   
41  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
42  alternative matching function that uses a sort of DFA algorithm (not a true  alternative matching function that uses a sort of DFA algorithm (not a true
43  FSM). This is NOT Perl- compatible, but it has advantages in certain  FSM). This is NOT Perl-compatible, but it has advantages in certain
44  applications. */  applications. */
45    
46    
47    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48    the performance of his patterns greatly. I could not use it as it stood, as it
49    was not thread safe, and made assumptions about pattern sizes. Also, it caused
50    test 7 to loop, and test 9 to crash with a segfault.
51    
52    The issue is the check for duplicate states, which is done by a simple linear
53    search up the state list. (Grep for "duplicate" below to find the code.) For
54    many patterns, there will never be many states active at one time, so a simple
55    linear search is fine. In patterns that have many active states, it might be a
56    bottleneck. The suggested code used an indexing scheme to remember which states
57    had previously been used for each character, and avoided the linear search when
58    it knew there was no chance of a duplicate. This was implemented when adding
59    states to the state lists.
60    
61    I wrote some thread-safe, not-limited code to try something similar at the time
62    of checking for duplicates (instead of when adding states), using index vectors
63    on the stack. It did give a 13% improvement with one specially constructed
64    pattern for certain subject strings, but on other strings and on many of the
65    simpler patterns in the test suite it did worse. The major problem, I think,
66    was the extra time to initialize the index. This had to be done for each call
67    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68    only once - I suspect this was the cause of the problems with the tests.)
69    
70    Overall, I concluded that the gains in some cases did not outweigh the losses
71    in others, so I abandoned this code. */
72    
73    
74    
75    #ifdef HAVE_CONFIG_H
76    #include "config.h"
77    #endif
78    
79  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
80  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
81  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 56  applications. */ Line 88  applications. */
88  #define SP "                   "  #define SP "                   "
89    
90    
   
91  /*************************************************  /*************************************************
92  *      Code parameters and static tables         *  *      Code parameters and static tables         *
93  *************************************************/  *************************************************/
94    
95  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
97  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
98    never stored, so we push them well clear of the normal opcodes. */
99    
100  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
101  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
102  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
103    #define OP_HSPACE_EXTRA     360
104    #define OP_VSPACE_EXTRA     380
105    
106    
107  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
108  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
109  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
110  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. Non-zero values in the table are the offsets from the opcode where
112  that follow must also be modified. */  the character is to be found. ***NOTE*** If the start of this table is
113    modified, the three tables that follow must also be modified. */
114    
115  static uschar coptable[] = {  static const pcre_uint8 coptable[] = {
116    0,                             /* End                                    */    0,                             /* End                                    */
117    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119    0, 0,                          /* Any, Anybyte                           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0,                          /* \P, \p                                 */
121    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122      0,                             /* \X                                     */
123      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
124    1,                             /* Char                                   */    1,                             /* Char                                   */
125    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
126    1,                             /* not                                    */    1,                             /* not                                    */
127      1,                             /* noti                                   */
128    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
129    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1+IMM2_SIZE,                   /* exact                                  */
132      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135      1+IMM2_SIZE,                   /* exact I                                */
136      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
138    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1+IMM2_SIZE,                   /* NOT exact                              */
141      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144      1+IMM2_SIZE,                   /* NOT exact I                            */
145      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
147    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149    1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */    1+IMM2_SIZE,                   /* Type exact                             */
150      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
152    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 106  static uschar coptable[] = { Line 155  static uschar coptable[] = {
155    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
156    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
157    0,                             /* REF                                    */    0,                             /* REF                                    */
158      0,                             /* REFI                                   */
159    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
160    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
161    0,                             /* Alt                                    */    0,                             /* Alt                                    */
162    0,                             /* Ket                                    */    0,                             /* Ket                                    */
163    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
164    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
165      0,                             /* KetRpos                                */
166      0,                             /* Reverse                                */
167    0,                             /* Assert                                 */    0,                             /* Assert                                 */
168    0,                             /* Assert not                             */    0,                             /* Assert not                             */
169    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
170    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
171      0, 0,                          /* ONCE, ONCE_NC                          */
172      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
173      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
174      0, 0,                          /* CREF, NCREF                            */
175      0, 0,                          /* RREF, NRREF                            */
176      0,                             /* DEF                                    */
177      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
178      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
179      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
180      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
181    0, 0                           /* CLOSE, SKIPZERO                        */
182    };
183    
184    /* This table identifies those opcodes that inspect a character. It is used to
185    remember the fact that a character could have been inspected when the end of
186    the subject is reached. ***NOTE*** If the start of this table is modified, the
187    two tables that follow must also be modified. */
188    
189    static const pcre_uint8 poptable[] = {
190      0,                             /* End                                    */
191      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
192      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
193      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
194      1, 1,                          /* \P, \p                                 */
195      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
196      1,                             /* \X                                     */
197      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
198      1,                             /* Char                                   */
199      1,                             /* Chari                                  */
200      1,                             /* not                                    */
201      1,                             /* noti                                   */
202      /* Positive single-char repeats                                          */
203      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
204      1, 1, 1,                       /* upto, minupto, exact                   */
205      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
206      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
207      1, 1, 1,                       /* upto I, minupto I, exact I             */
208      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
209      /* Negative single-char repeats - only for chars < 256                   */
210      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
211      1, 1, 1,                       /* NOT upto, minupto, exact               */
212      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
213      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
214      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
215      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
216      /* Positive type repeats                                                 */
217      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
218      1, 1, 1,                       /* Type upto, minupto, exact              */
219      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
220      /* Character class & ref repeats                                         */
221      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
222      1, 1,                          /* CRRANGE, CRMINRANGE                    */
223      1,                             /* CLASS                                  */
224      1,                             /* NCLASS                                 */
225      1,                             /* XCLASS - variable length               */
226      0,                             /* REF                                    */
227      0,                             /* REFI                                   */
228      0,                             /* RECURSE                                */
229      0,                             /* CALLOUT                                */
230      0,                             /* Alt                                    */
231      0,                             /* Ket                                    */
232      0,                             /* KetRmax                                */
233      0,                             /* KetRmin                                */
234      0,                             /* KetRpos                                */
235    0,                             /* Reverse                                */    0,                             /* Reverse                                */
236    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0,                             /* Assert                                 */
237    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0,                             /* Assert not                             */
238    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
239    0,                             /* RREF                                   */    0,                             /* Assert behind not                      */
240      0, 0,                          /* ONCE, ONCE_NC                          */
241      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
242      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
243      0, 0,                          /* CREF, NCREF                            */
244      0, 0,                          /* RREF, NRREF                            */
245    0,                             /* DEF                                    */    0,                             /* DEF                                    */
246    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
247      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
248      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
249      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
250      0, 0                           /* CLOSE, SKIPZERO                        */
251  };  };
252    
253  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
254  and \w */  and \w */
255    
256  static uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
257    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
258    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
259    ctype_space, ctype_space,    ctype_space, ctype_space,
260    ctype_word,  ctype_word,    ctype_word,  ctype_word,
261    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
262  };  };
263    
264  static uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
265    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
266    ctype_digit, 0,    ctype_digit, 0,
267    ctype_space, 0,    ctype_space, 0,
268    ctype_word,  0,    ctype_word,  0,
269    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
270  };  };
271    
272    
# Line 153  these structures in, is a vector of ints Line 278  these structures in, is a vector of ints
278  typedef struct stateblock {  typedef struct stateblock {
279    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
280    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
281    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
282  } stateblock;  } stateblock;
283    
284  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
285    
286    
287  #ifdef DEBUG  #ifdef PCRE_DEBUG
288  /*************************************************  /*************************************************
289  *             Print character string             *  *             Print character string             *
290  *************************************************/  *************************************************/
# Line 176  Returns:       nothing Line 300  Returns:       nothing
300  */  */
301    
302  static void  static void
303  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
304  {  {
305  int c;  int c;
306  while (length-- > 0)  while (length-- > 0)
# Line 209  Arguments: Line 333  Arguments:
333    offsetcount       size of same    offsetcount       size of same
334    workspace         vector of workspace    workspace         vector of workspace
335    wscount           size of same    wscount           size of same
   ims               the current ims flags  
336    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
337    
338  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
339                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
340                       -1 => failed to match                       -1 => failed to match
341                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
342    
# Line 226  for the current character, one for the f Line 348  for the current character, one for the f
348      { \      { \
349      next_active_state->offset = (x); \      next_active_state->offset = (x); \
350      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
351      next_active_state++; \      next_active_state++; \
352      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
353      } \      } \
# Line 237  for the current character, one for the f Line 358  for the current character, one for the f
358      { \      { \
359      next_active_state->offset = (x); \      next_active_state->offset = (x); \
360      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
361      next_active_state->data   = (z); \      next_active_state->data   = (z); \
362      next_active_state++; \      next_active_state++; \
363      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 249  for the current character, one for the f Line 369  for the current character, one for the f
369      { \      { \
370      next_new_state->offset = (x); \      next_new_state->offset = (x); \
371      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
372      next_new_state++; \      next_new_state++; \
373      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
374      } \      } \
# Line 260  for the current character, one for the f Line 379  for the current character, one for the f
379      { \      { \
380      next_new_state->offset = (x); \      next_new_state->offset = (x); \
381      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
382      next_new_state->data   = (z); \      next_new_state->data   = (z); \
383      next_new_state++; \      next_new_state++; \
384      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
385          (x), (y), (z), __LINE__)); \
386      } \      } \
387    else return PCRE_ERROR_DFA_WSSIZE    else return PCRE_ERROR_DFA_WSSIZE
388    
# Line 272  for the current character, one for the f Line 391  for the current character, one for the f
391  static int  static int
392  internal_dfa_exec(  internal_dfa_exec(
393    dfa_match_data *md,    dfa_match_data *md,
394    const uschar *this_start_code,    const pcre_uchar *this_start_code,
395    const uschar *current_subject,    const pcre_uchar *current_subject,
396    int start_offset,    int start_offset,
397    int *offsets,    int *offsets,
398    int offsetcount,    int offsetcount,
399    int *workspace,    int *workspace,
400    int wscount,    int wscount,
401    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
402  {  {
403  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
404  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
405    
406  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
407  const uschar *ptr;  const pcre_uchar *ptr;
408  const uschar *end_code, *first_op;  const pcre_uchar *end_code, *first_op;
409    
410    dfa_recursion_info new_recursive;
411    
412  int active_count, new_count, match_count;  int active_count, new_count, match_count;
413    
414  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
415  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
416    
417  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
418  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
419  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
420    
421  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
422  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423  #else  #else
424  BOOL utf8 = FALSE;  BOOL utf = FALSE;
425  #endif  #endif
426    
427    BOOL reset_could_continue = FALSE;
428    
429  rlevel++;  rlevel++;
430  offsetcount &= (-2);  offsetcount &= (-2);
431    
# Line 313  wscount = (wscount - (wscount % (INTS_PE Line 434  wscount = (wscount - (wscount % (INTS_PE
434            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
435    
436  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
437    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
438    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439    
440  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
441  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 327  next_new_state = new_states = active_sta Line 448  next_new_state = new_states = active_sta
448  new_count = 0;  new_count = 0;
449    
450  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
451    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453        ? IMM2_SIZE:0);
454    
455  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 355  if (*first_op == OP_REVERSE) Line 478  if (*first_op == OP_REVERSE)
478    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
479    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
480    
481  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
482    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
483    
484    if (utf8)    if (utf)
485      {      {
486      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
487        {        {
488        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
489        current_subject--;        current_subject--;
490        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
491        }        }
492      }      }
493    else    else
# Line 376  if (*first_op == OP_REVERSE) Line 497  if (*first_op == OP_REVERSE)
497    
498      {      {
499      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
500        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
501      current_subject -= gone_back;      current_subject -= gone_back;
502      }      }
503    
504      /* Save the earliest consulted character */
505    
506      if (current_subject < md->start_used_ptr)
507        md->start_used_ptr = current_subject;
508    
509    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
510    
511    end_code = this_start_code;    end_code = this_start_code;
# Line 388  if (*first_op == OP_REVERSE) Line 514  if (*first_op == OP_REVERSE)
514      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
515      if (back <= gone_back)      if (back <= gone_back)
516        {        {
517        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
519        }        }
520      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 421  else Line 547  else
547    else    else
548      {      {
549      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
550        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552            ? IMM2_SIZE:0);
553      do      do
554        {        {
555        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
556        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
557        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
558        }        }
# Line 434  else Line 562  else
562    
563  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
564    
565  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
566    
567  /* Loop for scanning the subject */  /* Loop for scanning the subject */
568    
# Line 444  for (;;) Line 572  for (;;)
572    int i, j;    int i, j;
573    int clen, dlen;    int clen, dlen;
574    unsigned int c, d;    unsigned int c, d;
575      int forced_fail = 0;
576      BOOL partial_newline = FALSE;
577      BOOL could_continue = reset_could_continue;
578      reset_could_continue = FALSE;
579    
580    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
581    new state list. */    new state list. */
# Line 457  for (;;) Line 589  for (;;)
589    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
590    workspace[1] = active_count;    workspace[1] = active_count;
591    
592  #ifdef DEBUG  #ifdef PCRE_DEBUG
593    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
595    printf("\"\n");    printf("\"\n");
596    
597    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 479  for (;;) Line 611  for (;;)
611    
612    if (ptr < end_subject)    if (ptr < end_subject)
613      {      {
614      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of data items in the character */
615  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
616      if (utf8) { GETCHARLEN(c, ptr, clen); } else      if (utf) { GETCHARLEN(c, ptr, clen); } else
617  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
618      c = *ptr;      c = *ptr;
619      }      }
620    else    else
# Line 499  for (;;) Line 631  for (;;)
631    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
632      {      {
633      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
634      const uschar *code;      BOOL caseless = FALSE;
635        const pcre_uchar *code;
636      int state_offset = current_state->offset;      int state_offset = current_state->offset;
637      int count, codevalue;      int count, codevalue, rrc;
 #ifdef SUPPORT_UCP  
     int chartype, script;  
 #endif  
638    
639  #ifdef DEBUG  #ifdef PCRE_DEBUG
640      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
641      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
642        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
643          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
644  #endif  #endif
645    
     /* This variable is referred to implicitly in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
646      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
647      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
648      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
649        state, arrange for it to be passed on. */
650    
651      if (state_offset < 0)      if (state_offset < 0)
652        {        {
# Line 528  for (;;) Line 655  for (;;)
655          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
656          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
657            current_state->data - 1);            current_state->data - 1);
658            if (could_continue) reset_could_continue = TRUE;
659          continue;          continue;
660          }          }
661        else        else
# Line 536  for (;;) Line 664  for (;;)
664          }          }
665        }        }
666    
667      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
668        See the note at the head of this module about the possibility of improving
669        performance here. */
670    
671      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
672        {        {
# Line 553  for (;;) Line 683  for (;;)
683      code = start_code + state_offset;      code = start_code + state_offset;
684      codevalue = *code;      codevalue = *code;
685    
686        /* If this opcode inspects a character, but we are at the end of the
687        subject, remember the fact for use when testing for a partial match. */
688    
689        if (clen == 0 && poptable[codevalue] != 0)
690          could_continue = TRUE;
691    
692      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
693      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
694      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
695      permitted.      permitted.
696    
697      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
698      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long because
699      Unfortunately, we have to take special action to deal with  \P, \p, and      the values are small. We have to take special action to deal with  \P, \p,
700      \X in this case. To keep the other cases fast, convert these ones to new      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
701      opcodes. */      these ones to new opcodes. */
702    
703      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
704        {        {
705        dlen = 1;        dlen = 1;
706  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
707        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
708  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
709        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
710        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
711          {          {
# Line 580  for (;;) Line 716  for (;;)
716            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
717            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
718            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
719              case OP_NOT_HSPACE:
720              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
721              case OP_NOT_VSPACE:
722              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
723            default: break;            default: break;
724            }            }
725          }          }
# Line 595  for (;;) Line 735  for (;;)
735    
736      switch (codevalue)      switch (codevalue)
737        {        {
738    /* ========================================================================== */
739          /* These cases are never obeyed. This is a fudge that causes a compile-
740          time error if the vectors coptable or poptable, which are indexed by
741          opcode, are not the correct length. It seems to be the only way to do
742          such a check at compile time, as the sizeof() operator does not work
743          in the C preprocessor. */
744    
745          case OP_TABLE_LENGTH:
746          case OP_TABLE_LENGTH +
747            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
748             (sizeof(poptable) == OP_TABLE_LENGTH)):
749          break;
750    
751  /* ========================================================================== */  /* ========================================================================== */
752        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
753        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
754        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
755          subpattern, because the possessive subpattern repeats are always handled
756          using recursive calls. Thus, it never adds any new states.
757    
758          At the end of the (sub)pattern, unless we have an empty string and
759          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
760          start of the subject, save the match data, shifting up all previous
761        matches so we always have the longest first. */        matches so we always have the longest first. */
762    
763        case OP_KET:        case OP_KET:
764        case OP_KETRMIN:        case OP_KETRMIN:
765        case OP_KETRMAX:        case OP_KETRMAX:
766          case OP_KETRPOS:
767        if (code != end_code)        if (code != end_code)
768          {          {
769          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 613  for (;;) Line 772  for (;;)
772            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
773            }            }
774          }          }
775        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
776          {          {
777          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
778            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
779              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
780          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
781          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
782          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
783            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
784            offsets[0] = current_subject - start_subject;                match_count = 0;
785            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
786            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
787              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
788            }              {
789          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
790            {              offsets[1] = (int)(ptr - start_subject);
791            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
792              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], (char *)current_subject));
793              match_count, rlevel*2-2, SP));              }
794            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
795                {
796                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
797                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
798                  match_count, rlevel*2-2, SP));
799                return match_count;
800                }
801            }            }
802          }          }
803        break;        break;
# Line 644  for (;;) Line 809  for (;;)
809        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
810        case OP_ALT:        case OP_ALT:
811        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
812        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
813        break;        break;
814    
815        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 652  for (;;) Line 817  for (;;)
817        case OP_SBRA:        case OP_SBRA:
818        do        do
819          {          {
820          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
821          code += GET(code, 1);          code += GET(code, 1);
822          }          }
823        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 661  for (;;) Line 826  for (;;)
826        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
827        case OP_CBRA:        case OP_CBRA:
828        case OP_SCBRA:        case OP_SCBRA:
829        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
830        code += GET(code, 1);        code += GET(code, 1);
831        while (*code == OP_ALT)        while (*code == OP_ALT)
832          {          {
833          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
834          code += GET(code, 1);          code += GET(code, 1);
835          }          }
836        break;        break;
# Line 676  for (;;) Line 841  for (;;)
841        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
842        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
843        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
844        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
845          break;
846    
847          /*-----------------------------------------------------------------*/
848          case OP_SKIPZERO:
849          code += 1 + GET(code, 2);
850          while (*code == OP_ALT) code += GET(code, 1);
851          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
852        break;        break;
853    
854        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
855        case OP_CIRC:        case OP_CIRC:
856        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
857          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
858        break;        break;
859    
860        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
861        case OP_EOD:        case OP_CIRCM:
862        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
863              (ptr != end_subject && WAS_NEWLINE(ptr)))
864            { ADD_ACTIVE(state_offset + 1, 0); }
865        break;        break;
866    
867        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
868        case OP_OPT:        case OP_EOD:
869        ims = code[1];        if (ptr >= end_subject)
870        ADD_ACTIVE(state_offset + 2, 0);          {
871            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
872              could_continue = TRUE;
873            else { ADD_ACTIVE(state_offset + 1, 0); }
874            }
875        break;        break;
876    
877        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 718  for (;;) Line 893  for (;;)
893    
894        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
895        case OP_ANY:        case OP_ANY:
896        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
897            {
898            if (ptr + 1 >= md->end_subject &&
899                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
900                NLBLOCK->nltype == NLTYPE_FIXED &&
901                NLBLOCK->nllen == 2 &&
902                c == NLBLOCK->nl[0])
903              {
904              could_continue = partial_newline = TRUE;
905              }
906            else
907              {
908              ADD_NEW(state_offset + 1, 0);
909              }
910            }
911          break;
912    
913          /*-----------------------------------------------------------------*/
914          case OP_ALLANY:
915          if (clen > 0)
916          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
917        break;        break;
918    
919        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
920        case OP_EODN:        case OP_EODN:
921        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
922            could_continue = TRUE;
923          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
924          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
925        break;        break;
926    
# Line 732  for (;;) Line 928  for (;;)
928        case OP_DOLL:        case OP_DOLL:
929        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
930          {          {
931          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
932              (IS_NEWLINE(ptr) &&            could_continue = TRUE;
933                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)          else if (clen == 0 ||
934                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
935                   (ptr == end_subject - md->nllen)
936              ))              ))
937            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
938            else if (ptr + 1 >= md->end_subject &&
939                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
940                     NLBLOCK->nltype == NLTYPE_FIXED &&
941                     NLBLOCK->nllen == 2 &&
942                     c == NLBLOCK->nl[0])
943              {
944              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
945                {
946                reset_could_continue = TRUE;
947                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
948                }
949              else could_continue = partial_newline = TRUE;
950              }
951            }
952          break;
953    
954          /*-----------------------------------------------------------------*/
955          case OP_DOLLM:
956          if ((md->moptions & PCRE_NOTEOL) == 0)
957            {
958            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
959              could_continue = TRUE;
960            else if (clen == 0 ||
961                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
962              { ADD_ACTIVE(state_offset + 1, 0); }
963            else if (ptr + 1 >= md->end_subject &&
964                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
965                     NLBLOCK->nltype == NLTYPE_FIXED &&
966                     NLBLOCK->nllen == 2 &&
967                     c == NLBLOCK->nl[0])
968              {
969              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
970                {
971                reset_could_continue = TRUE;
972                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
973                }
974              else could_continue = partial_newline = TRUE;
975              }
976          }          }
977        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        else if (IS_NEWLINE(ptr))
978          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
979        break;        break;
980    
# Line 769  for (;;) Line 1005  for (;;)
1005    
1006          if (ptr > start_subject)          if (ptr > start_subject)
1007            {            {
1008            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
1009  #ifdef SUPPORT_UTF8            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1010            if (utf8) BACKCHAR(temp);  #ifdef SUPPORT_UTF
1011              if (utf) { BACKCHAR(temp); }
1012  #endif  #endif
1013            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
1014    #ifdef SUPPORT_UCP
1015              if ((md->poptions & PCRE_UCP) != 0)
1016                {
1017                if (d == '_') left_word = TRUE; else
1018                  {
1019                  int cat = UCD_CATEGORY(d);
1020                  left_word = (cat == ucp_L || cat == ucp_N);
1021                  }
1022                }
1023              else
1024    #endif
1025            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1026            }            }
1027          else left_word = 0;          else left_word = FALSE;
1028    
1029          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
1030            else right_word = 0;            {
1031    #ifdef SUPPORT_UCP
1032              if ((md->poptions & PCRE_UCP) != 0)
1033                {
1034                if (c == '_') right_word = TRUE; else
1035                  {
1036                  int cat = UCD_CATEGORY(c);
1037                  right_word = (cat == ucp_L || cat == ucp_N);
1038                  }
1039                }
1040              else
1041    #endif
1042              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1043              }
1044            else right_word = FALSE;
1045    
1046          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1047            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 798  for (;;) Line 1060  for (;;)
1060        if (clen > 0)        if (clen > 0)
1061          {          {
1062          BOOL OK;          BOOL OK;
1063          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1064          switch(code[1])          switch(code[1])
1065            {            {
1066            case PT_ANY:            case PT_ANY:
# Line 806  for (;;) Line 1068  for (;;)
1068            break;            break;
1069    
1070            case PT_LAMP:            case PT_LAMP:
1071            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1072                   prop->chartype == ucp_Lt;
1073            break;            break;
1074    
1075            case PT_GC:            case PT_GC:
1076            OK = category == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1077            break;            break;
1078    
1079            case PT_PC:            case PT_PC:
1080            OK = chartype == code[2];            OK = prop->chartype == code[2];
1081            break;            break;
1082    
1083            case PT_SC:            case PT_SC:
1084            OK = script == code[2];            OK = prop->script == code[2];
1085              break;
1086    
1087              /* These are specials for combination cases. */
1088    
1089              case PT_ALNUM:
1090              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1091                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1092              break;
1093    
1094              case PT_SPACE:    /* Perl space */
1095              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1096                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1097              break;
1098    
1099              case PT_PXSPACE:  /* POSIX space */
1100              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1101                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1102                   c == CHAR_FF || c == CHAR_CR;
1103              break;
1104    
1105              case PT_WORD:
1106              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1107                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1108                   c == CHAR_UNDERSCORE;
1109            break;            break;
1110    
1111            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 838  for (;;) Line 1125  for (;;)
1125  /* ========================================================================== */  /* ========================================================================== */
1126        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
1127        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
1128        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1129        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1130    
1131        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1132        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 848  for (;;) Line 1135  for (;;)
1135        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1136        if (clen > 0)        if (clen > 0)
1137          {          {
1138          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1139                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1140                NLBLOCK->nltype == NLTYPE_FIXED &&
1141                NLBLOCK->nllen == 2 &&
1142                c == NLBLOCK->nl[0])
1143              {
1144              could_continue = partial_newline = TRUE;
1145              }
1146            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1147              (c < 256 &&              (c < 256 &&
1148                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1149                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1150            {            {
1151            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 874  for (;;) Line 1166  for (;;)
1166        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1167        if (clen > 0)        if (clen > 0)
1168          {          {
1169          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1170                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1171                NLBLOCK->nltype == NLTYPE_FIXED &&
1172                NLBLOCK->nllen == 2 &&
1173                c == NLBLOCK->nl[0])
1174              {
1175              could_continue = partial_newline = TRUE;
1176              }
1177            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1178              (c < 256 &&              (c < 256 &&
1179                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1180                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1181            {            {
1182            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 899  for (;;) Line 1196  for (;;)
1196        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1197        if (clen > 0)        if (clen > 0)
1198          {          {
1199          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1200                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1201                NLBLOCK->nltype == NLTYPE_FIXED &&
1202                NLBLOCK->nllen == 2 &&
1203                c == NLBLOCK->nl[0])
1204              {
1205              could_continue = partial_newline = TRUE;
1206              }
1207            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208              (c < 256 &&              (c < 256 &&
1209                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1210                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211            {            {
1212            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 922  for (;;) Line 1224  for (;;)
1224        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1225        if (clen > 0)        if (clen > 0)
1226          {          {
1227          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1228                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1229                NLBLOCK->nltype == NLTYPE_FIXED &&
1230                NLBLOCK->nllen == 2 &&
1231                c == NLBLOCK->nl[0])
1232              {
1233              could_continue = partial_newline = TRUE;
1234              }
1235            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1236              (c < 256 &&              (c < 256 &&
1237                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1238                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1239            {            {
1240            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1241              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1242            else            else
1243              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1244            }            }
# Line 942  for (;;) Line 1249  for (;;)
1249        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1250        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1251        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1252        ADD_ACTIVE(state_offset + 4, 0);        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1253        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1254        if (clen > 0)        if (clen > 0)
1255          {          {
1256          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1257                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1258                NLBLOCK->nltype == NLTYPE_FIXED &&
1259                NLBLOCK->nllen == 2 &&
1260                c == NLBLOCK->nl[0])
1261              {
1262              could_continue = partial_newline = TRUE;
1263              }
1264            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265              (c < 256 &&              (c < 256 &&
1266                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1267                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268            {            {
1269            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 960  for (;;) Line 1272  for (;;)
1272              next_active_state--;              next_active_state--;
1273              }              }
1274            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1275              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1276            else            else
1277              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1278            }            }
# Line 982  for (;;) Line 1294  for (;;)
1294        if (clen > 0)        if (clen > 0)
1295          {          {
1296          BOOL OK;          BOOL OK;
1297          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1298          switch(code[2])          switch(code[2])
1299            {            {
1300            case PT_ANY:            case PT_ANY:
# Line 990  for (;;) Line 1302  for (;;)
1302            break;            break;
1303    
1304            case PT_LAMP:            case PT_LAMP:
1305            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1306                prop->chartype == ucp_Lt;
1307            break;            break;
1308    
1309            case PT_GC:            case PT_GC:
1310            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1311            break;            break;
1312    
1313            case PT_PC:            case PT_PC:
1314            OK = chartype == code[3];            OK = prop->chartype == code[3];
1315            break;            break;
1316    
1317            case PT_SC:            case PT_SC:
1318            OK = script == code[3];            OK = prop->script == code[3];
1319              break;
1320    
1321              /* These are specials for combination cases. */
1322    
1323              case PT_ALNUM:
1324              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1325                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1326              break;
1327    
1328              case PT_SPACE:    /* Perl space */
1329              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1330                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1331              break;
1332    
1333              case PT_PXSPACE:  /* POSIX space */
1334              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1335                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1336                   c == CHAR_FF || c == CHAR_CR;
1337              break;
1338    
1339              case PT_WORD:
1340              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1341                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1342                   c == CHAR_UNDERSCORE;
1343            break;            break;
1344    
1345            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1031  for (;;) Line 1368  for (;;)
1368        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1369        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1370        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1371        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1372          {          {
1373          const uschar *nptr = ptr + clen;          int lgb, rgb;
1374            const pcre_uchar *nptr = ptr + clen;
1375          int ncount = 0;          int ncount = 0;
1376          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1377            {            {
1378            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1379            next_active_state--;            next_active_state--;
1380            }            }
1381            lgb = UCD_GRAPHBREAK(c);
1382          while (nptr < end_subject)          while (nptr < end_subject)
1383            {            {
1384            int nd;            dlen = 1;
1385            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1386            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1387            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1388            ncount++;            ncount++;
1389            nptr += ndlen;            lgb = rgb;
1390              nptr += dlen;
1391            }            }
1392          count++;          count++;
1393          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
# Line 1066  for (;;) Line 1406  for (;;)
1406          int ncount = 0;          int ncount = 0;
1407          switch (c)          switch (c)
1408            {            {
1409              case 0x000b:
1410              case 0x000c:
1411              case 0x0085:
1412              case 0x2028:
1413              case 0x2029:
1414              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1415              goto ANYNL01;
1416    
1417            case 0x000d:            case 0x000d:
1418            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1419            /* Fall through */            /* Fall through */
1420    
1421              ANYNL01:
1422              case 0x000a:
1423              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1424                {
1425                active_count--;           /* Remove non-match possibility */
1426                next_active_state--;
1427                }
1428              count++;
1429              ADD_NEW_DATA(-state_offset, count, ncount);
1430              break;
1431    
1432              default:
1433              break;
1434              }
1435            }
1436          break;
1437    
1438          /*-----------------------------------------------------------------*/
1439          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1440          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1441          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1442          count = current_state->count;  /* Already matched */
1443          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1444          if (clen > 0)
1445            {
1446            BOOL OK;
1447            switch (c)
1448              {
1449            case 0x000a:            case 0x000a:
1450            case 0x000b:            case 0x000b:
1451            case 0x000c:            case 0x000c:
1452              case 0x000d:
1453            case 0x0085:            case 0x0085:
1454            case 0x2028:            case 0x2028:
1455            case 0x2029:            case 0x2029:
1456            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            OK = TRUE;
1457              break;
1458    
1459              default:
1460              OK = FALSE;
1461              break;
1462              }
1463    
1464            if (OK == (d == OP_VSPACE))
1465              {
1466              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1467              {              {
1468              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1469              next_active_state--;              next_active_state--;
1470              }              }
1471            count++;            count++;
1472            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, 0);
1473              }
1474            }
1475          break;
1476    
1477          /*-----------------------------------------------------------------*/
1478          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1479          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1480          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1481          count = current_state->count;  /* Already matched */
1482          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1483          if (clen > 0)
1484            {
1485            BOOL OK;
1486            switch (c)
1487              {
1488              case 0x09:      /* HT */
1489              case 0x20:      /* SPACE */
1490              case 0xa0:      /* NBSP */
1491              case 0x1680:    /* OGHAM SPACE MARK */
1492              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1493              case 0x2000:    /* EN QUAD */
1494              case 0x2001:    /* EM QUAD */
1495              case 0x2002:    /* EN SPACE */
1496              case 0x2003:    /* EM SPACE */
1497              case 0x2004:    /* THREE-PER-EM SPACE */
1498              case 0x2005:    /* FOUR-PER-EM SPACE */
1499              case 0x2006:    /* SIX-PER-EM SPACE */
1500              case 0x2007:    /* FIGURE SPACE */
1501              case 0x2008:    /* PUNCTUATION SPACE */
1502              case 0x2009:    /* THIN SPACE */
1503              case 0x200A:    /* HAIR SPACE */
1504              case 0x202f:    /* NARROW NO-BREAK SPACE */
1505              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1506              case 0x3000:    /* IDEOGRAPHIC SPACE */
1507              OK = TRUE;
1508            break;            break;
1509    
1510            default:            default:
1511              OK = FALSE;
1512            break;            break;
1513            }            }
1514    
1515            if (OK == (d == OP_HSPACE))
1516              {
1517              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1518                {
1519                active_count--;           /* Remove non-match possibility */
1520                next_active_state--;
1521                }
1522              count++;
1523              ADD_NEW_DATA(-state_offset, count, 0);
1524              }
1525          }          }
1526        break;        break;
1527    
# Line 1108  for (;;) Line 1544  for (;;)
1544        if (clen > 0)        if (clen > 0)
1545          {          {
1546          BOOL OK;          BOOL OK;
1547          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1548          switch(code[2])          switch(code[2])
1549            {            {
1550            case PT_ANY:            case PT_ANY:
# Line 1116  for (;;) Line 1552  for (;;)
1552            break;            break;
1553    
1554            case PT_LAMP:            case PT_LAMP:
1555            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1556                prop->chartype == ucp_Lt;
1557            break;            break;
1558    
1559            case PT_GC:            case PT_GC:
1560            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1561            break;            break;
1562    
1563            case PT_PC:            case PT_PC:
1564            OK = chartype == code[3];            OK = prop->chartype == code[3];
1565            break;            break;
1566    
1567            case PT_SC:            case PT_SC:
1568            OK = script == code[3];            OK = prop->script == code[3];
1569              break;
1570    
1571              /* These are specials for combination cases. */
1572    
1573              case PT_ALNUM:
1574              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1575                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1576              break;
1577    
1578              case PT_SPACE:    /* Perl space */
1579              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1580                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1581              break;
1582    
1583              case PT_PXSPACE:  /* POSIX space */
1584              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1585                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1586                   c == CHAR_FF || c == CHAR_CR;
1587              break;
1588    
1589              case PT_WORD:
1590              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1591                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1592                   c == CHAR_UNDERSCORE;
1593            break;            break;
1594    
1595            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1166  for (;;) Line 1627  for (;;)
1627        QS2:        QS2:
1628    
1629        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1630        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1631          {          {
1632          const uschar *nptr = ptr + clen;          int lgb, rgb;
1633            const pcre_uchar *nptr = ptr + clen;
1634          int ncount = 0;          int ncount = 0;
1635          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1636              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
# Line 1176  for (;;) Line 1638  for (;;)
1638            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1639            next_active_state--;            next_active_state--;
1640            }            }
1641            lgb = UCD_GRAPHBREAK(c);
1642          while (nptr < end_subject)          while (nptr < end_subject)
1643            {            {
1644            int nd;            dlen = 1;
1645            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1646            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1647            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1648            ncount++;            ncount++;
1649            nptr += ndlen;            lgb = rgb;
1650              nptr += dlen;
1651            }            }
1652          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1653          }          }
# Line 1209  for (;;) Line 1673  for (;;)
1673          int ncount = 0;          int ncount = 0;
1674          switch (c)          switch (c)
1675            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1676            case 0x000b:            case 0x000b:
1677            case 0x000c:            case 0x000c:
1678            case 0x0085:            case 0x0085:
1679            case 0x2028:            case 0x2028:
1680            case 0x2029:            case 0x2029:
1681              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1682              goto ANYNL02;
1683    
1684              case 0x000d:
1685              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1686              /* Fall through */
1687    
1688              ANYNL02:
1689              case 0x000a:
1690            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1691                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1692              {              {
# Line 1226  for (;;) Line 1695  for (;;)
1695              }              }
1696            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1697            break;            break;
1698    
1699            default:            default:
1700            break;            break;
1701            }            }
# Line 1233  for (;;) Line 1703  for (;;)
1703        break;        break;
1704    
1705        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1706  #ifdef SUPPORT_UCP        case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1707        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1708        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1709        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        count = 2;
1710        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:        goto QS4;
1711        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)  
1712          { ADD_ACTIVE(state_offset + 6, 0); }        case OP_VSPACE_EXTRA + OP_TYPESTAR:
1713        count = current_state->count;  /* Number already matched */        case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1714          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1715          count = 0;
1716    
1717          QS4:
1718          ADD_ACTIVE(state_offset + 2, 0);
1719        if (clen > 0)        if (clen > 0)
1720          {          {
1721          BOOL OK;          BOOL OK;
1722          int category = _pcre_ucp_findprop(c, &chartype, &script);          switch (c)
         switch(code[4])  
1723            {            {
1724            case PT_ANY:            case 0x000a:
1725              case 0x000b:
1726              case 0x000c:
1727              case 0x000d:
1728              case 0x0085:
1729              case 0x2028:
1730              case 0x2029:
1731            OK = TRUE;            OK = TRUE;
1732            break;            break;
1733    
           case PT_LAMP:  
           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;  
           break;  
   
           case PT_GC:  
           OK = category == code[5];  
           break;  
   
           case PT_PC:  
           OK = chartype == code[5];  
           break;  
   
           case PT_SC:  
           OK = script == code[5];  
           break;  
   
           /* Should never occur, but keep compilers from grumbling. */  
   
1734            default:            default:
1735            OK = codevalue != OP_PROP;            OK = FALSE;
1736              break;
1737              }
1738            if (OK == (d == OP_VSPACE))
1739              {
1740              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1741                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1742                {
1743                active_count--;           /* Remove non-match possibility */
1744                next_active_state--;
1745                }
1746              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1747              }
1748            }
1749          break;
1750    
1751          /*-----------------------------------------------------------------*/
1752          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1753          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1754          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1755          count = 2;
1756          goto QS5;
1757    
1758          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1759          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1760          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1761          count = 0;
1762    
1763          QS5:
1764          ADD_ACTIVE(state_offset + 2, 0);
1765          if (clen > 0)
1766            {
1767            BOOL OK;
1768            switch (c)
1769              {
1770              case 0x09:      /* HT */
1771              case 0x20:      /* SPACE */
1772              case 0xa0:      /* NBSP */
1773              case 0x1680:    /* OGHAM SPACE MARK */
1774              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1775              case 0x2000:    /* EN QUAD */
1776              case 0x2001:    /* EM QUAD */
1777              case 0x2002:    /* EN SPACE */
1778              case 0x2003:    /* EM SPACE */
1779              case 0x2004:    /* THREE-PER-EM SPACE */
1780              case 0x2005:    /* FOUR-PER-EM SPACE */
1781              case 0x2006:    /* SIX-PER-EM SPACE */
1782              case 0x2007:    /* FIGURE SPACE */
1783              case 0x2008:    /* PUNCTUATION SPACE */
1784              case 0x2009:    /* THIN SPACE */
1785              case 0x200A:    /* HAIR SPACE */
1786              case 0x202f:    /* NARROW NO-BREAK SPACE */
1787              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1788              case 0x3000:    /* IDEOGRAPHIC SPACE */
1789              OK = TRUE;
1790              break;
1791    
1792              default:
1793              OK = FALSE;
1794              break;
1795              }
1796    
1797            if (OK == (d == OP_HSPACE))
1798              {
1799              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1800                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1801                {
1802                active_count--;           /* Remove non-match possibility */
1803                next_active_state--;
1804                }
1805              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1806              }
1807            }
1808          break;
1809    
1810          /*-----------------------------------------------------------------*/
1811    #ifdef SUPPORT_UCP
1812          case OP_PROP_EXTRA + OP_TYPEEXACT:
1813          case OP_PROP_EXTRA + OP_TYPEUPTO:
1814          case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1815          case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1816          if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1817            { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1818          count = current_state->count;  /* Number already matched */
1819          if (clen > 0)
1820            {
1821            BOOL OK;
1822            const ucd_record * prop = GET_UCD(c);
1823            switch(code[1 + IMM2_SIZE + 1])
1824              {
1825              case PT_ANY:
1826              OK = TRUE;
1827              break;
1828    
1829              case PT_LAMP:
1830              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1831                prop->chartype == ucp_Lt;
1832              break;
1833    
1834              case PT_GC:
1835              OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1836              break;
1837    
1838              case PT_PC:
1839              OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1840              break;
1841    
1842              case PT_SC:
1843              OK = prop->script == code[1 + IMM2_SIZE + 2];
1844              break;
1845    
1846              /* These are specials for combination cases. */
1847    
1848              case PT_ALNUM:
1849              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1850                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1851              break;
1852    
1853              case PT_SPACE:    /* Perl space */
1854              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1855                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1856              break;
1857    
1858              case PT_PXSPACE:  /* POSIX space */
1859              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1860                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1861                   c == CHAR_FF || c == CHAR_CR;
1862              break;
1863    
1864              case PT_WORD:
1865              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1866                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1867                   c == CHAR_UNDERSCORE;
1868              break;
1869    
1870              /* Should never occur, but keep compilers from grumbling. */
1871    
1872              default:
1873              OK = codevalue != OP_PROP;
1874            break;            break;
1875            }            }
1876    
# Line 1282  for (;;) Line 1882  for (;;)
1882              next_active_state--;              next_active_state--;
1883              }              }
1884            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1885              { ADD_NEW(state_offset + 6, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1886            else            else
1887              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1888            }            }
# Line 1295  for (;;) Line 1895  for (;;)
1895        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1896        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1897        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1898          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1899        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1900        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1901          {          {
1902          const uschar *nptr = ptr + clen;          int lgb, rgb;
1903            const pcre_uchar *nptr = ptr + clen;
1904          int ncount = 0;          int ncount = 0;
1905          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1906            {            {
1907            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1908            next_active_state--;            next_active_state--;
1909            }            }
1910            lgb = UCD_GRAPHBREAK(c);
1911          while (nptr < end_subject)          while (nptr < end_subject)
1912            {            {
1913            int nd;            dlen = 1;
1914            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1915            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1916            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1917            ncount++;            ncount++;
1918            nptr += ndlen;            lgb = rgb;
1919              nptr += dlen;
1920            }            }
1921            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1922                reset_could_continue = TRUE;
1923          if (++count >= GET2(code, 1))          if (++count >= GET2(code, 1))
1924            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1925          else          else
1926            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1927          }          }
# Line 1329  for (;;) Line 1934  for (;;)
1934        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1935        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1936        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1937          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1938        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1939        if (clen > 0)        if (clen > 0)
1940          {          {
1941          int ncount = 0;          int ncount = 0;
1942          switch (c)          switch (c)
1943            {            {
1944              case 0x000b:
1945              case 0x000c:
1946              case 0x0085:
1947              case 0x2028:
1948              case 0x2029:
1949              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1950              goto ANYNL03;
1951    
1952            case 0x000d:            case 0x000d:
1953            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1954            /* Fall through */            /* Fall through */
1955    
1956              ANYNL03:
1957              case 0x000a:
1958              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1959                {
1960                active_count--;           /* Remove non-match possibility */
1961                next_active_state--;
1962                }
1963              if (++count >= GET2(code, 1))
1964                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1965              else
1966                { ADD_NEW_DATA(-state_offset, count, ncount); }
1967              break;
1968    
1969              default:
1970              break;
1971              }
1972            }
1973          break;
1974    
1975          /*-----------------------------------------------------------------*/
1976          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1977          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1978          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1979          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1980          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1981            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1982          count = current_state->count;  /* Number already matched */
1983          if (clen > 0)
1984            {
1985            BOOL OK;
1986            switch (c)
1987              {
1988            case 0x000a:            case 0x000a:
1989            case 0x000b:            case 0x000b:
1990            case 0x000c:            case 0x000c:
1991              case 0x000d:
1992            case 0x0085:            case 0x0085:
1993            case 0x2028:            case 0x2028:
1994            case 0x2029:            case 0x2029:
1995            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            OK = TRUE;
1996              break;
1997    
1998              default:
1999              OK = FALSE;
2000              }
2001    
2002            if (OK == (d == OP_VSPACE))
2003              {
2004              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2005              {              {
2006              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
2007              next_active_state--;              next_active_state--;
2008              }              }
2009            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2010              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2011            else            else
2012              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, 0); }
2013              }
2014            }
2015          break;
2016    
2017          /*-----------------------------------------------------------------*/
2018          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2019          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2020          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2021          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2022          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2023            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2024          count = current_state->count;  /* Number already matched */
2025          if (clen > 0)
2026            {
2027            BOOL OK;
2028            switch (c)
2029              {
2030              case 0x09:      /* HT */
2031              case 0x20:      /* SPACE */
2032              case 0xa0:      /* NBSP */
2033              case 0x1680:    /* OGHAM SPACE MARK */
2034              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2035              case 0x2000:    /* EN QUAD */
2036              case 0x2001:    /* EM QUAD */
2037              case 0x2002:    /* EN SPACE */
2038              case 0x2003:    /* EM SPACE */
2039              case 0x2004:    /* THREE-PER-EM SPACE */
2040              case 0x2005:    /* FOUR-PER-EM SPACE */
2041              case 0x2006:    /* SIX-PER-EM SPACE */
2042              case 0x2007:    /* FIGURE SPACE */
2043              case 0x2008:    /* PUNCTUATION SPACE */
2044              case 0x2009:    /* THIN SPACE */
2045              case 0x200A:    /* HAIR SPACE */
2046              case 0x202f:    /* NARROW NO-BREAK SPACE */
2047              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2048              case 0x3000:    /* IDEOGRAPHIC SPACE */
2049              OK = TRUE;
2050            break;            break;
2051    
2052            default:            default:
2053              OK = FALSE;
2054            break;            break;
2055            }            }
2056    
2057            if (OK == (d == OP_HSPACE))
2058              {
2059              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2060                {
2061                active_count--;           /* Remove non-match possibility */
2062                next_active_state--;
2063                }
2064              if (++count >= GET2(code, 1))
2065                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2066              else
2067                { ADD_NEW_DATA(-state_offset, count, 0); }
2068              }
2069          }          }
2070        break;        break;
2071    
# Line 1373  for (;;) Line 2081  for (;;)
2081        break;        break;
2082    
2083        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2084        case OP_CHARNC:        case OP_CHARI:
2085        if (clen == 0) break;        if (clen == 0) break;
2086    
2087  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2088        if (utf8)        if (utf)
2089          {          {
2090          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2091            {            {
2092            unsigned int othercase;            unsigned int othercase;
2093            if (c < 128) othercase = fcc[c]; else            if (c < 128)
2094                othercase = fcc[c];
2095            /* If we have Unicode property support, we can use it to test the            else
2096            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2097                other case of the character. */
2098  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2099            othercase = _pcre_ucp_othercase(c);              othercase = UCD_OTHERCASE(c);
2100  #else  #else
2101            othercase = NOTACHAR;              othercase = NOTACHAR;
2102  #endif  #endif
2103    
2104            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2105            }            }
2106          }          }
2107        else        else
2108  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2109          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2110          {          {
2111          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2112              { ADD_NEW(state_offset + 2, 0); }
2113          }          }
2114        break;        break;
2115    
# Line 1413  for (;;) Line 2121  for (;;)
2121        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2122    
2123        case OP_EXTUNI:        case OP_EXTUNI:
2124        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
2125          {          {
2126          const uschar *nptr = ptr + clen;          int lgb, rgb;
2127            const pcre_uchar *nptr = ptr + clen;
2128          int ncount = 0;          int ncount = 0;
2129            lgb = UCD_GRAPHBREAK(c);
2130          while (nptr < end_subject)          while (nptr < end_subject)
2131            {            {
2132            int nclen = 1;            dlen = 1;
2133            GETCHARLEN(c, nptr, nclen);            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2134            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            rgb = UCD_GRAPHBREAK(d);
2135              if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2136            ncount++;            ncount++;
2137            nptr += nclen;            lgb = rgb;
2138              nptr += dlen;
2139            }            }
2140            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2141                reset_could_continue = TRUE;
2142          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2143          }          }
2144        break;        break;
# Line 1438  for (;;) Line 2152  for (;;)
2152        case OP_ANYNL:        case OP_ANYNL:
2153        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2154          {          {
         case 0x000a:  
2155          case 0x000b:          case 0x000b:
2156          case 0x000c:          case 0x000c:
2157          case 0x0085:          case 0x0085:
2158          case 0x2028:          case 0x2028:
2159          case 0x2029:          case 0x2029:
2160            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2161    
2162            case 0x000a:
2163          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2164          break;          break;
2165    
2166          case 0x000d:          case 0x000d:
2167          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 >= end_subject)
2168              {
2169              ADD_NEW(state_offset + 1, 0);
2170              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2171                reset_could_continue = TRUE;
2172              }
2173            else if (ptr[1] == 0x0a)
2174            {            {
2175            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2176            }            }
# Line 1460  for (;;) Line 2183  for (;;)
2183        break;        break;
2184    
2185        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2186        /* Match a negated single character. This is only used for one-byte        case OP_NOT_VSPACE:
2187        characters, that is, we know that d < 256. The character we are        if (clen > 0) switch(c)
2188        checking (c) can be multibyte. */          {
2189            case 0x000a:
2190            case 0x000b:
2191            case 0x000c:
2192            case 0x000d:
2193            case 0x0085:
2194            case 0x2028:
2195            case 0x2029:
2196            break;
2197    
2198            default:
2199            ADD_NEW(state_offset + 1, 0);
2200            break;
2201            }
2202          break;
2203    
2204          /*-----------------------------------------------------------------*/
2205          case OP_VSPACE:
2206          if (clen > 0) switch(c)
2207            {
2208            case 0x000a:
2209            case 0x000b:
2210            case 0x000c:
2211            case 0x000d:
2212            case 0x0085:
2213            case 0x2028:
2214            case 0x2029:
2215            ADD_NEW(state_offset + 1, 0);
2216            break;
2217    
2218            default: break;
2219            }
2220          break;
2221    
2222          /*-----------------------------------------------------------------*/
2223          case OP_NOT_HSPACE:
2224          if (clen > 0) switch(c)
2225            {
2226            case 0x09:      /* HT */
2227            case 0x20:      /* SPACE */
2228            case 0xa0:      /* NBSP */
2229            case 0x1680:    /* OGHAM SPACE MARK */
2230            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2231            case 0x2000:    /* EN QUAD */
2232            case 0x2001:    /* EM QUAD */
2233            case 0x2002:    /* EN SPACE */
2234            case 0x2003:    /* EM SPACE */
2235            case 0x2004:    /* THREE-PER-EM SPACE */
2236            case 0x2005:    /* FOUR-PER-EM SPACE */
2237            case 0x2006:    /* SIX-PER-EM SPACE */
2238            case 0x2007:    /* FIGURE SPACE */
2239            case 0x2008:    /* PUNCTUATION SPACE */
2240            case 0x2009:    /* THIN SPACE */
2241            case 0x200A:    /* HAIR SPACE */
2242            case 0x202f:    /* NARROW NO-BREAK SPACE */
2243            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2244            case 0x3000:    /* IDEOGRAPHIC SPACE */
2245            break;
2246    
2247            default:
2248            ADD_NEW(state_offset + 1, 0);
2249            break;
2250            }
2251          break;
2252    
2253          /*-----------------------------------------------------------------*/
2254          case OP_HSPACE:
2255          if (clen > 0) switch(c)
2256            {
2257            case 0x09:      /* HT */
2258            case 0x20:      /* SPACE */
2259            case 0xa0:      /* NBSP */
2260            case 0x1680:    /* OGHAM SPACE MARK */
2261            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2262            case 0x2000:    /* EN QUAD */
2263            case 0x2001:    /* EM QUAD */
2264            case 0x2002:    /* EN SPACE */
2265            case 0x2003:    /* EM SPACE */
2266            case 0x2004:    /* THREE-PER-EM SPACE */
2267            case 0x2005:    /* FOUR-PER-EM SPACE */
2268            case 0x2006:    /* SIX-PER-EM SPACE */
2269            case 0x2007:    /* FIGURE SPACE */
2270            case 0x2008:    /* PUNCTUATION SPACE */
2271            case 0x2009:    /* THIN SPACE */
2272            case 0x200A:    /* HAIR SPACE */
2273            case 0x202f:    /* NARROW NO-BREAK SPACE */
2274            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2275            case 0x3000:    /* IDEOGRAPHIC SPACE */
2276            ADD_NEW(state_offset + 1, 0);
2277            break;
2278            }
2279          break;
2280    
2281          /*-----------------------------------------------------------------*/
2282          /* Match a negated single character casefully. */
2283    
2284        case OP_NOT:        case OP_NOT:
2285          if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2286          break;
2287    
2288          /*-----------------------------------------------------------------*/
2289          /* Match a negated single character caselessly. */
2290    
2291          case OP_NOTI:
2292        if (clen > 0)        if (clen > 0)
2293          {          {
2294          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;          unsigned int otherd;
2295          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }  #ifdef SUPPORT_UTF
2296            if (utf && d >= 128)
2297              {
2298    #ifdef SUPPORT_UCP
2299              otherd = UCD_OTHERCASE(d);
2300    #endif  /* SUPPORT_UCP */
2301              }
2302            else
2303    #endif  /* SUPPORT_UTF */
2304            otherd = TABLE_GET(d, fcc, d);
2305            if (c != d && c != otherd)
2306              { ADD_NEW(state_offset + dlen + 1, 0); }
2307          }          }
2308        break;        break;
2309    
2310        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2311          case OP_PLUSI:
2312          case OP_MINPLUSI:
2313          case OP_POSPLUSI:
2314          case OP_NOTPLUSI:
2315          case OP_NOTMINPLUSI:
2316          case OP_NOTPOSPLUSI:
2317          caseless = TRUE;
2318          codevalue -= OP_STARI - OP_STAR;
2319    
2320          /* Fall through */
2321        case OP_PLUS:        case OP_PLUS:
2322        case OP_MINPLUS:        case OP_MINPLUS:
2323        case OP_POSPLUS:        case OP_POSPLUS:
# Line 1484  for (;;) Line 2329  for (;;)
2329        if (clen > 0)        if (clen > 0)
2330          {          {
2331          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2332          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2333            {            {
2334  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2335            if (utf8 && d >= 128)            if (utf && d >= 128)
2336              {              {
2337  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2338              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2339  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2340              }              }
2341            else            else
2342  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2343            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2344            }            }
2345          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2346            {            {
# Line 1512  for (;;) Line 2357  for (;;)
2357        break;        break;
2358    
2359        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2360          case OP_QUERYI:
2361          case OP_MINQUERYI:
2362          case OP_POSQUERYI:
2363          case OP_NOTQUERYI:
2364          case OP_NOTMINQUERYI:
2365          case OP_NOTPOSQUERYI:
2366          caseless = TRUE;
2367          codevalue -= OP_STARI - OP_STAR;
2368          /* Fall through */
2369        case OP_QUERY:        case OP_QUERY:
2370        case OP_MINQUERY:        case OP_MINQUERY:
2371        case OP_POSQUERY:        case OP_POSQUERY:
# Line 1522  for (;;) Line 2376  for (;;)
2376        if (clen > 0)        if (clen > 0)
2377          {          {
2378          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2379          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2380            {            {
2381  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2382            if (utf8 && d >= 128)            if (utf && d >= 128)
2383              {              {
2384  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2385              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2386  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2387              }              }
2388            else            else
2389  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2390            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2391            }            }
2392          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2393            {            {
# Line 1548  for (;;) Line 2402  for (;;)
2402        break;        break;
2403    
2404        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2405          case OP_STARI:
2406          case OP_MINSTARI:
2407          case OP_POSSTARI:
2408          case OP_NOTSTARI:
2409          case OP_NOTMINSTARI:
2410          case OP_NOTPOSSTARI:
2411          caseless = TRUE;
2412          codevalue -= OP_STARI - OP_STAR;
2413          /* Fall through */
2414        case OP_STAR:        case OP_STAR:
2415        case OP_MINSTAR:        case OP_MINSTAR:
2416        case OP_POSSTAR:        case OP_POSSTAR:
# Line 1558  for (;;) Line 2421  for (;;)
2421        if (clen > 0)        if (clen > 0)
2422          {          {
2423          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2424          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2425            {            {
2426  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2427            if (utf8 && d >= 128)            if (utf && d >= 128)
2428              {              {
2429  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2430              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2431  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2432              }              }
2433            else            else
2434  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2435            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2436            }            }
2437          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2438            {            {
# Line 1584  for (;;) Line 2447  for (;;)
2447        break;        break;
2448    
2449        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2450          case OP_EXACTI:
2451          case OP_NOTEXACTI:
2452          caseless = TRUE;
2453          codevalue -= OP_STARI - OP_STAR;
2454          /* Fall through */
2455        case OP_EXACT:        case OP_EXACT:
2456        case OP_NOTEXACT:        case OP_NOTEXACT:
2457        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2458        if (clen > 0)        if (clen > 0)
2459          {          {
2460          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2461          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2462            {            {
2463  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2464            if (utf8 && d >= 128)            if (utf && d >= 128)
2465              {              {
2466  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2467              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2468  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2469              }              }
2470            else            else
2471  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2472            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2473            }            }
2474          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2475            {            {
2476            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2477              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2478            else            else
2479              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2480            }            }
# Line 1614  for (;;) Line 2482  for (;;)
2482        break;        break;
2483    
2484        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2485          case OP_UPTOI:
2486          case OP_MINUPTOI:
2487          case OP_POSUPTOI:
2488          case OP_NOTUPTOI:
2489          case OP_NOTMINUPTOI:
2490          case OP_NOTPOSUPTOI:
2491          caseless = TRUE;
2492          codevalue -= OP_STARI - OP_STAR;
2493          /* Fall through */
2494        case OP_UPTO:        case OP_UPTO:
2495        case OP_MINUPTO:        case OP_MINUPTO:
2496        case OP_POSUPTO:        case OP_POSUPTO:
2497        case OP_NOTUPTO:        case OP_NOTUPTO:
2498        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2499        case OP_NOTPOSUPTO:        case OP_NOTPOSUPTO:
2500        ADD_ACTIVE(state_offset + dlen + 3, 0);        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2501        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2502        if (clen > 0)        if (clen > 0)
2503          {          {
2504          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2505          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2506            {            {
2507  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2508            if (utf8 && d >= 128)            if (utf && d >= 128)
2509              {              {
2510  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2511              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2512  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2513              }              }
2514            else            else
2515  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2516            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2517            }            }
2518          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2519            {            {
# Line 1646  for (;;) Line 2523  for (;;)
2523              next_active_state--;              next_active_state--;
2524              }              }
2525            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2526              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2527            else            else
2528              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2529            }            }
# Line 1663  for (;;) Line 2540  for (;;)
2540          {          {
2541          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2542          int next_state_offset;          int next_state_offset;
2543          const uschar *ecode;          const pcre_uchar *ecode;
2544    
2545          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2546          can set isinclass from it. */          can set isinclass from it. */
2547    
2548          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2549            {            {
2550            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2551            if (clen > 0)            if (clen > 0)
2552              {              {
2553              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2554                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2555              }              }
2556            }            }
2557    
# Line 1685  for (;;) Line 2562  for (;;)
2562          else          else
2563           {           {
2564           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2565           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2566           }           }
2567    
2568          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2569          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2570          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2571    
2572          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2573    
2574          switch (*ecode)          switch (*ecode)
2575            {            {
# Line 1719  for (;;) Line 2596  for (;;)
2596            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2597            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2598            if (count >= GET2(ecode, 1))            if (count >= GET2(ecode, 1))
2599              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2600            if (isinclass)            if (isinclass)
2601              {              {
2602              int max = GET2(ecode, 3);              int max = GET2(ecode, 1 + IMM2_SIZE);
2603              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2604                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2605              else              else
2606                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2607              }              }
# Line 1739  for (;;) Line 2616  for (;;)
2616    
2617  /* ========================================================================== */  /* ========================================================================== */
2618        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2619        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2620          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2621          though the other "backtracking verbs" are not supported. */
2622    
2623          case OP_FAIL:
2624          forced_fail++;    /* Count FAILs for multiple states */
2625          break;
2626    
2627        case OP_ASSERT:        case OP_ASSERT:
2628        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1749  for (;;) Line 2632  for (;;)
2632          int rc;          int rc;
2633          int local_offsets[2];          int local_offsets[2];
2634          int local_workspace[1000];          int local_workspace[1000];
2635          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2636    
2637          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2638    
# Line 1757  for (;;) Line 2640  for (;;)
2640            md,                                   /* static match data */            md,                                   /* static match data */
2641            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2642            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2643            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2644            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2645            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2646            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2647            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2648            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2649    
2650            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2651          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2652              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2653          }          }
2654        break;        break;
2655    
# Line 1777  for (;;) Line 2659  for (;;)
2659          {          {
2660          int local_offsets[1000];          int local_offsets[1000];
2661          int local_workspace[1000];          int local_workspace[1000];
2662          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2663            int condcode;
2664    
2665            /* Because of the way auto-callout works during compile, a callout item
2666            is inserted between OP_COND and an assertion condition. This does not
2667            happen for the other conditions. */
2668    
2669            if (code[LINK_SIZE+1] == OP_CALLOUT)
2670              {
2671              rrc = 0;
2672              if (PUBL(callout) != NULL)
2673                {
2674                PUBL(callout_block) cb;
2675                cb.version          = 1;   /* Version 1 of the callout block */
2676                cb.callout_number   = code[LINK_SIZE+2];
2677                cb.offset_vector    = offsets;
2678    #ifdef COMPILE_PCRE8
2679                cb.subject          = (PCRE_SPTR)start_subject;
2680    #else
2681                cb.subject          = (PCRE_SPTR16)start_subject;
2682    #endif
2683                cb.subject_length   = (int)(end_subject - start_subject);
2684                cb.start_match      = (int)(current_subject - start_subject);
2685                cb.current_position = (int)(ptr - start_subject);
2686                cb.pattern_position = GET(code, LINK_SIZE + 3);
2687                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2688                cb.capture_top      = 1;
2689                cb.capture_last     = -1;
2690                cb.callout_data     = md->callout_data;
2691                cb.mark             = NULL;   /* No (*MARK) support */
2692                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2693                }
2694              if (rrc > 0) break;                      /* Fail this thread */
2695              code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2696              }
2697    
2698            condcode = code[LINK_SIZE+1];
2699    
2700          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2701    
2702          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2703              return PCRE_ERROR_DFA_UCOND;
2704    
2705          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2706    
2707          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2708            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2709    
2710          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2711          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2712          recursed groups. */          recursed groups. */
2713    
2714          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2715            {            {
2716            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE + 2);
2717            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2718            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (md->recursive != NULL)
2719              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2720              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2721            }            }
2722    
2723          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1807  for (;;) Line 2725  for (;;)
2725          else          else
2726            {            {
2727            int rc;            int rc;
2728            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2729            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2730    
2731            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2732    
# Line 1816  for (;;) Line 2734  for (;;)
2734              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2735              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2736              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2737              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2738              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2739              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2740              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2741              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2742              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2743    
2744              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2745            if ((rc >= 0) ==            if ((rc >= 0) ==
2746                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2747              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2748            else            else
2749              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2750            }            }
2751          }          }
2752        break;        break;
# Line 1837  for (;;) Line 2754  for (;;)
2754        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2755        case OP_RECURSE:        case OP_RECURSE:
2756          {          {
2757            dfa_recursion_info *ri;
2758          int local_offsets[1000];          int local_offsets[1000];
2759          int local_workspace[1000];          int local_workspace[1000];
2760            const pcre_uchar *callpat = start_code + GET(code, 1);
2761            int recno = (callpat == md->start_code)? 0 :
2762              GET2(callpat, 1 + LINK_SIZE);
2763          int rc;          int rc;
2764    
2765          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2766            recursing + 1));  
2767            /* Check for repeating a recursion without advancing the subject
2768            pointer. This should catch convoluted mutual recursions. (Some simple
2769            cases are caught at compile time.) */
2770    
2771            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2772              if (recno == ri->group_num && ptr == ri->subject_position)
2773                return PCRE_ERROR_RECURSELOOP;
2774    
2775            /* Remember this recursion and where we started it so as to
2776            catch infinite loops. */
2777    
2778            new_recursive.group_num = recno;
2779            new_recursive.subject_position = ptr;
2780            new_recursive.prevrec = md->recursive;
2781            md->recursive = &new_recursive;
2782    
2783          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2784            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2785            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2786            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2787            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2788            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2789            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2790            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2791            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2792            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing + 1);                       /* regex recurse level */  
2793    
2794          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2795            recursing + 1, rc));  
2796            DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2797              rc));
2798    
2799          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2800    
# Line 1872  for (;;) Line 2808  for (;;)
2808            {            {
2809            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2810              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2811              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2812              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #ifdef SUPPORT_UTF
2813                if (utf)
2814                  {
2815                  const pcre_uchar *p = start_subject + local_offsets[rc];
2816                  const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2817                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2818                  }
2819    #endif
2820              if (charcount > 0)              if (charcount > 0)
2821                {                {
2822                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 1891  for (;;) Line 2832  for (;;)
2832        break;        break;
2833    
2834        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2835          case OP_BRAPOS:
2836          case OP_SBRAPOS:
2837          case OP_CBRAPOS:
2838          case OP_SCBRAPOS:
2839          case OP_BRAPOSZERO:
2840            {
2841            int charcount, matched_count;
2842            const pcre_uchar *local_ptr = ptr;
2843            BOOL allow_zero;
2844    
2845            if (codevalue == OP_BRAPOSZERO)
2846              {
2847              allow_zero = TRUE;
2848              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2849              }
2850            else allow_zero = FALSE;
2851    
2852            /* Loop to match the subpattern as many times as possible as if it were
2853            a complete pattern. */
2854    
2855            for (matched_count = 0;; matched_count++)
2856              {
2857              int local_offsets[2];
2858              int local_workspace[1000];
2859    
2860              int rc = internal_dfa_exec(
2861                md,                                   /* fixed match data */
2862                code,                                 /* this subexpression's code */
2863                local_ptr,                            /* where we currently are */
2864                (int)(ptr - start_subject),           /* start offset */
2865                local_offsets,                        /* offset vector */
2866                sizeof(local_offsets)/sizeof(int),    /* size of same */
2867                local_workspace,                      /* workspace vector */
2868                sizeof(local_workspace)/sizeof(int),  /* size of same */
2869                rlevel);                              /* function recursion level */
2870    
2871              /* Failed to match */
2872    
2873              if (rc < 0)
2874                {
2875                if (rc != PCRE_ERROR_NOMATCH) return rc;
2876                break;
2877                }
2878    
2879              /* Matched: break the loop if zero characters matched. */
2880    
2881              charcount = local_offsets[1] - local_offsets[0];
2882              if (charcount == 0) break;
2883              local_ptr += charcount;    /* Advance temporary position ptr */
2884              }
2885    
2886            /* At this point we have matched the subpattern matched_count
2887            times, and local_ptr is pointing to the character after the end of the
2888            last match. */
2889    
2890            if (matched_count > 0 || allow_zero)
2891              {
2892              const pcre_uchar *end_subpattern = code;
2893              int next_state_offset;
2894    
2895              do { end_subpattern += GET(end_subpattern, 1); }
2896                while (*end_subpattern == OP_ALT);
2897              next_state_offset =
2898                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2899    
2900              /* Optimization: if there are no more active states, and there
2901              are no new states yet set up, then skip over the subject string
2902              right here, to save looping. Otherwise, set up the new state to swing
2903              into action when the end of the matched substring is reached. */
2904    
2905              if (i + 1 >= active_count && new_count == 0)
2906                {
2907                ptr = local_ptr;
2908                clen = 0;
2909                ADD_NEW(next_state_offset, 0);
2910                }
2911              else
2912                {
2913                const pcre_uchar *p = ptr;
2914                const pcre_uchar *pp = local_ptr;
2915                charcount = (int)(pp - p);
2916    #ifdef SUPPORT_UTF
2917                if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2918    #endif
2919                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2920                }
2921              }
2922            }
2923          break;
2924    
2925          /*-----------------------------------------------------------------*/
2926        case OP_ONCE:        case OP_ONCE:
2927          case OP_ONCE_NC:
2928          {          {
2929          int local_offsets[2];          int local_offsets[2];
2930          int local_workspace[1000];          int local_workspace[1000];
# Line 1900  for (;;) Line 2933  for (;;)
2933            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2934            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2935            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2936            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2937            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2938            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2939            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2940            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2941            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2942    
2943          if (rc >= 0)          if (rc >= 0)
2944            {            {
2945            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2946            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2947            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2948    
2949            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2950              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2951            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2952                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2953    
2954            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2955            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 1925  for (;;) Line 2957  for (;;)
2957    
2958            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2959                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2960              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2961    
2962            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2963            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 1940  for (;;) Line 2972  for (;;)
2972            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2973            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2974            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2975            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2976    
2977            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2978              {              {
# Line 1963  for (;;) Line 2995  for (;;)
2995              }              }
2996            else            else
2997              {              {
2998              const uschar *p = start_subject + local_offsets[0];  #ifdef SUPPORT_UTF
2999              const uschar *pp = start_subject + local_offsets[1];              if (utf)
3000              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;                {
3001                  const pcre_uchar *p = start_subject + local_offsets[0];
3002                  const pcre_uchar *pp = start_subject + local_offsets[1];
3003                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
3004                  }
3005    #endif
3006              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
3007              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
3008                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
3009              }              }
   
3010            }            }
3011          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
3012          }          }
# Line 1981  for (;;) Line 3017  for (;;)
3017        /* Handle callouts */        /* Handle callouts */
3018    
3019        case OP_CALLOUT:        case OP_CALLOUT:
3020        if (pcre_callout != NULL)        rrc = 0;
3021          if (PUBL(callout) != NULL)
3022          {          {
3023          int rrc;          PUBL(callout_block) cb;
         pcre_callout_block cb;  
3024          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
3025          cb.callout_number   = code[1];          cb.callout_number   = code[1];
3026          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
3027    #ifdef COMPILE_PCRE8
3028          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
3029          cb.subject_length   = end_subject - start_subject;  #else
3030          cb.start_match      = current_subject - start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
3031          cb.current_position = ptr - start_subject;  #endif
3032            cb.subject_length   = (int)(end_subject - start_subject);
3033            cb.start_match      = (int)(current_subject - start_subject);
3034            cb.current_position = (int)(ptr - start_subject);
3035          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
3036          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
3037          cb.capture_top      = 1;          cb.capture_top      = 1;
3038          cb.capture_last     = -1;          cb.capture_last     = -1;
3039          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
3040          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
3041          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3042          }          }
3043          if (rrc == 0)
3044            { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3045        break;        break;
3046    
3047    
# Line 2015  for (;;) Line 3057  for (;;)
3057    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
3058    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
3059    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
3060    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
3061    
3062      The "forced_fail" variable counts the number of (*F) encountered for the
3063      character. If it is equal to the original active_count (saved in
3064      workspace[1]) it means that (*F) was found on every active state. In this
3065      case we don't want to give a partial match.
3066    
3067      The "could_continue" variable is true if a state could have continued but
3068      for the fact that the end of the subject was reached. */
3069    
3070    if (new_count <= 0)    if (new_count <= 0)
3071      {      {
3072      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
3073          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on, and */
3074          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
3075          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
3076          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3077            ||                                           /* or... */
3078            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3079             match_count < 0)                            /* no matches */
3080            ) &&                                         /* And... */
3081            (
3082            partial_newline ||                           /* Either partial NL */
3083              (                                          /* or ... */
3084              ptr >= end_subject &&                /* End of subject and */
3085              ptr > md->start_used_ptr)            /* Inspected non-empty string */
3086              )
3087            )
3088        {        {
3089        if (offsetcount >= 2)        if (offsetcount >= 2)
3090          {          {
3091          offsets[0] = current_subject - start_subject;          offsets[0] = (int)(md->start_used_ptr - start_subject);
3092          offsets[1] = end_subject - start_subject;          offsets[1] = (int)(end_subject - start_subject);
3093          }          }
3094        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
3095        }        }
# Line 2082  Returns:          > 0 => number of match Line 3143  Returns:          > 0 => number of match
3143                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3144  */  */
3145    
3146  PCRE_EXP_DEFN int  #ifdef COMPILE_PCRE8
3147    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3148  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3149    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3150    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3151    #else
3152    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3153    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3154      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3155      int offsetcount, int *workspace, int wscount)
3156    #endif
3157  {  {
3158  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3159  dfa_match_data match_block;  dfa_match_data match_block;
3160  dfa_match_data *md = &match_block;  dfa_match_data *md = &match_block;
3161  BOOL utf8, anchored, startline, firstline;  BOOL utf, anchored, startline, firstline;
3162  const uschar *current_subject, *end_subject, *lcc;  const pcre_uchar *current_subject, *end_subject;
   
 pcre_study_data internal_study;  
3163  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3164    
3165  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3166  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3167  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3168  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3169  int first_byte = -1;  pcre_uchar first_char = 0;
3170  int req_byte = -1;  pcre_uchar first_char2 = 0;
3171  int req_byte2 = -1;  pcre_uchar req_char = 0;
3172    pcre_uchar req_char2 = 0;
3173  int newline;  int newline;
3174    
3175  /* Plausibility checks */  /* Plausibility checks */
# Line 2113  if (re == NULL || subject == NULL || wor Line 3179  if (re == NULL || subject == NULL || wor
3179     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3180  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3181  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3182    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3183    
3184    /* Check that the first field in the block is the magic number. If it is not,
3185    return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3186    REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3187    means that the pattern is likely compiled with different endianness. */
3188    
3189    if (re->magic_number != MAGIC_NUMBER)
3190      return re->magic_number == REVERSED_MAGIC_NUMBER?
3191        PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3192    if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3193    
3194    /* If restarting after a partial match, do some sanity checks on the contents
3195    of the workspace. */
3196    
3197    if ((options & PCRE_DFA_RESTART) != 0)
3198      {
3199      if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3200        workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3201          return PCRE_ERROR_DFA_BADRESTART;
3202      }
3203    
3204  /* We need to find the pointer to any study data before we test for byte  /* Set up study, callout, and table data */
 flipping, so we scan the extra_data block first. This may set two fields in the  
 match block, so we must initialize them beforehand. However, the other fields  
 in the match block must not be set until after the byte flipping. */  
3205    
3206  md->tables = re->tables;  md->tables = re->tables;
3207  md->callout_data = NULL;  md->callout_data = NULL;
# Line 2136  if (extra_data != NULL) Line 3220  if (extra_data != NULL)
3220      md->tables = extra_data->tables;      md->tables = extra_data->tables;
3221    }    }
3222    
 /* Check that the first field in the block is the magic number. If it is not,  
 test for a regex that was compiled on a host of opposite endianness. If this is  
 the case, flipped values are put in internal_re and internal_study if there was  
 study data too. */  
   
 if (re->magic_number != MAGIC_NUMBER)  
   {  
   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);  
   if (re == NULL) return PCRE_ERROR_BADMAGIC;  
   if (study != NULL) study = &internal_study;  
   }  
   
3223  /* Set some local values */  /* Set some local values */
3224    
3225  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3226  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3227  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3228    
3229  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3230  utf8 = (re->options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3231    utf = (re->options & PCRE_UTF8) != 0;
3232  #else  #else
3233  utf8 = FALSE;  utf = FALSE;
3234  #endif  #endif
3235    
3236  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
# Line 2165  anchored = (options & (PCRE_ANCHORED|PCR Line 3238  anchored = (options & (PCRE_ANCHORED|PCR
3238    
3239  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3240    
3241  md->start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3242      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3243  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3244  md->end_subject = end_subject;  md->end_subject = end_subject;
3245    md->start_offset = start_offset;
3246  md->moptions = options;  md->moptions = options;
3247  md->poptions = re->options;  md->poptions = re->options;
3248    
3249    /* If the BSR option is not set at match time, copy what was set
3250    at compile time. */
3251    
3252    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3253      {
3254      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3255        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3256    #ifdef BSR_ANYCRLF
3257      else md->moptions |= PCRE_BSR_ANYCRLF;
3258    #endif
3259      }
3260    
3261  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
3262  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
3263    
# Line 2179  switch ((((options & PCRE_NEWLINE_BITS) Line 3265  switch ((((options & PCRE_NEWLINE_BITS)
3265           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
3266    {    {
3267    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
3268    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3269    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3270    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
3271         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3272    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
3273    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3274    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2215  else Line 3301  else
3301  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3302  back the character offset. */  back the character offset. */
3303    
3304  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3305  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3306    {    {
3307    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3308      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3309    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3310      {      {
3311      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3312        {        {
3313        tb &= 0xc0;        offsets[0] = erroroffset;
3314        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3315        }        }
3316        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3317          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3318      }      }
3319      if (start_offset > 0 && start_offset < length &&
3320            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3321        return PCRE_ERROR_BADUTF8_OFFSET;
3322    }    }
3323  #endif  #endif
3324    
# Line 2236  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3326  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3326  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3327  in other programs later. */  in other programs later. */
3328    
3329  if (md->tables == NULL) md->tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3330    
3331  /* The lower casing table and the "must be at the start of a line" flag are  /* The "must be at the start of a line" flags are used in a loop when finding
3332  used in a loop when finding where to start. */  where to start. */
3333    
3334  lcc = md->tables + lcc_offset;  startline = (re->flags & PCRE_STARTLINE) != 0;
 startline = (re->options & PCRE_STARTLINE) != 0;  
3335  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3336    
3337  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2253  studied, there may be a bitmap of possib Line 3342  studied, there may be a bitmap of possib
3342    
3343  if (!anchored)  if (!anchored)
3344    {    {
3345    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3346      {      {
3347      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3348      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = (pcre_uchar)(re->first_char);
3349        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3350          {
3351          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3352    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3353          if (utf && first_char > 127)
3354            first_char2 = UCD_OTHERCASE(first_char);
3355    #endif
3356          }
3357      }      }
3358    else    else
3359      {      {
3360      if (startline && study != NULL &&      if (!startline && study != NULL &&
3361           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3362        start_bits = study->start_bits;        start_bits = study->start_bits;
3363      }      }
3364    }    }
# Line 2270  if (!anchored) Line 3366  if (!anchored)
3366  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
3367  character" set. */  character" set. */
3368    
3369  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3370    {    {
3371    req_byte = re->req_byte & 255;    has_req_char = TRUE;
3372    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_char = req_char2 = (pcre_uchar)(re->req_char);
3373    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3374        {
3375        req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3376    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3377        if (utf && req_char > 127)
3378          req_char2 = UCD_OTHERCASE(req_char);
3379    #endif
3380        }
3381    }    }
3382    
3383  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
3384  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
3385  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
3386    
3387  for (;;)  for (;;)
3388    {    {
# Line 2288  for (;;) Line 3390  for (;;)
3390    
3391    if ((options & PCRE_DFA_RESTART) == 0)    if ((options & PCRE_DFA_RESTART) == 0)
3392      {      {
3393      const uschar *save_end_subject = end_subject;      const pcre_uchar *save_end_subject = end_subject;
3394    
3395      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
3396      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
3397      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
3398      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
3399    
3400      if (firstline)      if (firstline)
3401        {        {
3402        const uschar *t = current_subject;        PCRE_PUCHAR t = current_subject;
3403    #ifdef SUPPORT_UTF
3404          if (utf)
3405            {
3406            while (t < md->end_subject && !IS_NEWLINE(t))
3407              {
3408              t++;
3409              ACROSSCHAR(t < end_subject, *t, t++);
3410              }
3411            }
3412          else
3413    #endif
3414        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3415        end_subject = t;        end_subject = t;
3416        }        }
3417    
3418      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
3419        starting point is not found. However, there is an option that disables
3420        these, for testing and for ensuring that all callouts do actually occur.
3421        The option can be set in the regex by (*NO_START_OPT) or passed in
3422        match-time options. */
3423    
3424        if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3425        {        {
3426        if (first_byte_caseless)        /* Advance to a known first char. */
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
3427    
3428      /* Or to just after a linebreak for a multiline match if possible */        if (has_first_char)
3429            {
3430            if (first_char != first_char2)
3431              while (current_subject < end_subject &&
3432                  *current_subject != first_char && *current_subject != first_char2)
3433                current_subject++;
3434            else
3435              while (current_subject < end_subject &&
3436                     *current_subject != first_char)
3437                current_subject++;
3438            }
3439    
3440      else if (startline)        /* Or to just after a linebreak for a multiline match if possible */
3441        {  
3442        if (current_subject > md->start_subject + start_offset)        else if (startline)
3443          {          {
3444          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (current_subject > md->start_subject + start_offset)
3445            current_subject++;            {
3446    #ifdef SUPPORT_UTF
3447              if (utf)
3448                {
3449                while (current_subject < end_subject &&
3450                       !WAS_NEWLINE(current_subject))
3451                  {
3452                  current_subject++;
3453                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3454                    current_subject++);
3455                  }
3456                }
3457              else
3458    #endif
3459              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3460                current_subject++;
3461    
3462              /* If we have just passed a CR and the newline option is ANY or
3463              ANYCRLF, and we are now at a LF, advance the match position by one
3464              more character. */
3465    
3466          /* If we have just passed a CR and the newline option is ANY or            if (current_subject[-1] == CHAR_CR &&
3467          ANYCRLF, and we are now at a LF, advance the match position by one more                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3468          character. */                 current_subject < end_subject &&
3469                   *current_subject == CHAR_NL)
3470          if (current_subject[-1] == '\r' &&              current_subject++;
3471               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&            }
              current_subject < end_subject &&  
              *current_subject == '\n')  
           current_subject++;  
3472          }          }
       }  
3473    
3474      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
3475    
3476      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
3477          {          {
3478          register unsigned int c = *current_subject;          while (current_subject < end_subject)
3479          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
3480              register unsigned int c = *current_subject;
3481    #ifndef COMPILE_PCRE8
3482              if (c > 255) c = 255;
3483    #endif
3484              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3485                {
3486                current_subject++;
3487    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3488                /* In non 8-bit mode, the iteration will stop for
3489                characters > 255 at the beginning or not stop at all. */
3490                if (utf)
3491                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3492                    current_subject++);
3493    #endif
3494                }
3495            else break;            else break;
3496              }
3497          }          }
3498        }        }
3499    
3500      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3501    
3502      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
3503    
3504    /* If req_byte is set, we know that that character must appear in the subject      /* The following two optimizations are disabled for partial matching or if
3505    for the match to succeed. If the first character is set, req_byte must be      disabling is explicitly requested (and of course, by the test above, this
3506    later in the subject; otherwise the test starts at the match point. This      code is not obeyed when restarting after a partial match). */
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested.  
   */  
   
   if (req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
   
     /* We don't need to repeat the search if we haven't yet reached the  
     place we found it at last time. */  
3507    
3508      if (p > req_byte_ptr)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3509            (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3510        {        {
3511        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3512          {        is a lower bound; no actual string of that length may actually match the
3513          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3514            {        bytes to avoid spending too much time in this optimization. */
3515            register int pp = *p++;  
3516            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3517            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3518          }          return PCRE_ERROR_NOMATCH;
3519        else  
3520          /* If req_char is set, we know that that character must appear in the
3521          subject for the match to succeed. If the first character is set, req_char
3522          must be later in the subject; otherwise the test starts at the match
3523          point. This optimization can save a huge amount of work in patterns with
3524          nested unlimited repeats that aren't going to match. Writing separate
3525          code for cased/caseless versions makes it go faster, as does using an
3526          autoincrement and backing off on a match.
3527    
3528          HOWEVER: when the subject string is very, very long, searching to its end
3529          can take a long time, and give bad performance on quite ordinary
3530          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3531          string... so we don't do this when the string is sufficiently long. */
3532    
3533          if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3534          {          {
3535          while (p < end_subject)          register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3536    
3537            /* We don't need to repeat the search if we haven't yet reached the
3538            place we found it at last time. */
3539    
3540            if (p > req_char_ptr)
3541            {            {
3542            if (*p++ == req_byte) { p--; break; }            if (req_char != req_char2)
3543            }              {
3544          }              while (p < end_subject)
3545                  {
3546                  register int pp = *p++;
3547                  if (pp == req_char || pp == req_char2) { p--; break; }
3548                  }
3549                }
3550              else
3551                {
3552                while (p < end_subject)
3553                  {
3554                  if (*p++ == req_char) { p--; break; }
3555                  }
3556                }
3557    
3558        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3559        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3560    
3561        if (p >= end_subject) break;            if (p >= end_subject) break;
3562    
3563        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3564        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3565        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3566    
3567        req_byte_ptr = p;            req_char_ptr = p;
3568              }
3569            }
3570        }        }
3571      }      }   /* End of optimizations that are done when not restarting */
3572    
3573    /* OK, now we can do the business */    /* OK, now we can do the business */
3574    
3575      md->start_used_ptr = current_subject;
3576      md->recursive = NULL;
3577    
3578    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3579      md,                                /* fixed match data */      md,                                /* fixed match data */
3580      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2419  for (;;) Line 3584  for (;;)
3584      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3585      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3586      wscount,                           /* size of same */      wscount,                           /* size of same */
3587      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3588    
3589    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3590    on only if not anchored. */    on only if not anchored. */
# Line 2433  for (;;) Line 3596  for (;;)
3596    
3597    if (firstline && IS_NEWLINE(current_subject)) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3598    current_subject++;    current_subject++;
3599    if (utf8)  #ifdef SUPPORT_UTF
3600      if (utf)
3601      {      {
3602      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      ACROSSCHAR(current_subject < end_subject, *current_subject,
3603        current_subject++;        current_subject++);
3604      }      }
3605    #endif
3606    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3607    
3608    /* If we have just passed a CR and the newline option is CRLF or ANY or    /* If we have just passed a CR and we are now at a LF, and the pattern does
3609    ANYCRLF, and we are now at a LF, advance the match position by one more    not contain any explicit matches for \r or \n, and the newline option is CRLF
3610    character. */    or ANY or ANYCRLF, advance the match position by one more character. */
3611    
3612    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
3613         (md->nltype == NLTYPE_ANY ||        current_subject &l