/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 145 by ph10, Wed Apr 4 14:06:52 2007 UTC | revision 1363 by ph10, Tue Oct 1 16:54:40 2013 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2013 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 37  POSSIBILITY OF SUCH DAMAGE. Line 38  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
39  */  */
40    
   
41  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
42  alternative matching function that uses a sort of DFA algorithm (not a true  alternative matching function that uses a sort of DFA algorithm (not a true
43  FSM). This is NOT Perl- compatible, but it has advantages in certain  FSM). This is NOT Perl-compatible, but it has advantages in certain
44  applications. */  applications. */
45    
46    
47    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48    the performance of his patterns greatly. I could not use it as it stood, as it
49    was not thread safe, and made assumptions about pattern sizes. Also, it caused
50    test 7 to loop, and test 9 to crash with a segfault.
51    
52    The issue is the check for duplicate states, which is done by a simple linear
53    search up the state list. (Grep for "duplicate" below to find the code.) For
54    many patterns, there will never be many states active at one time, so a simple
55    linear search is fine. In patterns that have many active states, it might be a
56    bottleneck. The suggested code used an indexing scheme to remember which states
57    had previously been used for each character, and avoided the linear search when
58    it knew there was no chance of a duplicate. This was implemented when adding
59    states to the state lists.
60    
61    I wrote some thread-safe, not-limited code to try something similar at the time
62    of checking for duplicates (instead of when adding states), using index vectors
63    on the stack. It did give a 13% improvement with one specially constructed
64    pattern for certain subject strings, but on other strings and on many of the
65    simpler patterns in the test suite it did worse. The major problem, I think,
66    was the extra time to initialize the index. This had to be done for each call
67    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68    only once - I suspect this was the cause of the problems with the tests.)
69    
70    Overall, I concluded that the gains in some cases did not outweigh the losses
71    in others, so I abandoned this code. */
72    
73    
74    
75    #ifdef HAVE_CONFIG_H
76    #include "config.h"
77    #endif
78    
79  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
80  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
81  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 56  applications. */ Line 88  applications. */
88  #define SP "                   "  #define SP "                   "
89    
90    
   
91  /*************************************************  /*************************************************
92  *      Code parameters and static tables         *  *      Code parameters and static tables         *
93  *************************************************/  *************************************************/
94    
95  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
97  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
98    never stored, so we push them well clear of the normal opcodes. */
99    
100  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
101  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
102  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
103    #define OP_HSPACE_EXTRA     360
104    #define OP_VSPACE_EXTRA     380
105    
106    
107  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
108  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
109  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
110  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111  small value. */  small value. Non-zero values in the table are the offsets from the opcode where
112    the character is to be found. ***NOTE*** If the start of this table is
113    modified, the three tables that follow must also be modified. */
114    
115  static uschar coptable[] = {  static const pcre_uint8 coptable[] = {
116    0,                             /* End                                    */    0,                             /* End                                    */
117    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118    0, 0,                          /* Any, Anybyte                           */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0,                          /* \P, \p                                 */
121      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122      0,                             /* \X                                     */
123      0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
124    1,                             /* Char                                   */    1,                             /* Char                                   */
125    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
126    1,                             /* not                                    */    1,                             /* not                                    */
127      1,                             /* noti                                   */
128    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
129    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1+IMM2_SIZE,                   /* exact                                  */
132      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135      1+IMM2_SIZE,                   /* exact I                                */
136      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
138    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1+IMM2_SIZE,                   /* NOT exact                              */
141      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144      1+IMM2_SIZE,                   /* NOT exact I                            */
145      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
147    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149    1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */    1+IMM2_SIZE,                   /* Type exact                             */
150      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
152    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 104  static uschar coptable[] = { Line 155  static uschar coptable[] = {
155    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
156    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
157    0,                             /* REF                                    */    0,                             /* REF                                    */
158      0,                             /* REFI                                   */
159      0,                             /* DNREF                                  */
160      0,                             /* DNREFI                                 */
161    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
162    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
163    0,                             /* Alt                                    */    0,                             /* Alt                                    */
164    0,                             /* Ket                                    */    0,                             /* Ket                                    */
165    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
166    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
167      0,                             /* KetRpos                                */
168      0,                             /* Reverse                                */
169    0,                             /* Assert                                 */    0,                             /* Assert                                 */
170    0,                             /* Assert not                             */    0,                             /* Assert not                             */
171    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
172    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
173      0, 0,                          /* ONCE, ONCE_NC                          */
174      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
175      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
176      0, 0,                          /* CREF, NCREF                            */
177      0, 0,                          /* RREF, NRREF                            */
178      0,                             /* DEF                                    */
179      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
180      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
181      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
182      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
183      0, 0                           /* CLOSE, SKIPZERO  */
184    };
185    
186    /* This table identifies those opcodes that inspect a character. It is used to
187    remember the fact that a character could have been inspected when the end of
188    the subject is reached. ***NOTE*** If the start of this table is modified, the
189    two tables that follow must also be modified. */
190    
191    static const pcre_uint8 poptable[] = {
192      0,                             /* End                                    */
193      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
194      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
195      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
196      1, 1,                          /* \P, \p                                 */
197      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
198      1,                             /* \X                                     */
199      0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
200      1,                             /* Char                                   */
201      1,                             /* Chari                                  */
202      1,                             /* not                                    */
203      1,                             /* noti                                   */
204      /* Positive single-char repeats                                          */
205      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
206      1, 1, 1,                       /* upto, minupto, exact                   */
207      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
208      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
209      1, 1, 1,                       /* upto I, minupto I, exact I             */
210      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
211      /* Negative single-char repeats - only for chars < 256                   */
212      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
213      1, 1, 1,                       /* NOT upto, minupto, exact               */
214      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
215      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
216      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
217      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
218      /* Positive type repeats                                                 */
219      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
220      1, 1, 1,                       /* Type upto, minupto, exact              */
221      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
222      /* Character class & ref repeats                                         */
223      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
224      1, 1,                          /* CRRANGE, CRMINRANGE                    */
225      1,                             /* CLASS                                  */
226      1,                             /* NCLASS                                 */
227      1,                             /* XCLASS - variable length               */
228      0,                             /* REF                                    */
229      0,                             /* REFI                                   */
230      0,                             /* DNREF                                  */
231      0,                             /* DNREFI                                 */
232      0,                             /* RECURSE                                */
233      0,                             /* CALLOUT                                */
234      0,                             /* Alt                                    */
235      0,                             /* Ket                                    */
236      0,                             /* KetRmax                                */
237      0,                             /* KetRmin                                */
238      0,                             /* KetRpos                                */
239    0,                             /* Reverse                                */    0,                             /* Reverse                                */
240    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0,                             /* Assert                                 */
241    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0,                             /* Assert not                             */
242    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
243    0,                             /* RREF                                   */    0,                             /* Assert behind not                      */
244      0, 0,                          /* ONCE, ONCE_NC                          */
245      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
246      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
247      0, 0,                          /* CREF, NCREF                            */
248      0, 0,                          /* RREF, NRREF                            */
249    0,                             /* DEF                                    */    0,                             /* DEF                                    */
250    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
251      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
252      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
253      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
254      0, 0                           /* CLOSE, SKIPZERO                        */
255  };  };
256    
257  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
258  and \w */  and \w */
259    
260  static uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
261    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
262    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
263    ctype_space, ctype_space,    ctype_space, ctype_space,
264    ctype_word,  ctype_word,    ctype_word,  ctype_word,
265    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
266  };  };
267    
268  static uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
269    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
270    ctype_digit, 0,    ctype_digit, 0,
271    ctype_space, 0,    ctype_space, 0,
272    ctype_word,  0,    ctype_word,  0,
273    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
274  };  };
275    
276    
# Line 151  these structures in, is a vector of ints Line 282  these structures in, is a vector of ints
282  typedef struct stateblock {  typedef struct stateblock {
283    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
284    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
285    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
286  } stateblock;  } stateblock;
287    
288  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
289    
290    
291  #ifdef DEBUG  #ifdef PCRE_DEBUG
292  /*************************************************  /*************************************************
293  *             Print character string             *  *             Print character string             *
294  *************************************************/  *************************************************/
# Line 174  Returns:       nothing Line 304  Returns:       nothing
304  */  */
305    
306  static void  static void
307  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
308  {  {
309  int c;  pcre_uint32 c;
310  while (length-- > 0)  while (length-- > 0)
311    {    {
312    if (isprint(c = *(p++)))    if (isprint(c = *(p++)))
313      fprintf(f, "%c", c);      fprintf(f, "%c", c);
314    else    else
315      fprintf(f, "\\x%02x", c);      fprintf(f, "\\x{%02x}", c);
316    }    }
317  }  }
318  #endif  #endif
# Line 207  Arguments: Line 337  Arguments:
337    offsetcount       size of same    offsetcount       size of same
338    workspace         vector of workspace    workspace         vector of workspace
339    wscount           size of same    wscount           size of same
   ims               the current ims flags  
340    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
341    
342  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
343                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
344                       -1 => failed to match                       -1 => failed to match
345                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
346    
# Line 224  for the current character, one for the f Line 352  for the current character, one for the f
352      { \      { \
353      next_active_state->offset = (x); \      next_active_state->offset = (x); \
354      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
355      next_active_state++; \      next_active_state++; \
356      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
357      } \      } \
# Line 235  for the current character, one for the f Line 362  for the current character, one for the f
362      { \      { \
363      next_active_state->offset = (x); \      next_active_state->offset = (x); \
364      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
365      next_active_state->data   = (z); \      next_active_state->data   = (z); \
366      next_active_state++; \      next_active_state++; \
367      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 247  for the current character, one for the f Line 373  for the current character, one for the f
373      { \      { \
374      next_new_state->offset = (x); \      next_new_state->offset = (x); \
375      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
376      next_new_state++; \      next_new_state++; \
377      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
378      } \      } \
# Line 258  for the current character, one for the f Line 383  for the current character, one for the f
383      { \      { \
384      next_new_state->offset = (x); \      next_new_state->offset = (x); \
385      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
386      next_new_state->data   = (z); \      next_new_state->data   = (z); \
387      next_new_state++; \      next_new_state++; \
388      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
389          (x), (y), (z), __LINE__)); \
390      } \      } \
391    else return PCRE_ERROR_DFA_WSSIZE    else return PCRE_ERROR_DFA_WSSIZE
392    
# Line 270  for the current character, one for the f Line 395  for the current character, one for the f
395  static int  static int
396  internal_dfa_exec(  internal_dfa_exec(
397    dfa_match_data *md,    dfa_match_data *md,
398    const uschar *this_start_code,    const pcre_uchar *this_start_code,
399    const uschar *current_subject,    const pcre_uchar *current_subject,
400    int start_offset,    int start_offset,
401    int *offsets,    int *offsets,
402    int offsetcount,    int offsetcount,
403    int *workspace,    int *workspace,
404    int wscount,    int wscount,
405    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
406  {  {
407  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
408  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
409    
410  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
411  const uschar *ptr;  const pcre_uchar *ptr;
412  const uschar *end_code, *first_op;  const pcre_uchar *end_code, *first_op;
413    
414    dfa_recursion_info new_recursive;
415    
416  int active_count, new_count, match_count;  int active_count, new_count, match_count;
417    
418  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
419  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
420    
421  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
422  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
423  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
424    
425  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
426  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
427  #else  #else
428  BOOL utf8 = FALSE;  BOOL utf = FALSE;
429  #endif  #endif
430    
431    BOOL reset_could_continue = FALSE;
432    
433  rlevel++;  rlevel++;
434  offsetcount &= (-2);  offsetcount &= (-2);
435    
# Line 311  wscount = (wscount - (wscount % (INTS_PE Line 438  wscount = (wscount - (wscount % (INTS_PE
438            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
439    
440  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
441    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
442    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
443    
444  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
445  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 325  next_new_state = new_states = active_sta Line 452  next_new_state = new_states = active_sta
452  new_count = 0;  new_count = 0;
453    
454  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
455    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
456        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
457        ? IMM2_SIZE:0);
458    
459  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
460  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 353  if (*first_op == OP_REVERSE) Line 482  if (*first_op == OP_REVERSE)
482    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
483    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
484    
485  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
486    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
487    
488    if (utf8)    if (utf)
489      {      {
490      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
491        {        {
492        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
493        current_subject--;        current_subject--;
494        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
495        }        }
496      }      }
497    else    else
# Line 374  if (*first_op == OP_REVERSE) Line 501  if (*first_op == OP_REVERSE)
501    
502      {      {
503      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
504        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
505      current_subject -= gone_back;      current_subject -= gone_back;
506      }      }
507    
508      /* Save the earliest consulted character */
509    
510      if (current_subject < md->start_used_ptr)
511        md->start_used_ptr = current_subject;
512    
513    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
514    
515    end_code = this_start_code;    end_code = this_start_code;
# Line 386  if (*first_op == OP_REVERSE) Line 518  if (*first_op == OP_REVERSE)
518      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
519      if (back <= gone_back)      if (back <= gone_back)
520        {        {
521        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
522        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
523        }        }
524      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 419  else Line 551  else
551    else    else
552      {      {
553      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
554        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
555            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
556            ? IMM2_SIZE:0);
557      do      do
558        {        {
559        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
560        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
561        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
562        }        }
# Line 432  else Line 566  else
566    
567  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
568    
569  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
570    
571  /* Loop for scanning the subject */  /* Loop for scanning the subject */
572    
# Line 441  for (;;) Line 575  for (;;)
575    {    {
576    int i, j;    int i, j;
577    int clen, dlen;    int clen, dlen;
578    unsigned int c, d;    pcre_uint32 c, d;
579      int forced_fail = 0;
580      BOOL partial_newline = FALSE;
581      BOOL could_continue = reset_could_continue;
582      reset_could_continue = FALSE;
583    
584    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
585    new state list. */    new state list. */
# Line 455  for (;;) Line 593  for (;;)
593    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
594    workspace[1] = active_count;    workspace[1] = active_count;
595    
596  #ifdef DEBUG  #ifdef PCRE_DEBUG
597    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
598    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
599    printf("\"\n");    printf("\"\n");
600    
601    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 477  for (;;) Line 615  for (;;)
615    
616    if (ptr < end_subject)    if (ptr < end_subject)
617      {      {
618      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of data items in the character */
619  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
620      if (utf8) { GETCHARLEN(c, ptr, clen); } else      GETCHARLENTEST(c, ptr, clen);
621  #endif  /* SUPPORT_UTF8 */  #else
622      c = *ptr;      c = *ptr;
623    #endif  /* SUPPORT_UTF */
624      }      }
625    else    else
626      {      {
# Line 497  for (;;) Line 636  for (;;)
636    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
637      {      {
638      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
639      const uschar *code;      BOOL caseless = FALSE;
640        const pcre_uchar *code;
641      int state_offset = current_state->offset;      int state_offset = current_state->offset;
642      int count, codevalue;      int codevalue, rrc;
643      int chartype, script;      int count;
644    
645  #ifdef DEBUG  #ifdef PCRE_DEBUG
646      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
647      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
648        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
649          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
650  #endif  #endif
651    
     /* This variable is referred to implicitly in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
652      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
653      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
654      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
655        state, arrange for it to be passed on. */
656    
657      if (state_offset < 0)      if (state_offset < 0)
658        {        {
# Line 524  for (;;) Line 661  for (;;)
661          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
662          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
663            current_state->data - 1);            current_state->data - 1);
664            if (could_continue) reset_could_continue = TRUE;
665          continue;          continue;
666          }          }
667        else        else
# Line 532  for (;;) Line 670  for (;;)
670          }          }
671        }        }
672    
673      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
674        See the note at the head of this module about the possibility of improving
675        performance here. */
676    
677      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
678        {        {
# Line 549  for (;;) Line 689  for (;;)
689      code = start_code + state_offset;      code = start_code + state_offset;
690      codevalue = *code;      codevalue = *code;
691    
692        /* If this opcode inspects a character, but we are at the end of the
693        subject, remember the fact for use when testing for a partial match. */
694    
695        if (clen == 0 && poptable[codevalue] != 0)
696          could_continue = TRUE;
697    
698      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
699      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
700      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
701      permitted.      permitted.
702    
703      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
704      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long because
705      Unfortunately, we have to take special action to deal with  \P, \p, and      the values are small. We have to take special action to deal with  \P, \p,
706      \X in this case. To keep the other cases fast, convert these ones to new      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
707      opcodes. */      these ones to new opcodes. */
708    
709      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
710        {        {
711        dlen = 1;        dlen = 1;
712  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
713        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
714  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
715        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
716        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
717          {          {
# Line 576  for (;;) Line 722  for (;;)
722            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
723            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
724            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
725              case OP_NOT_HSPACE:
726              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
727              case OP_NOT_VSPACE:
728              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
729            default: break;            default: break;
730            }            }
731          }          }
# Line 591  for (;;) Line 741  for (;;)
741    
742      switch (codevalue)      switch (codevalue)
743        {        {
744    /* ========================================================================== */
745          /* These cases are never obeyed. This is a fudge that causes a compile-
746          time error if the vectors coptable or poptable, which are indexed by
747          opcode, are not the correct length. It seems to be the only way to do
748          such a check at compile time, as the sizeof() operator does not work
749          in the C preprocessor. */
750    
751          case OP_TABLE_LENGTH:
752          case OP_TABLE_LENGTH +
753            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
754             (sizeof(poptable) == OP_TABLE_LENGTH)):
755          break;
756    
757  /* ========================================================================== */  /* ========================================================================== */
758        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
759        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
760        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
761          subpattern, because the possessive subpattern repeats are always handled
762          using recursive calls. Thus, it never adds any new states.
763    
764          At the end of the (sub)pattern, unless we have an empty string and
765          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
766          start of the subject, save the match data, shifting up all previous
767        matches so we always have the longest first. */        matches so we always have the longest first. */
768    
769        case OP_KET:        case OP_KET:
770        case OP_KETRMIN:        case OP_KETRMIN:
771        case OP_KETRMAX:        case OP_KETRMAX:
772          case OP_KETRPOS:
773        if (code != end_code)        if (code != end_code)
774          {          {
775          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 609  for (;;) Line 778  for (;;)
778            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
779            }            }
780          }          }
781        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
782          {          {
783          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
784            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
785              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
786          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
787          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
788          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
789            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
790            offsets[0] = current_subject - start_subject;                match_count = 0;
791            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
792            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
793              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
794            }              {
795          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
796            {              offsets[1] = (int)(ptr - start_subject);
797            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
798              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], (char *)current_subject));
799              match_count, rlevel*2-2, SP));              }
800            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
801                {
802                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
803                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
804                  match_count, rlevel*2-2, SP));
805                return match_count;
806                }
807            }            }
808          }          }
809        break;        break;
# Line 640  for (;;) Line 815  for (;;)
815        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
816        case OP_ALT:        case OP_ALT:
817        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
818        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
819        break;        break;
820    
821        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 648  for (;;) Line 823  for (;;)
823        case OP_SBRA:        case OP_SBRA:
824        do        do
825          {          {
826          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
827          code += GET(code, 1);          code += GET(code, 1);
828          }          }
829        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 657  for (;;) Line 832  for (;;)
832        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
833        case OP_CBRA:        case OP_CBRA:
834        case OP_SCBRA:        case OP_SCBRA:
835        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
836        code += GET(code, 1);        code += GET(code, 1);
837        while (*code == OP_ALT)        while (*code == OP_ALT)
838          {          {
839          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
840          code += GET(code, 1);          code += GET(code, 1);
841          }          }
842        break;        break;
# Line 672  for (;;) Line 847  for (;;)
847        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
848        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
849        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
850        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
851          break;
852    
853          /*-----------------------------------------------------------------*/
854          case OP_SKIPZERO:
855          code += 1 + GET(code, 2);
856          while (*code == OP_ALT) code += GET(code, 1);
857          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
858        break;        break;
859    
860        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
861        case OP_CIRC:        case OP_CIRC:
862        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
863          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
864        break;        break;
865    
866        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
867        case OP_EOD:        case OP_CIRCM:
868        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
869              (ptr != end_subject && WAS_NEWLINE(ptr)))
870            { ADD_ACTIVE(state_offset + 1, 0); }
871        break;        break;
872    
873        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
874        case OP_OPT:        case OP_EOD:
875        ims = code[1];        if (ptr >= end_subject)
876        ADD_ACTIVE(state_offset + 2, 0);          {
877            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
878              could_continue = TRUE;
879            else { ADD_ACTIVE(state_offset + 1, 0); }
880            }
881        break;        break;
882    
883        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 714  for (;;) Line 899  for (;;)
899    
900        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
901        case OP_ANY:        case OP_ANY:
902        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
903            {
904            if (ptr + 1 >= md->end_subject &&
905                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
906                NLBLOCK->nltype == NLTYPE_FIXED &&
907                NLBLOCK->nllen == 2 &&
908                c == NLBLOCK->nl[0])
909              {
910              could_continue = partial_newline = TRUE;
911              }
912            else
913              {
914              ADD_NEW(state_offset + 1, 0);
915              }
916            }
917          break;
918    
919          /*-----------------------------------------------------------------*/
920          case OP_ALLANY:
921          if (clen > 0)
922          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
923        break;        break;
924    
925        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
926        case OP_EODN:        case OP_EODN:
927        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
928            could_continue = TRUE;
929          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
930          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
931        break;        break;
932    
# Line 728  for (;;) Line 934  for (;;)
934        case OP_DOLL:        case OP_DOLL:
935        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
936          {          {
937          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
938              (IS_NEWLINE(ptr) &&            could_continue = TRUE;
939                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)          else if (clen == 0 ||
940                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
941                   (ptr == end_subject - md->nllen)
942              ))              ))
943            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
944            else if (ptr + 1 >= md->end_subject &&
945                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
946                     NLBLOCK->nltype == NLTYPE_FIXED &&
947                     NLBLOCK->nllen == 2 &&
948                     c == NLBLOCK->nl[0])
949              {
950              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
951                {
952                reset_could_continue = TRUE;
953                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
954                }
955              else could_continue = partial_newline = TRUE;
956              }
957            }
958          break;
959    
960          /*-----------------------------------------------------------------*/
961          case OP_DOLLM:
962          if ((md->moptions & PCRE_NOTEOL) == 0)
963            {
964            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
965              could_continue = TRUE;
966            else if (clen == 0 ||
967                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
968              { ADD_ACTIVE(state_offset + 1, 0); }
969            else if (ptr + 1 >= md->end_subject &&
970                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
971                     NLBLOCK->nltype == NLTYPE_FIXED &&
972                     NLBLOCK->nllen == 2 &&
973                     c == NLBLOCK->nl[0])
974              {
975              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
976                {
977                reset_could_continue = TRUE;
978                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
979                }
980              else could_continue = partial_newline = TRUE;
981              }
982          }          }
983        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        else if (IS_NEWLINE(ptr))
984          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
985        break;        break;
986    
# Line 765  for (;;) Line 1011  for (;;)
1011    
1012          if (ptr > start_subject)          if (ptr > start_subject)
1013            {            {
1014            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
1015  #ifdef SUPPORT_UTF8            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1016            if (utf8) BACKCHAR(temp);  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1017              if (utf) { BACKCHAR(temp); }
1018  #endif  #endif
1019            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
1020    #ifdef SUPPORT_UCP
1021              if ((md->poptions & PCRE_UCP) != 0)
1022                {
1023                if (d == '_') left_word = TRUE; else
1024                  {
1025                  int cat = UCD_CATEGORY(d);
1026                  left_word = (cat == ucp_L || cat == ucp_N);
1027                  }
1028                }
1029              else
1030    #endif
1031            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1032            }            }
1033          else left_word = 0;          else left_word = FALSE;
1034    
1035          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
1036            else right_word = 0;            {
1037    #ifdef SUPPORT_UCP
1038              if ((md->poptions & PCRE_UCP) != 0)
1039                {
1040                if (c == '_') right_word = TRUE; else
1041                  {
1042                  int cat = UCD_CATEGORY(c);
1043                  right_word = (cat == ucp_L || cat == ucp_N);
1044                  }
1045                }
1046              else
1047    #endif
1048              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1049              }
1050            else right_word = FALSE;
1051    
1052          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1053            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 783  for (;;) Line 1055  for (;;)
1055        break;        break;
1056    
1057    
 #ifdef SUPPORT_UCP  
   
1058        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1059        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
1060        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
1061        */        */
1062    
1063    #ifdef SUPPORT_UCP
1064        case OP_PROP:        case OP_PROP:
1065        case OP_NOTPROP:        case OP_NOTPROP:
1066        if (clen > 0)        if (clen > 0)
1067          {          {
1068          BOOL OK;          BOOL OK;
1069          int category = _pcre_ucp_findprop(c, &chartype, &script);          const pcre_uint32 *cp;
1070            const ucd_record * prop = GET_UCD(c);
1071          switch(code[1])          switch(code[1])
1072            {            {
1073            case PT_ANY:            case PT_ANY:
# Line 803  for (;;) Line 1075  for (;;)
1075            break;            break;
1076    
1077            case PT_LAMP:            case PT_LAMP:
1078            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1079                   prop->chartype == ucp_Lt;
1080            break;            break;
1081    
1082            case PT_GC:            case PT_GC:
1083            OK = category == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1084            break;            break;
1085    
1086            case PT_PC:            case PT_PC:
1087            OK = chartype == code[2];            OK = prop->chartype == code[2];
1088            break;            break;
1089    
1090            case PT_SC:            case PT_SC:
1091            OK = script == code[2];            OK = prop->script == code[2];
1092              break;
1093    
1094              /* These are specials for combination cases. */
1095    
1096              case PT_ALNUM:
1097              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1098                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1099              break;
1100    
1101              case PT_SPACE:    /* Perl space */
1102              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1103                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1104              break;
1105    
1106              case PT_PXSPACE:  /* POSIX space */
1107              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1108                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1109                   c == CHAR_FF || c == CHAR_CR;
1110              break;
1111    
1112              case PT_WORD:
1113              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1114                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1115                   c == CHAR_UNDERSCORE;
1116              break;
1117    
1118              case PT_CLIST:
1119              cp = PRIV(ucd_caseless_sets) + code[2];
1120              for (;;)
1121                {
1122                if (c < *cp) { OK = FALSE; break; }
1123                if (c == *cp++) { OK = TRUE; break; }
1124                }
1125              break;
1126    
1127              case PT_UCNC:
1128              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1129                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1130                   c >= 0xe000;
1131            break;            break;
1132    
1133            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 835  for (;;) Line 1147  for (;;)
1147  /* ========================================================================== */  /* ========================================================================== */
1148        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
1149        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
1150        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1151        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1152    
1153        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1154        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 845  for (;;) Line 1157  for (;;)
1157        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1158        if (clen > 0)        if (clen > 0)
1159          {          {
1160          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1161                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1162                NLBLOCK->nltype == NLTYPE_FIXED &&
1163                NLBLOCK->nllen == 2 &&
1164                c == NLBLOCK->nl[0])
1165              {
1166              could_continue = partial_newline = TRUE;
1167              }
1168            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1169              (c < 256 &&              (c < 256 &&
1170                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1171                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1172            {            {
1173            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 871  for (;;) Line 1188  for (;;)
1188        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1189        if (clen > 0)        if (clen > 0)
1190          {          {
1191          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1192                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1193                NLBLOCK->nltype == NLTYPE_FIXED &&
1194                NLBLOCK->nllen == 2 &&
1195                c == NLBLOCK->nl[0])
1196              {
1197              could_continue = partial_newline = TRUE;
1198              }
1199            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1200              (c < 256 &&              (c < 256 &&
1201                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1202                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1203            {            {
1204            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 896  for (;;) Line 1218  for (;;)
1218        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1219        if (clen > 0)        if (clen > 0)
1220          {          {
1221          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1222                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1223                NLBLOCK->nltype == NLTYPE_FIXED &&
1224                NLBLOCK->nllen == 2 &&
1225                c == NLBLOCK->nl[0])
1226              {
1227              could_continue = partial_newline = TRUE;
1228              }
1229            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1230              (c < 256 &&              (c < 256 &&
1231                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1232                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1233            {            {
1234            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 919  for (;;) Line 1246  for (;;)
1246        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1247        if (clen > 0)        if (clen > 0)
1248          {          {
1249          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1250                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1251                NLBLOCK->nltype == NLTYPE_FIXED &&
1252                NLBLOCK->nllen == 2 &&
1253                c == NLBLOCK->nl[0])
1254              {
1255              could_continue = partial_newline = TRUE;
1256              }
1257            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1258              (c < 256 &&              (c < 256 &&
1259                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1260                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1261            {            {
1262            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1263              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1264            else            else
1265              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1266            }            }
# Line 939  for (;;) Line 1271  for (;;)
1271        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1272        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1273        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1274        ADD_ACTIVE(state_offset + 4, 0);        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1275        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1276        if (clen > 0)        if (clen > 0)
1277          {          {
1278          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1279                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1280                NLBLOCK->nltype == NLTYPE_FIXED &&
1281                NLBLOCK->nllen == 2 &&
1282                c == NLBLOCK->nl[0])
1283              {
1284              could_continue = partial_newline = TRUE;
1285              }
1286            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1287              (c < 256 &&              (c < 256 &&
1288                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1289                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1290            {            {
1291            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 956  for (;;) Line 1293  for (;;)
1293              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1294              next_active_state--;              next_active_state--;
1295              }              }
1296            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1297              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1298            else            else
1299              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1300            }            }
# Line 970  for (;;) Line 1307  for (;;)
1307        argument. It keeps the code above fast for the other cases. The argument        argument. It keeps the code above fast for the other cases. The argument
1308        is in the d variable. */        is in the d variable. */
1309    
1310    #ifdef SUPPORT_UCP
1311        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
1312        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1313        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
# Line 978  for (;;) Line 1316  for (;;)
1316        if (clen > 0)        if (clen > 0)
1317          {          {
1318          BOOL OK;          BOOL OK;
1319          int category = _pcre_ucp_findprop(c, &chartype, &script);          const pcre_uint32 *cp;
1320            const ucd_record * prop = GET_UCD(c);
1321          switch(code[2])          switch(code[2])
1322            {            {
1323            case PT_ANY:            case PT_ANY:
# Line 986  for (;;) Line 1325  for (;;)
1325            break;            break;
1326    
1327            case PT_LAMP:            case PT_LAMP:
1328            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1329                prop->chartype == ucp_Lt;
1330            break;            break;
1331    
1332            case PT_GC:            case PT_GC:
1333            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1334            break;            break;
1335    
1336            case PT_PC:            case PT_PC:
1337            OK = chartype == code[3];            OK = prop->chartype == code[3];
1338            break;            break;
1339    
1340            case PT_SC:            case PT_SC:
1341            OK = script == code[3];            OK = prop->script == code[3];
1342              break;
1343    
1344              /* These are specials for combination cases. */
1345    
1346              case PT_ALNUM:
1347              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1348                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1349              break;
1350    
1351              case PT_SPACE:    /* Perl space */
1352              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1353                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1354              break;
1355    
1356              case PT_PXSPACE:  /* POSIX space */
1357              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1358                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1359                   c == CHAR_FF || c == CHAR_CR;
1360              break;
1361    
1362              case PT_WORD:
1363              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1364                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1365                   c == CHAR_UNDERSCORE;
1366              break;
1367    
1368              case PT_CLIST:
1369              cp = PRIV(ucd_caseless_sets) + code[3];
1370              for (;;)
1371                {
1372                if (c < *cp) { OK = FALSE; break; }
1373                if (c == *cp++) { OK = TRUE; break; }
1374                }
1375              break;
1376    
1377              case PT_UCNC:
1378              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1379                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1380                   c >= 0xe000;
1381            break;            break;
1382    
1383            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1027  for (;;) Line 1406  for (;;)
1406        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1407        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1408        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1409        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1410          {          {
1411          const uschar *nptr = ptr + clen;          int lgb, rgb;
1412            const pcre_uchar *nptr = ptr + clen;
1413          int ncount = 0;          int ncount = 0;
1414          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1415            {            {
1416            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1417            next_active_state--;            next_active_state--;
1418            }            }
1419            lgb = UCD_GRAPHBREAK(c);
1420          while (nptr < end_subject)          while (nptr < end_subject)
1421            {            {
1422            int nd;            dlen = 1;
1423            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1424            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1425            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1426            ncount++;            ncount++;
1427            nptr += ndlen;            lgb = rgb;
1428              nptr += dlen;
1429            }            }
1430          count++;          count++;
1431          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1432          }          }
1433        break;        break;
1434    #endif
1435    
1436        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1437        case OP_ANYNL_EXTRA + OP_TYPEPLUS:        case OP_ANYNL_EXTRA + OP_TYPEPLUS:
# Line 1061  for (;;) Line 1444  for (;;)
1444          int ncount = 0;          int ncount = 0;
1445          switch (c)          switch (c)
1446            {            {
1447            case 0x000d:            case CHAR_VT:
1448            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            case CHAR_FF:
1449            /* Fall through */            case CHAR_NEL:
1450            case 0x000a:  #ifndef EBCDIC
           case 0x000b:  
           case 0x000c:  
           case 0x0085:  
1451            case 0x2028:            case 0x2028:
1452            case 0x2029:            case 0x2029:
1453    #endif  /* Not EBCDIC */
1454              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1455              goto ANYNL01;
1456    
1457              case CHAR_CR:
1458              if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1459              /* Fall through */
1460    
1461              ANYNL01:
1462              case CHAR_LF:
1463            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1464              {              {
1465              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1078  for (;;) Line 1468  for (;;)
1468            count++;            count++;
1469            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, ncount);
1470            break;            break;
1471    
1472              default:
1473              break;
1474              }
1475            }
1476          break;
1477    
1478          /*-----------------------------------------------------------------*/
1479          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1480          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1481          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1482          count = current_state->count;  /* Already matched */
1483          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1484          if (clen > 0)
1485            {
1486            BOOL OK;
1487            switch (c)
1488              {
1489              VSPACE_CASES:
1490              OK = TRUE;
1491              break;
1492    
1493              default:
1494              OK = FALSE;
1495              break;
1496              }
1497    
1498            if (OK == (d == OP_VSPACE))
1499              {
1500              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1501                {
1502                active_count--;           /* Remove non-match possibility */
1503                next_active_state--;
1504                }
1505              count++;
1506              ADD_NEW_DATA(-state_offset, count, 0);
1507              }
1508            }
1509          break;
1510    
1511          /*-----------------------------------------------------------------*/
1512          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1513          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1514          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1515          count = current_state->count;  /* Already matched */
1516          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1517          if (clen > 0)
1518            {
1519            BOOL OK;
1520            switch (c)
1521              {
1522              HSPACE_CASES:
1523              OK = TRUE;
1524              break;
1525    
1526            default:            default:
1527              OK = FALSE;
1528            break;            break;
1529            }            }
1530    
1531            if (OK == (d == OP_HSPACE))
1532              {
1533              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1534                {
1535                active_count--;           /* Remove non-match possibility */
1536                next_active_state--;
1537                }
1538              count++;
1539              ADD_NEW_DATA(-state_offset, count, 0);
1540              }
1541          }          }
1542        break;        break;
1543    
1544        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1545    #ifdef SUPPORT_UCP
1546        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
1547        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1548        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
# Line 1102  for (;;) Line 1560  for (;;)
1560        if (clen > 0)        if (clen > 0)
1561          {          {
1562          BOOL OK;          BOOL OK;
1563          int category = _pcre_ucp_findprop(c, &chartype, &script);          const pcre_uint32 *cp;
1564            const ucd_record * prop = GET_UCD(c);
1565          switch(code[2])          switch(code[2])
1566            {            {
1567            case PT_ANY:            case PT_ANY:
# Line 1110  for (;;) Line 1569  for (;;)
1569            break;            break;
1570    
1571            case PT_LAMP:            case PT_LAMP:
1572            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1573                prop->chartype == ucp_Lt;
1574            break;            break;
1575    
1576            case PT_GC:            case PT_GC:
1577            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1578            break;            break;
1579    
1580            case PT_PC:            case PT_PC:
1581            OK = chartype == code[3];            OK = prop->chartype == code[3];
1582            break;            break;
1583    
1584            case PT_SC:            case PT_SC:
1585            OK = script == code[3];            OK = prop->script == code[3];
1586              break;
1587    
1588              /* These are specials for combination cases. */
1589    
1590              case PT_ALNUM:
1591              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1592                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1593              break;
1594    
1595              case PT_SPACE:    /* Perl space */
1596              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1597                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1598              break;
1599    
1600              case PT_PXSPACE:  /* POSIX space */
1601              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1602                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1603                   c == CHAR_FF || c == CHAR_CR;
1604              break;
1605    
1606              case PT_WORD:
1607              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1608                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1609                   c == CHAR_UNDERSCORE;
1610              break;
1611    
1612              case PT_CLIST:
1613              cp = PRIV(ucd_caseless_sets) + code[3];
1614              for (;;)
1615                {
1616                if (c < *cp) { OK = FALSE; break; }
1617                if (c == *cp++) { OK = TRUE; break; }
1618                }
1619              break;
1620    
1621              case PT_UCNC:
1622              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1623                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1624                   c >= 0xe000;
1625            break;            break;
1626    
1627            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1160  for (;;) Line 1659  for (;;)
1659        QS2:        QS2:
1660    
1661        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1662        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1663          {          {
1664          const uschar *nptr = ptr + clen;          int lgb, rgb;
1665            const pcre_uchar *nptr = ptr + clen;
1666          int ncount = 0;          int ncount = 0;
1667          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1668              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
# Line 1170  for (;;) Line 1670  for (;;)
1670            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1671            next_active_state--;            next_active_state--;
1672            }            }
1673            lgb = UCD_GRAPHBREAK(c);
1674          while (nptr < end_subject)          while (nptr < end_subject)
1675            {            {
1676            int nd;            dlen = 1;
1677            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1678            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1679            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1680            ncount++;            ncount++;
1681            nptr += ndlen;            lgb = rgb;
1682              nptr += dlen;
1683            }            }
1684          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1685          }          }
1686        break;        break;
1687    #endif
1688    
1689        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1690        case OP_ANYNL_EXTRA + OP_TYPEQUERY:        case OP_ANYNL_EXTRA + OP_TYPEQUERY:
# Line 1202  for (;;) Line 1705  for (;;)
1705          int ncount = 0;          int ncount = 0;
1706          switch (c)          switch (c)
1707            {            {
1708            case 0x000d:            case CHAR_VT:
1709            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            case CHAR_FF:
1710            /* Fall through */            case CHAR_NEL:
1711            case 0x000a:  #ifndef EBCDIC
           case 0x000b:  
           case 0x000c:  
           case 0x0085:  
1712            case 0x2028:            case 0x2028:
1713            case 0x2029:            case 0x2029:
1714    #endif  /* Not EBCDIC */
1715              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1716              goto ANYNL02;
1717    
1718              case CHAR_CR:
1719              if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1720              /* Fall through */
1721    
1722              ANYNL02:
1723              case CHAR_LF:
1724            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1725                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1726              {              {
1727              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1728              next_active_state--;              next_active_state--;
1729              }              }
1730            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1731            break;            break;
1732    
1733            default:            default:
1734            break;            break;
1735            }            }
# Line 1226  for (;;) Line 1737  for (;;)
1737        break;        break;
1738    
1739        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1740        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1741        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1742        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1743        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:        count = 2;
1744        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        goto QS4;
1745          { ADD_ACTIVE(state_offset + 6, 0); }  
1746        count = current_state->count;  /* Number already matched */        case OP_VSPACE_EXTRA + OP_TYPESTAR:
1747          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1748          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1749          count = 0;
1750    
1751          QS4:
1752          ADD_ACTIVE(state_offset + 2, 0);
1753        if (clen > 0)        if (clen > 0)
1754          {          {
1755          BOOL OK;          BOOL OK;
1756          int category = _pcre_ucp_findprop(c, &chartype, &script);          switch (c)
         switch(code[4])  
1757            {            {
1758            case PT_ANY:            VSPACE_CASES:
1759            OK = TRUE;            OK = TRUE;
1760            break;            break;
1761    
1762            case PT_LAMP:            default:
1763            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = FALSE;
1764            break;            break;
1765              }
1766            case PT_GC:          if (OK == (d == OP_VSPACE))
1767            OK = category == code[5];            {
1768              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1769                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1770                {
1771                active_count--;           /* Remove non-match possibility */
1772                next_active_state--;
1773                }
1774              ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1775              }
1776            }
1777          break;
1778    
1779          /*-----------------------------------------------------------------*/
1780          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1781          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1782          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1783          count = 2;
1784          goto QS5;
1785    
1786          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1787          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1788          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1789          count = 0;
1790    
1791          QS5:
1792          ADD_ACTIVE(state_offset + 2, 0);
1793          if (clen > 0)
1794            {
1795            BOOL OK;
1796            switch (c)
1797              {
1798              HSPACE_CASES:
1799              OK = TRUE;
1800              break;
1801    
1802              default:
1803              OK = FALSE;
1804              break;
1805              }
1806    
1807            if (OK == (d == OP_HSPACE))
1808              {
1809              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1810                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1811                {
1812                active_count--;           /* Remove non-match possibility */
1813                next_active_state--;
1814                }
1815              ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1816              }
1817            }
1818          break;
1819    
1820          /*-----------------------------------------------------------------*/
1821    #ifdef SUPPORT_UCP
1822          case OP_PROP_EXTRA + OP_TYPEEXACT:
1823          case OP_PROP_EXTRA + OP_TYPEUPTO:
1824          case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1825          case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1826          if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1827            { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1828          count = current_state->count;  /* Number already matched */
1829          if (clen > 0)
1830            {
1831            BOOL OK;
1832            const pcre_uint32 *cp;
1833            const ucd_record * prop = GET_UCD(c);
1834            switch(code[1 + IMM2_SIZE + 1])
1835              {
1836              case PT_ANY:
1837              OK = TRUE;
1838              break;
1839    
1840              case PT_LAMP:
1841              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1842                prop->chartype == ucp_Lt;
1843              break;
1844    
1845              case PT_GC:
1846              OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1847            break;            break;
1848    
1849            case PT_PC:            case PT_PC:
1850            OK = chartype == code[5];            OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1851            break;            break;
1852    
1853            case PT_SC:            case PT_SC:
1854            OK = script == code[5];            OK = prop->script == code[1 + IMM2_SIZE + 2];
1855              break;
1856    
1857              /* These are specials for combination cases. */
1858    
1859              case PT_ALNUM:
1860              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1861                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1862              break;
1863    
1864              case PT_SPACE:    /* Perl space */
1865              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1866                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1867              break;
1868    
1869              case PT_PXSPACE:  /* POSIX space */
1870              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1871                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1872                   c == CHAR_FF || c == CHAR_CR;
1873              break;
1874    
1875              case PT_WORD:
1876              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1877                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1878                   c == CHAR_UNDERSCORE;
1879              break;
1880    
1881              case PT_CLIST:
1882              cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1883              for (;;)
1884                {
1885                if (c < *cp) { OK = FALSE; break; }
1886                if (c == *cp++) { OK = TRUE; break; }
1887                }
1888              break;
1889    
1890              case PT_UCNC:
1891              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1892                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1893                   c >= 0xe000;
1894            break;            break;
1895    
1896            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1273  for (;;) Line 1907  for (;;)
1907              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1908              next_active_state--;              next_active_state--;
1909              }              }
1910            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1911              { ADD_NEW(state_offset + 6, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1912            else            else
1913              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1914            }            }
# Line 1287  for (;;) Line 1921  for (;;)
1921        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1922        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1923        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1924          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1925        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1926        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1927          {          {
1928          const uschar *nptr = ptr + clen;          int lgb, rgb;
1929            const pcre_uchar *nptr = ptr + clen;
1930          int ncount = 0;          int ncount = 0;
1931          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1932            {            {
1933            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1934            next_active_state--;            next_active_state--;
1935            }            }
1936            lgb = UCD_GRAPHBREAK(c);
1937          while (nptr < end_subject)          while (nptr < end_subject)
1938            {            {
1939            int nd;            dlen = 1;
1940            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1941            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1942            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1943            ncount++;            ncount++;
1944            nptr += ndlen;            lgb = rgb;
1945              nptr += dlen;
1946            }            }
1947          if (++count >= GET2(code, 1))          if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1948            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              reset_could_continue = TRUE;
1949            if (++count >= (int)GET2(code, 1))
1950              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1951          else          else
1952            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1953          }          }
1954        break;        break;
1955    #endif
1956    
1957        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1958        case OP_ANYNL_EXTRA + OP_TYPEEXACT:        case OP_ANYNL_EXTRA + OP_TYPEEXACT:
# Line 1320  for (;;) Line 1960  for (;;)
1960        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1961        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1962        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1963          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1964        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1965        if (clen > 0)        if (clen > 0)
1966          {          {
1967          int ncount = 0;          int ncount = 0;
1968          switch (c)          switch (c)
1969            {            {
1970            case 0x000d:            case CHAR_VT:
1971            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            case CHAR_FF:
1972            /* Fall through */            case CHAR_NEL:
1973            case 0x000a:  #ifndef EBCDIC
           case 0x000b:  
           case 0x000c:  
           case 0x0085:  
1974            case 0x2028:            case 0x2028:
1975            case 0x2029:            case 0x2029:
1976    #endif  /* Not EBCDIC */
1977              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1978              goto ANYNL03;
1979    
1980              case CHAR_CR:
1981              if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1982              /* Fall through */
1983    
1984              ANYNL03:
1985              case CHAR_LF:
1986            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1987              {              {
1988              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1989              next_active_state--;              next_active_state--;
1990              }              }
1991            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1992              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1993            else            else
1994              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, ncount); }
1995            break;            break;
1996    
1997              default:
1998              break;
1999              }
2000            }
2001          break;
2002    
2003          /*-----------------------------------------------------------------*/
2004          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2005          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2006          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2007          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2008          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2009            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2010          count = current_state->count;  /* Number already matched */
2011          if (clen > 0)
2012            {
2013            BOOL OK;
2014            switch (c)
2015              {
2016              VSPACE_CASES:
2017              OK = TRUE;
2018              break;
2019    
2020              default:
2021              OK = FALSE;
2022              }
2023    
2024            if (OK == (d == OP_VSPACE))
2025              {
2026              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2027                {
2028                active_count--;           /* Remove non-match possibility */
2029                next_active_state--;
2030                }
2031              if (++count >= (int)GET2(code, 1))
2032                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2033              else
2034                { ADD_NEW_DATA(-state_offset, count, 0); }
2035              }
2036            }
2037          break;
2038    
2039          /*-----------------------------------------------------------------*/
2040          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2041          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2042          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2043          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2044          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2045            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2046          count = current_state->count;  /* Number already matched */
2047          if (clen > 0)
2048            {
2049            BOOL OK;
2050            switch (c)
2051              {
2052              HSPACE_CASES:
2053              OK = TRUE;
2054              break;
2055    
2056            default:            default:
2057              OK = FALSE;
2058            break;            break;
2059            }            }
2060    
2061            if (OK == (d == OP_HSPACE))
2062              {
2063              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2064                {
2065                active_count--;           /* Remove non-match possibility */
2066                next_active_state--;
2067                }
2068              if (++count >= (int)GET2(code, 1))
2069                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2070              else
2071                { ADD_NEW_DATA(-state_offset, count, 0); }
2072              }
2073          }          }
2074        break;        break;
2075    
# Line 1364  for (;;) Line 2085  for (;;)
2085        break;        break;
2086    
2087        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2088        case OP_CHARNC:        case OP_CHARI:
2089        if (clen == 0) break;        if (clen == 0) break;
2090    
2091  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2092        if (utf8)        if (utf)
2093          {          {
2094          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2095            {            {
2096            unsigned int othercase;            unsigned int othercase;
2097            if (c < 128) othercase = fcc[c]; else            if (c < 128)
2098                othercase = fcc[c];
2099            /* If we have Unicode property support, we can use it to test the            else
2100            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2101                other case of the character. */
2102  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2103            othercase = _pcre_ucp_othercase(c);              othercase = UCD_OTHERCASE(c);
2104  #else  #else
2105            othercase = NOTACHAR;              othercase = NOTACHAR;
2106  #endif  #endif
2107    
2108            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2109            }            }
2110          }          }
2111        else        else
2112  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2113          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2114          {          {
2115          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2116              { ADD_NEW(state_offset + 2, 0); }
2117          }          }
2118        break;        break;
2119    
# Line 1404  for (;;) Line 2125  for (;;)
2125        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2126    
2127        case OP_EXTUNI:        case OP_EXTUNI:
2128        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
2129          {          {
2130          const uschar *nptr = ptr + clen;          int lgb, rgb;
2131            const pcre_uchar *nptr = ptr + clen;
2132          int ncount = 0;          int ncount = 0;
2133            lgb = UCD_GRAPHBREAK(c);
2134          while (nptr < end_subject)          while (nptr < end_subject)
2135            {            {
2136            int nclen = 1;            dlen = 1;
2137            GETCHARLEN(c, nptr, nclen);            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2138            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            rgb = UCD_GRAPHBREAK(d);
2139              if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2140            ncount++;            ncount++;
2141            nptr += nclen;            lgb = rgb;
2142              nptr += dlen;
2143            }            }
2144            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2145                reset_could_continue = TRUE;
2146          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2147          }          }
2148        break;        break;
# Line 1429  for (;;) Line 2156  for (;;)
2156        case OP_ANYNL:        case OP_ANYNL:
2157        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2158          {          {
2159          case 0x000a:          case CHAR_VT:
2160          case 0x000b:          case CHAR_FF:
2161          case 0x000c:          case CHAR_NEL:
2162          case 0x0085:  #ifndef EBCDIC
2163          case 0x2028:          case 0x2028:
2164          case 0x2029:          case 0x2029:
2165    #endif  /* Not EBCDIC */
2166            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2167    
2168            case CHAR_LF:
2169          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2170          break;          break;
2171          case 0x000d:  
2172          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          case CHAR_CR:
2173            if (ptr + 1 >= end_subject)
2174              {
2175              ADD_NEW(state_offset + 1, 0);
2176              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2177                reset_could_continue = TRUE;
2178              }
2179            else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2180            {            {
2181            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2182            }            }
# Line 1451  for (;;) Line 2189  for (;;)
2189        break;        break;
2190    
2191        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2192        /* Match a negated single character. This is only used for one-byte        case OP_NOT_VSPACE:
2193        characters, that is, we know that d < 256. The character we are        if (clen > 0) switch(c)
2194        checking (c) can be multibyte. */          {
2195            VSPACE_CASES:
2196            break;
2197    
2198            default:
2199            ADD_NEW(state_offset + 1, 0);
2200            break;
2201            }
2202          break;
2203    
2204          /*-----------------------------------------------------------------*/
2205          case OP_VSPACE:
2206          if (clen > 0) switch(c)
2207            {
2208            VSPACE_CASES:
2209            ADD_NEW(state_offset + 1, 0);
2210            break;
2211    
2212            default:
2213            break;
2214            }
2215          break;
2216    
2217          /*-----------------------------------------------------------------*/
2218          case OP_NOT_HSPACE:
2219          if (clen > 0) switch(c)
2220            {
2221            HSPACE_CASES:
2222            break;
2223    
2224            default:
2225            ADD_NEW(state_offset + 1, 0);
2226            break;
2227            }
2228          break;
2229    
2230          /*-----------------------------------------------------------------*/
2231          case OP_HSPACE:
2232          if (clen > 0) switch(c)
2233            {
2234            HSPACE_CASES:
2235            ADD_NEW(state_offset + 1, 0);
2236            break;
2237    
2238            default:
2239            break;
2240            }
2241          break;
2242    
2243          /*-----------------------------------------------------------------*/
2244          /* Match a negated single character casefully. */
2245    
2246        case OP_NOT:        case OP_NOT:
2247          if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2248          break;
2249    
2250          /*-----------------------------------------------------------------*/
2251          /* Match a negated single character caselessly. */
2252    
2253          case OP_NOTI:
2254        if (clen > 0)        if (clen > 0)
2255          {          {
2256          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;          unsigned int otherd;
2257          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }  #ifdef SUPPORT_UTF
2258            if (utf && d >= 128)
2259              {
2260    #ifdef SUPPORT_UCP
2261              otherd = UCD_OTHERCASE(d);
2262    #endif  /* SUPPORT_UCP */
2263              }
2264            else
2265    #endif  /* SUPPORT_UTF */
2266            otherd = TABLE_GET(d, fcc, d);
2267            if (c != d && c != otherd)
2268              { ADD_NEW(state_offset + dlen + 1, 0); }
2269          }          }
2270        break;        break;
2271    
2272        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2273          case OP_PLUSI:
2274          case OP_MINPLUSI:
2275          case OP_POSPLUSI:
2276          case OP_NOTPLUSI:
2277          case OP_NOTMINPLUSI:
2278          case OP_NOTPOSPLUSI:
2279          caseless = TRUE;
2280          codevalue -= OP_STARI - OP_STAR;
2281    
2282          /* Fall through */
2283        case OP_PLUS:        case OP_PLUS:
2284        case OP_MINPLUS:        case OP_MINPLUS:
2285        case OP_POSPLUS:        case OP_POSPLUS:
# Line 1474  for (;;) Line 2290  for (;;)
2290        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2291        if (clen > 0)        if (clen > 0)
2292          {          {
2293          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2294          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2295            {            {
2296  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2297            if (utf8 && d >= 128)            if (utf && d >= 128)
2298              {              {
2299  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2300              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2301  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2302              }              }
2303            else            else
2304  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2305            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2306            }            }
2307          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2308            {            {
# Line 1503  for (;;) Line 2319  for (;;)
2319        break;        break;
2320    
2321        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2322          case OP_QUERYI:
2323          case OP_MINQUERYI:
2324          case OP_POSQUERYI:
2325          case OP_NOTQUERYI:
2326          case OP_NOTMINQUERYI:
2327          case OP_NOTPOSQUERYI:
2328          caseless = TRUE;
2329          codevalue -= OP_STARI - OP_STAR;
2330          /* Fall through */
2331        case OP_QUERY:        case OP_QUERY:
2332        case OP_MINQUERY:        case OP_MINQUERY:
2333        case OP_POSQUERY:        case OP_POSQUERY:
# Line 1512  for (;;) Line 2337  for (;;)
2337        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2338        if (clen > 0)        if (clen > 0)
2339          {          {
2340          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2341          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2342            {            {
2343  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2344            if (utf8 && d >= 128)            if (utf && d >= 128)
2345              {              {
2346  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2347              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2348  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2349              }              }
2350            else            else
2351  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2352            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2353            }            }
2354          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2355            {            {
# Line 1539  for (;;) Line 2364  for (;;)
2364        break;        break;
2365    
2366        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2367          case OP_STARI:
2368          case OP_MINSTARI:
2369          case OP_POSSTARI:
2370          case OP_NOTSTARI:
2371          case OP_NOTMINSTARI:
2372          case OP_NOTPOSSTARI:
2373          caseless = TRUE;
2374          codevalue -= OP_STARI - OP_STAR;
2375          /* Fall through */
2376        case OP_STAR:        case OP_STAR:
2377        case OP_MINSTAR:        case OP_MINSTAR:
2378        case OP_POSSTAR:        case OP_POSSTAR:
# Line 1548  for (;;) Line 2382  for (;;)
2382        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2383        if (clen > 0)        if (clen > 0)
2384          {          {
2385          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2386          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2387            {            {
2388  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2389            if (utf8 && d >= 128)            if (utf && d >= 128)
2390              {              {
2391  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2392              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2393  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2394              }              }
2395            else            else
2396  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2397            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2398            }            }
2399          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2400            {            {
# Line 1575  for (;;) Line 2409  for (;;)
2409        break;        break;
2410    
2411        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2412          case OP_EXACTI:
2413          case OP_NOTEXACTI:
2414          caseless = TRUE;
2415          codevalue -= OP_STARI - OP_STAR;
2416          /* Fall through */
2417        case OP_EXACT:        case OP_EXACT:
2418        case OP_NOTEXACT:        case OP_NOTEXACT:
2419        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2420        if (clen > 0)        if (clen > 0)
2421          {          {
2422          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2423          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2424            {            {
2425  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2426            if (utf8 && d >= 128)            if (utf && d >= 128)
2427              {              {
2428  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2429              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2430  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2431              }              }
2432            else            else
2433  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2434            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2435            }            }
2436          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2437            {            {
2438            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
2439              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2440            else            else
2441              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2442            }            }
# Line 1605  for (;;) Line 2444  for (;;)
2444        break;        break;
2445    
2446        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2447          case OP_UPTOI:
2448          case OP_MINUPTOI:
2449          case OP_POSUPTOI:
2450          case OP_NOTUPTOI:
2451          case OP_NOTMINUPTOI:
2452          case OP_NOTPOSUPTOI:
2453          caseless = TRUE;
2454          codevalue -= OP_STARI - OP_STAR;
2455          /* Fall through */
2456        case OP_UPTO:        case OP_UPTO:
2457        case OP_MINUPTO:        case OP_MINUPTO:
2458        case OP_POSUPTO:        case OP_POSUPTO:
2459        case OP_NOTUPTO:        case OP_NOTUPTO:
2460        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2461        case OP_NOTPOSUPTO:        case OP_NOTPOSUPTO:
2462        ADD_ACTIVE(state_offset + dlen + 3, 0);        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2463        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2464        if (clen > 0)        if (clen > 0)
2465          {          {
2466          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2467          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2468            {            {
2469  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2470            if (utf8 && d >= 128)            if (utf && d >= 128)
2471              {              {
2472  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2473              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2474  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2475              }              }
2476            else            else
2477  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2478            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2479            }            }
2480          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2481            {            {
# Line 1636  for (;;) Line 2484  for (;;)
2484              active_count--;             /* Remove non-match possibility */              active_count--;             /* Remove non-match possibility */
2485              next_active_state--;              next_active_state--;
2486              }              }
2487            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
2488              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2489            else            else
2490              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2491            }            }
# Line 1654  for (;;) Line 2502  for (;;)
2502          {          {
2503          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2504          int next_state_offset;          int next_state_offset;
2505          const uschar *ecode;          const pcre_uchar *ecode;
2506    
2507          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2508          can set isinclass from it. */          can set isinclass from it. */
2509    
2510          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2511            {            {
2512            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2513            if (clen > 0)            if (clen > 0)
2514              {              {
2515              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2516                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2517              }              }
2518            }            }
2519    
# Line 1676  for (;;) Line 2524  for (;;)
2524          else          else
2525           {           {
2526           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2527           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2528           }           }
2529    
2530          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2531          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2532          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2533    
2534          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2535    
2536          switch (*ecode)          switch (*ecode)
2537            {            {
# Line 1709  for (;;) Line 2557  for (;;)
2557            case OP_CRRANGE:            case OP_CRRANGE:
2558            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2559            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2560            if (count >= GET2(ecode, 1))            if (count >= (int)GET2(ecode, 1))
2561              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2562            if (isinclass)            if (isinclass)
2563              {              {
2564              int max = GET2(ecode, 3);              int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2565              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2566                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2567              else              else
2568                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2569              }              }
# Line 1730  for (;;) Line 2578  for (;;)
2578    
2579  /* ========================================================================== */  /* ========================================================================== */
2580        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2581        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2582          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2583          though the other "backtracking verbs" are not supported. */
2584    
2585          case OP_FAIL:
2586          forced_fail++;    /* Count FAILs for multiple states */
2587          break;
2588    
2589        case OP_ASSERT:        case OP_ASSERT:
2590        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1740  for (;;) Line 2594  for (;;)
2594          int rc;          int rc;
2595          int local_offsets[2];          int local_offsets[2];
2596          int local_workspace[1000];          int local_workspace[1000];
2597          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2598    
2599          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2600    
# Line 1748  for (;;) Line 2602  for (;;)
2602            md,                                   /* static match data */            md,                                   /* static match data */
2603            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2604            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2605            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2606            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2607            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2608            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2609            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2610            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2611    
2612            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2613          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2614              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2615          }          }
2616        break;        break;
2617    
# Line 1768  for (;;) Line 2621  for (;;)
2621          {          {
2622          int local_offsets[1000];          int local_offsets[1000];
2623          int local_workspace[1000];          int local_workspace[1000];
2624          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2625            int condcode;
2626    
2627            /* Because of the way auto-callout works during compile, a callout item
2628            is inserted between OP_COND and an assertion condition. This does not
2629            happen for the other conditions. */
2630    
2631            if (code[LINK_SIZE+1] == OP_CALLOUT)
2632              {
2633              rrc = 0;
2634              if (PUBL(callout) != NULL)
2635                {
2636                PUBL(callout_block) cb;
2637                cb.version          = 1;   /* Version 1 of the callout block */
2638                cb.callout_number   = code[LINK_SIZE+2];
2639                cb.offset_vector    = offsets;
2640    #if defined COMPILE_PCRE8
2641                cb.subject          = (PCRE_SPTR)start_subject;
2642    #elif defined COMPILE_PCRE16
2643                cb.subject          = (PCRE_SPTR16)start_subject;
2644    #elif defined COMPILE_PCRE32
2645                cb.subject          = (PCRE_SPTR32)start_subject;
2646    #endif
2647                cb.subject_length   = (int)(end_subject - start_subject);
2648                cb.start_match      = (int)(current_subject - start_subject);
2649                cb.current_position = (int)(ptr - start_subject);
2650                cb.pattern_position = GET(code, LINK_SIZE + 3);
2651                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2652                cb.capture_top      = 1;
2653                cb.capture_last     = -1;
2654                cb.callout_data     = md->callout_data;
2655                cb.mark             = NULL;   /* No (*MARK) support */
2656                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2657                }
2658              if (rrc > 0) break;                      /* Fail this thread */
2659              code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2660              }
2661    
2662            condcode = code[LINK_SIZE+1];
2663    
2664          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2665    
2666          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2667              return PCRE_ERROR_DFA_UCOND;
2668    
2669          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2670    
2671          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2672            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2673    
2674          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2675          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2676          recursed groups. */          recursed groups. */
2677    
2678          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2679            {            {
2680            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE + 2);
2681            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2682            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (md->recursive != NULL)
2683              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2684              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2685            }            }
2686    
2687          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1798  for (;;) Line 2689  for (;;)
2689          else          else
2690            {            {
2691            int rc;            int rc;
2692            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2693            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2694    
2695            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2696    
# Line 1807  for (;;) Line 2698  for (;;)
2698              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2699              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2700              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2701              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2702              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2703              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2704              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2705              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2706              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2707    
2708              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2709            if ((rc >= 0) ==            if ((rc >= 0) ==
2710                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2711              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2712            else            else
2713              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2714            }            }
2715          }          }
2716        break;        break;
# Line 1828  for (;;) Line 2718  for (;;)
2718        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2719        case OP_RECURSE:        case OP_RECURSE:
2720          {          {
2721            dfa_recursion_info *ri;
2722          int local_offsets[1000];          int local_offsets[1000];
2723          int local_workspace[1000];          int local_workspace[1000];
2724            const pcre_uchar *callpat = start_code + GET(code, 1);
2725            int recno = (callpat == md->start_code)? 0 :
2726              GET2(callpat, 1 + LINK_SIZE);
2727          int rc;          int rc;
2728    
2729          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2730            recursing + 1));  
2731            /* Check for repeating a recursion without advancing the subject
2732            pointer. This should catch convoluted mutual recursions. (Some simple
2733            cases are caught at compile time.) */
2734    
2735            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2736              if (recno == ri->group_num && ptr == ri->subject_position)
2737                return PCRE_ERROR_RECURSELOOP;
2738    
2739            /* Remember this recursion and where we started it so as to
2740            catch infinite loops. */
2741    
2742            new_recursive.group_num = recno;
2743            new_recursive.subject_position = ptr;
2744            new_recursive.prevrec = md->recursive;
2745            md->recursive = &new_recursive;
2746    
2747          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2748            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2749            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2750            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2751            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2752            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2753            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2754            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2755            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2756            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
2757            rlevel,                               /* function recursion level */  
2758            recursing + 1);                       /* regex recurse level */          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2759    
2760          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2761            recursing + 1, rc));            rc));
2762    
2763          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2764    
# Line 1863  for (;;) Line 2772  for (;;)
2772            {            {
2773            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2774              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2775              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2776              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2777                if (utf)
2778                  {
2779                  const pcre_uchar *p = start_subject + local_offsets[rc];
2780                  const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2781                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2782                  }
2783    #endif
2784              if (charcount > 0)              if (charcount > 0)
2785                {                {
2786                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 1882  for (;;) Line 2796  for (;;)
2796        break;        break;
2797    
2798        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2799          case OP_BRAPOS:
2800          case OP_SBRAPOS:
2801          case OP_CBRAPOS:
2802          case OP_SCBRAPOS:
2803          case OP_BRAPOSZERO:
2804            {
2805            int charcount, matched_count;
2806            const pcre_uchar *local_ptr = ptr;
2807            BOOL allow_zero;
2808    
2809            if (codevalue == OP_BRAPOSZERO)
2810              {
2811              allow_zero = TRUE;
2812              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2813              }
2814            else allow_zero = FALSE;
2815    
2816            /* Loop to match the subpattern as many times as possible as if it were
2817            a complete pattern. */
2818    
2819            for (matched_count = 0;; matched_count++)
2820              {
2821              int local_offsets[2];
2822              int local_workspace[1000];
2823    
2824              int rc = internal_dfa_exec(
2825                md,                                   /* fixed match data */
2826                code,                                 /* this subexpression's code */
2827                local_ptr,                            /* where we currently are */
2828                (int)(ptr - start_subject),           /* start offset */
2829                local_offsets,                        /* offset vector */
2830                sizeof(local_offsets)/sizeof(int),    /* size of same */
2831                local_workspace,                      /* workspace vector */
2832                sizeof(local_workspace)/sizeof(int),  /* size of same */
2833                rlevel);                              /* function recursion level */
2834    
2835              /* Failed to match */
2836    
2837              if (rc < 0)
2838                {
2839                if (rc != PCRE_ERROR_NOMATCH) return rc;
2840                break;
2841                }
2842    
2843              /* Matched: break the loop if zero characters matched. */
2844    
2845              charcount = local_offsets[1] - local_offsets[0];
2846              if (charcount == 0) break;
2847              local_ptr += charcount;    /* Advance temporary position ptr */
2848              }
2849    
2850            /* At this point we have matched the subpattern matched_count
2851            times, and local_ptr is pointing to the character after the end of the
2852            last match. */
2853    
2854            if (matched_count > 0 || allow_zero)
2855              {
2856              const pcre_uchar *end_subpattern = code;
2857              int next_state_offset;
2858    
2859              do { end_subpattern += GET(end_subpattern, 1); }
2860                while (*end_subpattern == OP_ALT);
2861              next_state_offset =
2862                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2863    
2864              /* Optimization: if there are no more active states, and there
2865              are no new states yet set up, then skip over the subject string
2866              right here, to save looping. Otherwise, set up the new state to swing
2867              into action when the end of the matched substring is reached. */
2868    
2869              if (i + 1 >= active_count && new_count == 0)
2870                {
2871                ptr = local_ptr;
2872                clen = 0;
2873                ADD_NEW(next_state_offset, 0);
2874                }
2875              else
2876                {
2877                const pcre_uchar *p = ptr;
2878                const pcre_uchar *pp = local_ptr;
2879                charcount = (int)(pp - p);
2880    #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2881                if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2882    #endif
2883                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2884                }
2885              }
2886            }
2887          break;
2888    
2889          /*-----------------------------------------------------------------*/
2890        case OP_ONCE:        case OP_ONCE:
2891          case OP_ONCE_NC:
2892          {          {
2893          int local_offsets[2];          int local_offsets[2];
2894          int local_workspace[1000];          int local_workspace[1000];
# Line 1891  for (;;) Line 2897  for (;;)
2897            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2898            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2899            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2900            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2901            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2902            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2903            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2904            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2905            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2906    
2907          if (rc >= 0)          if (rc >= 0)
2908            {            {
2909            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2910            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2911            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2912    
2913            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2914              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2915            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2916                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2917    
2918            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2919            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 1916  for (;;) Line 2921  for (;;)
2921    
2922            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2923                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2924              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2925    
2926            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2927            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 1931  for (;;) Line 2936  for (;;)
2936            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2937            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2938            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2939            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2940    
2941            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2942              {              {
# Line 1954  for (;;) Line 2959  for (;;)
2959              }              }
2960            else            else
2961              {              {
2962              const uschar *p = start_subject + local_offsets[0];  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2963              const uschar *pp = start_subject + local_offsets[1];              if (utf)
2964              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;                {
2965                  const pcre_uchar *p = start_subject + local_offsets[0];
2966                  const pcre_uchar *pp = start_subject + local_offsets[1];
2967                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2968                  }
2969    #endif
2970              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2971              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2972                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2973              }              }
   
2974            }            }
2975          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2976          }          }
# Line 1972  for (;;) Line 2981  for (;;)
2981        /* Handle callouts */        /* Handle callouts */
2982    
2983        case OP_CALLOUT:        case OP_CALLOUT:
2984        if (pcre_callout != NULL)        rrc = 0;
2985          if (PUBL(callout) != NULL)
2986          {          {
2987          int rrc;          PUBL(callout_block) cb;
         pcre_callout_block cb;  
2988          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2989          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2990          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2991    #if defined COMPILE_PCRE8
2992          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2993          cb.subject_length   = end_subject - start_subject;  #elif defined COMPILE_PCRE16
2994          cb.start_match      = current_subject - start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
2995          cb.current_position = ptr - start_subject;  #elif defined COMPILE_PCRE32
2996            cb.subject          = (PCRE_SPTR32)start_subject;
2997    #endif
2998            cb.subject_length   = (int)(end_subject - start_subject);
2999            cb.start_match      = (int)(current_subject - start_subject);
3000            cb.current_position = (int)(ptr - start_subject);
3001          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
3002          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
3003          cb.capture_top      = 1;          cb.capture_top      = 1;
3004          cb.capture_last     = -1;          cb.capture_last     = -1;
3005          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
3006          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
3007          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3008          }          }
3009          if (rrc == 0)
3010            { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3011        break;        break;
3012    
3013    
# Line 2006  for (;;) Line 3023  for (;;)
3023    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
3024    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
3025    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
3026    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
3027    
3028      The "forced_fail" variable counts the number of (*F) encountered for the
3029      character. If it is equal to the original active_count (saved in
3030      workspace[1]) it means that (*F) was found on every active state. In this
3031      case we don't want to give a partial match.
3032    
3033      The "could_continue" variable is true if a state could have continued but
3034      for the fact that the end of the subject was reached. */
3035    
3036    if (new_count <= 0)    if (new_count <= 0)
3037      {      {
3038      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
3039          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on, and */
3040          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
3041          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
3042          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3043        {          ||                                           /* or... */
3044        if (offsetcount >= 2)          ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3045          {           match_count < 0)                            /* no matches */
3046          offsets[0] = current_subject - start_subject;          ) &&                                         /* And... */
3047          offsets[1] = end_subject - start_subject;          (
3048          }          partial_newline ||                           /* Either partial NL */
3049              (                                          /* or ... */
3050              ptr >= end_subject &&                /* End of subject and */
3051              ptr > md->start_used_ptr)            /* Inspected non-empty string */
3052              )
3053            )
3054        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
       }  
   
3055      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3056        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3057        rlevel*2-2, SP));        rlevel*2-2, SP));
# Line 2073  Returns:          > 0 => number of match Line 3101  Returns:          > 0 => number of match
3101                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3102  */  */
3103    
3104  PCRE_EXP_DEFN int  #if defined COMPILE_PCRE8
3105    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3106  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3107    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3108    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3109    #elif defined COMPILE_PCRE16
3110    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3111    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3112      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3113      int offsetcount, int *workspace, int wscount)
3114    #elif defined COMPILE_PCRE32
3115    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3116    pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3117      PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3118      int offsetcount, int *workspace, int wscount)
3119    #endif
3120  {  {
3121  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3122  dfa_match_data match_block;  dfa_match_data match_block;
3123  dfa_match_data *md = &match_block;  dfa_match_data *md = &match_block;
3124  BOOL utf8, anchored, startline, firstline;  BOOL utf, anchored, startline, firstline;
3125  const uschar *current_subject, *end_subject, *lcc;  const pcre_uchar *current_subject, *end_subject;
   
 pcre_study_data internal_study;  
3126  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3127    
3128  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3129  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3130  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3131  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3132  int first_byte = -1;  pcre_uchar first_char = 0;
3133  int req_byte = -1;  pcre_uchar first_char2 = 0;
3134  int req_byte2 = -1;  pcre_uchar req_char = 0;
3135    pcre_uchar req_char2 = 0;
3136  int newline;  int newline;
3137    
3138  /* Plausibility checks */  /* Plausibility checks */
# Line 2104  if (re == NULL || subject == NULL || wor Line 3142  if (re == NULL || subject == NULL || wor
3142     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3143  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3144  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3145    if (length < 0) return PCRE_ERROR_BADLENGTH;
3146    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3147    
3148  /* We need to find the pointer to any study data before we test for byte  /* Check that the first field in the block is the magic number. If it is not,
3149  flipping, so we scan the extra_data block first. This may set two fields in the  return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3150  match block, so we must initialize them beforehand. However, the other fields  REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3151  in the match block must not be set until after the byte flipping. */  means that the pattern is likely compiled with different endianness. */
3152    
3153    if (re->magic_number != MAGIC_NUMBER)
3154      return re->magic_number == REVERSED_MAGIC_NUMBER?
3155        PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3156    if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3157    
3158    /* If restarting after a partial match, do some sanity checks on the contents
3159    of the workspace. */
3160    
3161    if ((options & PCRE_DFA_RESTART) != 0)
3162      {
3163      if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3164        workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3165          return PCRE_ERROR_DFA_BADRESTART;
3166      }
3167    
3168    /* Set up study, callout, and table data */
3169    
3170  md->tables = re->tables;  md->tables = re->tables;
3171  md->callout_data = NULL;  md->callout_data = NULL;
# Line 2127  if (extra_data != NULL) Line 3184  if (extra_data != NULL)
3184      md->tables = extra_data->tables;      md->tables = extra_data->tables;
3185    }    }
3186    
 /* Check that the first field in the block is the magic number. If it is not,  
 test for a regex that was compiled on a host of opposite endianness. If this is  
 the case, flipped values are put in internal_re and internal_study if there was  
 study data too. */  
   
 if (re->magic_number != MAGIC_NUMBER)  
   {  
   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);  
   if (re == NULL) return PCRE_ERROR_BADMAGIC;  
   if (study != NULL) study = &internal_study;  
   }  
   
3187  /* Set some local values */  /* Set some local values */
3188    
3189  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3190  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3191  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3192    
3193  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3194  utf8 = (re->options & PCRE_UTF8) != 0;  /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3195    utf = (re->options & PCRE_UTF8) != 0;
3196  #else  #else
3197  utf8 = FALSE;  utf = FALSE;
3198  #endif  #endif
3199    
3200  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
# Line 2156  anchored = (options & (PCRE_ANCHORED|PCR Line 3202  anchored = (options & (PCRE_ANCHORED|PCR
3202    
3203  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3204    
3205  md->start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3206      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3207  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3208  md->end_subject = end_subject;  md->end_subject = end_subject;
3209    md->start_offset = start_offset;
3210  md->moptions = options;  md->moptions = options;
3211  md->poptions = re->options;  md->poptions = re->options;
3212    
3213    /* If the BSR option is not set at match time, copy what was set
3214    at compile time. */
3215    
3216    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3217      {
3218      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3219        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3220    #ifdef BSR_ANYCRLF
3221      else md->moptions |= PCRE_BSR_ANYCRLF;
3222    #endif
3223      }
3224    
3225  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
3226  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
3227    
# Line 2170  switch ((((options & PCRE_NEWLINE_BITS) Line 3229  switch ((((options & PCRE_NEWLINE_BITS)
3229           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
3230    {    {
3231    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
3232    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3233    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3234    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
3235         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3236    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
3237      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3238    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
3239    }    }
3240    
3241  if (newline < 0)  if (newline == -2)
3242      {
3243      md->nltype = NLTYPE_ANYCRLF;
3244      }
3245    else if (newline < 0)
3246    {    {
3247    md->nltype = NLTYPE_ANY;    md->nltype = NLTYPE_ANY;
3248    }    }
# Line 2201  else Line 3265  else
3265  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF string if required. On failure, the error offset and reason code
3266  back the character offset. */  are passed back in offsets[0] and offsets[1] when offsetcount is at least 2. */
3267    
3268  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3269  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3270    {    {
3271    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3272      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3273    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3274      {      {
3275      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3276        {        {
3277        tb &= 0xc0;        offsets[0] = erroroffset;
3278        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3279        }        }
3280    #if defined COMPILE_PCRE8
3281        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3282          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3283    #elif defined COMPILE_PCRE16
3284        return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3285          PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3286    #elif defined COMPILE_PCRE32
3287        return PCRE_ERROR_BADUTF32;
3288    #endif
3289      }      }
3290    #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3291      if (start_offset > 0 && start_offset < length &&
3292            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3293        return PCRE_ERROR_BADUTF8_OFFSET;
3294    #endif
3295    }    }
3296  #endif  #endif
3297    
# Line 2222  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3299  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3299  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3300  in other programs later. */  in other programs later. */
3301    
3302  if (md->tables == NULL) md->tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3303    
3304  /* The lower casing table and the "must be at the start of a line" flag are  /* The "must be at the start of a line" flags are used in a loop when finding
3305  used in a loop when finding where to start. */  where to start. */
3306    
3307  lcc = md->tables + lcc_offset;  startline = (re->flags & PCRE_STARTLINE) != 0;
 startline = (re->options & PCRE_STARTLINE) != 0;  
3308  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3309    
3310  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2239  studied, there may be a bitmap of possib Line 3315  studied, there may be a bitmap of possib
3315    
3316  if (!anchored)  if (!anchored)
3317    {    {
3318    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3319      {      {
3320      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3321      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = (pcre_uchar)(re->first_char);
3322        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3323          {
3324          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3325    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3326          if (utf && first_char > 127)
3327            first_char2 = UCD_OTHERCASE(first_char);
3328    #endif
3329          }
3330      }      }
3331    else    else
3332      {      {
3333      if (startline && study != NULL &&      if (!startline && study != NULL &&
3334           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3335        start_bits = study->start_bits;        start_bits = study->start_bits;
3336      }      }
3337    }    }
# Line 2256  if (!anchored) Line 3339  if (!anchored)
3339  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
3340  character" set. */  character" set. */
3341    
3342  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3343    {    {
3344    req_byte = re->req_byte & 255;    has_req_char = TRUE;
3345    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_char = req_char2 = (pcre_uchar)(re->req_char);
3346    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3347        {
3348        req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3349    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3350        if (utf && req_char > 127)
3351          req_char2 = UCD_OTHERCASE(req_char);
3352    #endif
3353        }
3354    }    }
3355    
3356  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
3357  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
3358  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
3359    
3360  for (;;)  for (;;)
3361    {    {
# Line 2274  for (;;) Line 3363  for (;;)
3363    
3364    if ((options & PCRE_DFA_RESTART) == 0)    if ((options & PCRE_DFA_RESTART) == 0)
3365      {      {
3366      const uschar *save_end_subject = end_subject;      const pcre_uchar *save_end_subject = end_subject;
3367    
3368      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
3369      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
3370      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
3371      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
3372    
3373      if (firstline)      if (firstline)
3374        {        {
3375        const uschar *t = current_subject;        PCRE_PUCHAR t = current_subject;
3376    #ifdef SUPPORT_UTF
3377          if (utf)
3378            {
3379            while (t < md->end_subject && !IS_NEWLINE(t))
3380              {
3381              t++;
3382              ACROSSCHAR(t < end_subject, *t, t++);
3383              }
3384            }
3385          else
3386    #endif
3387        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3388        end_subject = t;        end_subject = t;
3389        }        }
3390    
3391      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
3392        starting point is not found. However, there is an option that disables
3393        these, for testing and for ensuring that all callouts do actually occur.
3394        The option can be set in the regex by (*NO_START_OPT) or passed in
3395        match-time options. */
3396    
3397        if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3398        {        {
3399        if (first_byte_caseless)        /* Advance to a known first char. */
3400          while (current_subject < end_subject &&  
3401                 lcc[*current_subject] != first_byte)        if (has_first_char)
3402            current_subject++;          {
3403        else          if (first_char != first_char2)
3404          while (current_subject < end_subject && *current_subject != first_byte)            {
3405            current_subject++;            pcre_uchar csc;
3406        }            while (current_subject < end_subject &&
3407                     (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3408                current_subject++;
3409              }
3410            else
3411              while (current_subject < end_subject &&
3412                     RAWUCHARTEST(current_subject) != first_char)
3413                current_subject++;
3414            }
3415    
3416      /* Or to just after a linebreak for a multiline match if possible */        /* Or to just after a linebreak for a multiline match if possible */
3417    
3418      else if (startline)        else if (startline)
       {  
       if (current_subject > md->start_subject + start_offset)  
3419          {          {
3420          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (current_subject > md->start_subject + start_offset)
3421            current_subject++;            {
3422    #ifdef SUPPORT_UTF
3423              if (utf)
3424                {
3425                while (current_subject < end_subject &&
3426                       !WAS_NEWLINE(current_subject))
3427                  {
3428                  current_subject++;
3429                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3430                    current_subject++);
3431                  }
3432                }
3433              else
3434    #endif
3435              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3436                current_subject++;
3437    
3438          /* If we have just passed a CR and the newline option is ANY, and we            /* If we have just passed a CR and the newline option is ANY or
3439          are now at a LF, advance the match position by one more character. */            ANYCRLF, and we are now at a LF, advance the match position by one
3440              more character. */
3441    
3442          if (current_subject[-1] == '\r' &&            if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3443               md->nltype == NLTYPE_ANY &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3444               current_subject < end_subject &&                 current_subject < end_subject &&
3445               *current_subject == '\n')                 RAWUCHARTEST(current_subject) == CHAR_NL)
3446            current_subject++;              current_subject++;
3447              }
3448          }          }
       }  
3449    
3450      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
3451    
3452      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
3453          {          {
3454          register unsigned int c = *current_subject;          while (current_subject < end_subject)
3455          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
3456              register pcre_uint32 c = RAWUCHARTEST(current_subject);
3457    #ifndef COMPILE_PCRE8
3458              if (c > 255) c = 255;
3459    #endif
3460              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3461                {
3462                current_subject++;
3463    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3464                /* In non 8-bit mode, the iteration will stop for
3465                characters > 255 at the beginning or not stop at all. */
3466                if (utf)
3467                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3468                    current_subject++);
3469    #endif
3470                }
3471            else break;            else break;
3472              }
3473          }          }
3474        }        }
3475    
3476      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3477    
3478      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
   
   /* If req_byte is set, we know that that character must appear in the subject  
   for the match to succeed. If the first character is set, req_byte must be  
   later in the subject; otherwise the test starts at the match point. This  
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested.  
   */  
   
   if (req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
3479    
3480      /* We don't need to repeat the search if we haven't yet reached the      /* The following two optimizations are disabled for partial matching or if
3481      place we found it at last time. */      disabling is explicitly requested (and of course, by the test above, this
3482        code is not obeyed when restarting after a partial match). */
3483    
3484      if (p > req_byte_ptr)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3485            (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3486        {        {
3487        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3488          {        is a lower bound; it may be that no string of that length actually matches the
3489          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3490            {        bytes to avoid spending too much time in this optimization. */
3491            register int pp = *p++;  
3492            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3493            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3494          }          return PCRE_ERROR_NOMATCH;
3495        else  
3496          /* If req_char is set, we know that that character must appear in the
3497          subject for the match to succeed. If the first character is set, req_char
3498          must be later in the subject; otherwise the test starts at the match
3499          point. This optimization can save a huge amount of work in patterns with
3500          nested unlimited repeats that aren't going to match. Writing separate
3501          code for cased/caseless versions makes it go faster, as does using an
3502          autoincrement and backing off on a match.
3503    
3504          HOWEVER: when the subject string is very, very long, searching to its end
3505          can take a long time, and give bad performance on quite ordinary
3506          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3507          string... so we don't do this when the string is sufficiently long. */
3508    
3509          if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3510          {          {
3511          while (p < end_subject)          register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3512    
3513            /* We don't need to repeat the search if we haven't yet reached the
3514            place we found it at last time. */
3515    
3516            if (p > req_char_ptr)
3517            {            {
3518            if (*p++ == req_byte) { p--; break; }            if (req_char != req_char2)
3519            }              {
3520          }              while (p < end_subject)
3521                  {
3522                  register pcre_uint32 pp = RAWUCHARINCTEST(p);
3523                  if (pp == req_char || pp == req_char2) { p--; break; }
3524                  }
3525                }
3526              else
3527                {
3528                while (p < end_subject)
3529                  {
3530                  if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3531                  }
3532                }
3533    
3534        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3535        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3536    
3537        if (p >= end_subject) break;            if (p >= end_subject) break;
3538    
3539        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3540        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3541        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3542    
3543        req_byte_ptr = p;            req_char_ptr = p;
3544              }
3545            }
3546        }        }
3547      }      }   /* End of optimizations that are done when not restarting */
3548    
3549    /* OK, now we can do the business */    /* OK, now we can do the business */
3550    
3551      md->start_used_ptr = current_subject;
3552      md->recursive = NULL;
3553    
3554    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3555      md,                                /* fixed match data */      md,                                /* fixed match data */
3556      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2404  for (;;) Line 3560  for (;;)
3560      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3561      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3562      wscount,                           /* size of same */      wscount,                           /* size of same */
3563      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3564    
3565    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3566    on only if not anchored. */    on only if not anchored. */
3567    
3568    if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;    if (rc != PCRE_ERROR_NOMATCH || anchored)
3569        {
3570        if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3571          {
3572          offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3573          offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3574          if (offsetcount > 2)
3575            offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3576          }
3577        return rc;
3578        }
3579    
3580    /* Advance to the next subject character unless we are at the end of a line    /* Advance to the next subject character unless we are at the end of a line
3581    and firstline is set. */    and firstline is set. */
3582    
3583    if (firstline && IS_NEWLINE(current_subject)) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3584    current_subject++;    current_subject++;
3585    if (utf8)  #ifdef SUPPORT_UTF
3586      if (utf)
3587      {      {
3588      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      ACROSSCHAR(current_subject < end_subject, *current_subject,
3589        current_subject++;        current_subject++);
3590      }      }
3591    #endif
3592    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3593    
3594    /* If we have just passed a CR and the newline option is CRLF or ANY, and we    /* If we have just passed a CR and we are now at a LF, and the pattern does
3595    are now at a LF, advance the match position by one more character. */    not contain any explicit matches for \r or \n, and the newline option is CRLF
3596      or ANY or ANYCRLF, advance the match position by one more character. */
3597    if (current_subject[-1] == '\r' &&  
3598         (md->nltype == NLTYPE_ANY || md->nllen == 2) &&    if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3599         current_subject < end_subject &&        current_subject < end_subject &&
3600         *current_subject == '\n')        RAWUCHARTEST(current_subject) == CHAR_NL &&
3601          (re->flags & PCRE_HASCRORLF) == 0 &&
3602            (md->nltype == NLTYPE_ANY ||
3603             md->nltype == NLTYPE_ANYCRLF ||
3604             md->nllen == 2))
3605      current_subject++;      current_subject++;
3606    
3607    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.145  
changed lines
  Added in v.1363

  ViewVC Help
Powered by ViewVC 1.1.5