/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 145 by ph10, Wed Apr 4 14:06:52 2007 UTC | revision 1033 by ph10, Mon Sep 10 11:02:48 2012 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 37  POSSIBILITY OF SUCH DAMAGE. Line 38  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
39  */  */
40    
   
41  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
42  alternative matching function that uses a sort of DFA algorithm (not a true  alternative matching function that uses a sort of DFA algorithm (not a true
43  FSM). This is NOT Perl- compatible, but it has advantages in certain  FSM). This is NOT Perl-compatible, but it has advantages in certain
44  applications. */  applications. */
45    
46    
47    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48    the performance of his patterns greatly. I could not use it as it stood, as it
49    was not thread safe, and made assumptions about pattern sizes. Also, it caused
50    test 7 to loop, and test 9 to crash with a segfault.
51    
52    The issue is the check for duplicate states, which is done by a simple linear
53    search up the state list. (Grep for "duplicate" below to find the code.) For
54    many patterns, there will never be many states active at one time, so a simple
55    linear search is fine. In patterns that have many active states, it might be a
56    bottleneck. The suggested code used an indexing scheme to remember which states
57    had previously been used for each character, and avoided the linear search when
58    it knew there was no chance of a duplicate. This was implemented when adding
59    states to the state lists.
60    
61    I wrote some thread-safe, not-limited code to try something similar at the time
62    of checking for duplicates (instead of when adding states), using index vectors
63    on the stack. It did give a 13% improvement with one specially constructed
64    pattern for certain subject strings, but on other strings and on many of the
65    simpler patterns in the test suite it did worse. The major problem, I think,
66    was the extra time to initialize the index. This had to be done for each call
67    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68    only once - I suspect this was the cause of the problems with the tests.)
69    
70    Overall, I concluded that the gains in some cases did not outweigh the losses
71    in others, so I abandoned this code. */
72    
73    
74    
75    #ifdef HAVE_CONFIG_H
76    #include "config.h"
77    #endif
78    
79  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
80  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
81  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 56  applications. */ Line 88  applications. */
88  #define SP "                   "  #define SP "                   "
89    
90    
   
91  /*************************************************  /*************************************************
92  *      Code parameters and static tables         *  *      Code parameters and static tables         *
93  *************************************************/  *************************************************/
94    
95  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
97  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
98    never stored, so we push them well clear of the normal opcodes. */
99    
100  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
101  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
102  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
103    #define OP_HSPACE_EXTRA     360
104    #define OP_VSPACE_EXTRA     380
105    
106    
107  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
108  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
109  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
110  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111  small value. */  small value. Non-zero values in the table are the offsets from the opcode where
112    the character is to be found. ***NOTE*** If the start of this table is
113    modified, the three tables that follow must also be modified. */
114    
115  static uschar coptable[] = {  static const pcre_uint8 coptable[] = {
116    0,                             /* End                                    */    0,                             /* End                                    */
117    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118    0, 0,                          /* Any, Anybyte                           */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0,                          /* \P, \p                                 */
121      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122      0,                             /* \X                                     */
123      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
124    1,                             /* Char                                   */    1,                             /* Char                                   */
125    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
126    1,                             /* not                                    */    1,                             /* not                                    */
127      1,                             /* noti                                   */
128    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
129    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1+IMM2_SIZE,                   /* exact                                  */
132      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135      1+IMM2_SIZE,                   /* exact I                                */
136      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
138    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1+IMM2_SIZE,                   /* NOT exact                              */
141      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144      1+IMM2_SIZE,                   /* NOT exact I                            */
145      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
147    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149    1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */    1+IMM2_SIZE,                   /* Type exact                             */
150      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
152    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 104  static uschar coptable[] = { Line 155  static uschar coptable[] = {
155    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
156    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
157    0,                             /* REF                                    */    0,                             /* REF                                    */
158      0,                             /* REFI                                   */
159    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
160    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
161    0,                             /* Alt                                    */    0,                             /* Alt                                    */
162    0,                             /* Ket                                    */    0,                             /* Ket                                    */
163    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
164    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
165      0,                             /* KetRpos                                */
166      0,                             /* Reverse                                */
167    0,                             /* Assert                                 */    0,                             /* Assert                                 */
168    0,                             /* Assert not                             */    0,                             /* Assert not                             */
169    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
170    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
171      0, 0,                          /* ONCE, ONCE_NC                          */
172      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
173      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
174      0, 0,                          /* CREF, NCREF                            */
175      0, 0,                          /* RREF, NRREF                            */
176      0,                             /* DEF                                    */
177      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
178      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
179      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
180      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
181      0, 0                           /* CLOSE, SKIPZERO  */
182    };
183    
184    /* This table identifies those opcodes that inspect a character. It is used to
185    remember the fact that a character could have been inspected when the end of
186    the subject is reached. ***NOTE*** If the start of this table is modified, the
187    two tables that follow must also be modified. */
188    
189    static const pcre_uint8 poptable[] = {
190      0,                             /* End                                    */
191      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
192      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
193      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
194      1, 1,                          /* \P, \p                                 */
195      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
196      1,                             /* \X                                     */
197      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
198      1,                             /* Char                                   */
199      1,                             /* Chari                                  */
200      1,                             /* not                                    */
201      1,                             /* noti                                   */
202      /* Positive single-char repeats                                          */
203      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
204      1, 1, 1,                       /* upto, minupto, exact                   */
205      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
206      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
207      1, 1, 1,                       /* upto I, minupto I, exact I             */
208      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
209      /* Negative single-char repeats - only for chars < 256                   */
210      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
211      1, 1, 1,                       /* NOT upto, minupto, exact               */
212      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
213      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
214      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
215      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
216      /* Positive type repeats                                                 */
217      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
218      1, 1, 1,                       /* Type upto, minupto, exact              */
219      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
220      /* Character class & ref repeats                                         */
221      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
222      1, 1,                          /* CRRANGE, CRMINRANGE                    */
223      1,                             /* CLASS                                  */
224      1,                             /* NCLASS                                 */
225      1,                             /* XCLASS - variable length               */
226      0,                             /* REF                                    */
227      0,                             /* REFI                                   */
228      0,                             /* RECURSE                                */
229      0,                             /* CALLOUT                                */
230      0,                             /* Alt                                    */
231      0,                             /* Ket                                    */
232      0,                             /* KetRmax                                */
233      0,                             /* KetRmin                                */
234      0,                             /* KetRpos                                */
235    0,                             /* Reverse                                */    0,                             /* Reverse                                */
236    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0,                             /* Assert                                 */
237    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0,                             /* Assert not                             */
238    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
239    0,                             /* RREF                                   */    0,                             /* Assert behind not                      */
240      0, 0,                          /* ONCE, ONCE_NC                          */
241      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
242      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
243      0, 0,                          /* CREF, NCREF                            */
244      0, 0,                          /* RREF, NRREF                            */
245    0,                             /* DEF                                    */    0,                             /* DEF                                    */
246    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
247      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
248      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
249      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
250      0, 0                           /* CLOSE, SKIPZERO                        */
251  };  };
252    
253  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
254  and \w */  and \w */
255    
256  static uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
257    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
258    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
259    ctype_space, ctype_space,    ctype_space, ctype_space,
260    ctype_word,  ctype_word,    ctype_word,  ctype_word,
261    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
262  };  };
263    
264  static uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
265    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
266    ctype_digit, 0,    ctype_digit, 0,
267    ctype_space, 0,    ctype_space, 0,
268    ctype_word,  0,    ctype_word,  0,
269    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
270  };  };
271    
272    
# Line 151  these structures in, is a vector of ints Line 278  these structures in, is a vector of ints
278  typedef struct stateblock {  typedef struct stateblock {
279    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
280    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
281    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
282  } stateblock;  } stateblock;
283    
284  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
285    
286    
287  #ifdef DEBUG  #ifdef PCRE_DEBUG
288  /*************************************************  /*************************************************
289  *             Print character string             *  *             Print character string             *
290  *************************************************/  *************************************************/
# Line 174  Returns:       nothing Line 300  Returns:       nothing
300  */  */
301    
302  static void  static void
303  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
304  {  {
305  int c;  int c;
306  while (length-- > 0)  while (length-- > 0)
# Line 207  Arguments: Line 333  Arguments:
333    offsetcount       size of same    offsetcount       size of same
334    workspace         vector of workspace    workspace         vector of workspace
335    wscount           size of same    wscount           size of same
   ims               the current ims flags  
336    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
337    
338  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
339                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
340                       -1 => failed to match                       -1 => failed to match
341                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
342    
# Line 224  for the current character, one for the f Line 348  for the current character, one for the f
348      { \      { \
349      next_active_state->offset = (x); \      next_active_state->offset = (x); \
350      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
351      next_active_state++; \      next_active_state++; \
352      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
353      } \      } \
# Line 235  for the current character, one for the f Line 358  for the current character, one for the f
358      { \      { \
359      next_active_state->offset = (x); \      next_active_state->offset = (x); \
360      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
361      next_active_state->data   = (z); \      next_active_state->data   = (z); \
362      next_active_state++; \      next_active_state++; \
363      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 247  for the current character, one for the f Line 369  for the current character, one for the f
369      { \      { \
370      next_new_state->offset = (x); \      next_new_state->offset = (x); \
371      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
372      next_new_state++; \      next_new_state++; \
373      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
374      } \      } \
# Line 258  for the current character, one for the f Line 379  for the current character, one for the f
379      { \      { \
380      next_new_state->offset = (x); \      next_new_state->offset = (x); \
381      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
382      next_new_state->data   = (z); \      next_new_state->data   = (z); \
383      next_new_state++; \      next_new_state++; \
384      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
385          (x), (y), (z), __LINE__)); \
386      } \      } \
387    else return PCRE_ERROR_DFA_WSSIZE    else return PCRE_ERROR_DFA_WSSIZE
388    
# Line 270  for the current character, one for the f Line 391  for the current character, one for the f
391  static int  static int
392  internal_dfa_exec(  internal_dfa_exec(
393    dfa_match_data *md,    dfa_match_data *md,
394    const uschar *this_start_code,    const pcre_uchar *this_start_code,
395    const uschar *current_subject,    const pcre_uchar *current_subject,
396    int start_offset,    int start_offset,
397    int *offsets,    int *offsets,
398    int offsetcount,    int offsetcount,
399    int *workspace,    int *workspace,
400    int wscount,    int wscount,
401    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
402  {  {
403  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
404  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
405    
406  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
407  const uschar *ptr;  const pcre_uchar *ptr;
408  const uschar *end_code, *first_op;  const pcre_uchar *end_code, *first_op;
409    
410    dfa_recursion_info new_recursive;
411    
412  int active_count, new_count, match_count;  int active_count, new_count, match_count;
413    
414  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
415  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
416    
417  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
418  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
419  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
420    
421  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
422  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423  #else  #else
424  BOOL utf8 = FALSE;  BOOL utf = FALSE;
425  #endif  #endif
426    
427    BOOL reset_could_continue = FALSE;
428    
429  rlevel++;  rlevel++;
430  offsetcount &= (-2);  offsetcount &= (-2);
431    
# Line 311  wscount = (wscount - (wscount % (INTS_PE Line 434  wscount = (wscount - (wscount % (INTS_PE
434            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
435    
436  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
437    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
438    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439    
440  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
441  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 325  next_new_state = new_states = active_sta Line 448  next_new_state = new_states = active_sta
448  new_count = 0;  new_count = 0;
449    
450  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
451    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453        ? IMM2_SIZE:0);
454    
455  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 353  if (*first_op == OP_REVERSE) Line 478  if (*first_op == OP_REVERSE)
478    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
479    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
480    
481  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
482    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
483    
484    if (utf8)    if (utf)
485      {      {
486      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
487        {        {
488        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
489        current_subject--;        current_subject--;
490        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
491        }        }
492      }      }
493    else    else
# Line 374  if (*first_op == OP_REVERSE) Line 497  if (*first_op == OP_REVERSE)
497    
498      {      {
499      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
500        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
501      current_subject -= gone_back;      current_subject -= gone_back;
502      }      }
503    
504      /* Save the earliest consulted character */
505    
506      if (current_subject < md->start_used_ptr)
507        md->start_used_ptr = current_subject;
508    
509    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
510    
511    end_code = this_start_code;    end_code = this_start_code;
# Line 386  if (*first_op == OP_REVERSE) Line 514  if (*first_op == OP_REVERSE)
514      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
515      if (back <= gone_back)      if (back <= gone_back)
516        {        {
517        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
519        }        }
520      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 419  else Line 547  else
547    else    else
548      {      {
549      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
550        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552            ? IMM2_SIZE:0);
553      do      do
554        {        {
555        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
556        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
557        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
558        }        }
# Line 432  else Line 562  else
562    
563  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
564    
565  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
566    
567  /* Loop for scanning the subject */  /* Loop for scanning the subject */
568    
# Line 442  for (;;) Line 572  for (;;)
572    int i, j;    int i, j;
573    int clen, dlen;    int clen, dlen;
574    unsigned int c, d;    unsigned int c, d;
575      int forced_fail = 0;
576      BOOL partial_newline = FALSE;
577      BOOL could_continue = reset_could_continue;
578      reset_could_continue = FALSE;
579    
580    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
581    new state list. */    new state list. */
# Line 455  for (;;) Line 589  for (;;)
589    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
590    workspace[1] = active_count;    workspace[1] = active_count;
591    
592  #ifdef DEBUG  #ifdef PCRE_DEBUG
593    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
595    printf("\"\n");    printf("\"\n");
596    
597    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 477  for (;;) Line 611  for (;;)
611    
612    if (ptr < end_subject)    if (ptr < end_subject)
613      {      {
614      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of data items in the character */
615  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
616      if (utf8) { GETCHARLEN(c, ptr, clen); } else      if (utf) { GETCHARLEN(c, ptr, clen); } else
617  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
618      c = *ptr;      c = *ptr;
619      }      }
620    else    else
# Line 497  for (;;) Line 631  for (;;)
631    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
632      {      {
633      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
634      const uschar *code;      BOOL caseless = FALSE;
635        const pcre_uchar *code;
636      int state_offset = current_state->offset;      int state_offset = current_state->offset;
637      int count, codevalue;      int count, codevalue, rrc;
     int chartype, script;  
638    
639  #ifdef DEBUG  #ifdef PCRE_DEBUG
640      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
641      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
642        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
643          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
644  #endif  #endif
645    
     /* This variable is referred to implicity in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
646      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
647      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
648      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
649        state, arrange for it to be passed on. */
650    
651      if (state_offset < 0)      if (state_offset < 0)
652        {        {
# Line 524  for (;;) Line 655  for (;;)
655          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
656          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
657            current_state->data - 1);            current_state->data - 1);
658            if (could_continue) reset_could_continue = TRUE;
659          continue;          continue;
660          }          }
661        else        else
# Line 532  for (;;) Line 664  for (;;)
664          }          }
665        }        }
666    
667      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
668        See the note at the head of this module about the possibility of improving
669        performance here. */
670    
671      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
672        {        {
# Line 549  for (;;) Line 683  for (;;)
683      code = start_code + state_offset;      code = start_code + state_offset;
684      codevalue = *code;      codevalue = *code;
685    
686        /* If this opcode inspects a character, but we are at the end of the
687        subject, remember the fact for use when testing for a partial match. */
688    
689        if (clen == 0 && poptable[codevalue] != 0)
690          could_continue = TRUE;
691    
692      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
693      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
694      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
695      permitted.      permitted.
696    
697      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
698      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long because
699      Unfortunately, we have to take special action to deal with  \P, \p, and      the values are small. We have to take special action to deal with  \P, \p,
700      \X in this case. To keep the other cases fast, convert these ones to new      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
701      opcodes. */      these ones to new opcodes. */
702    
703      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
704        {        {
705        dlen = 1;        dlen = 1;
706  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
707        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
708  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
709        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
710        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
711          {          {
# Line 576  for (;;) Line 716  for (;;)
716            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
717            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
718            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
719              case OP_NOT_HSPACE:
720              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
721              case OP_NOT_VSPACE:
722              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
723            default: break;            default: break;
724            }            }
725          }          }
# Line 591  for (;;) Line 735  for (;;)
735    
736      switch (codevalue)      switch (codevalue)
737        {        {
738    /* ========================================================================== */
739          /* These cases are never obeyed. This is a fudge that causes a compile-
740          time error if the vectors coptable or poptable, which are indexed by
741          opcode, are not the correct length. It seems to be the only way to do
742          such a check at compile time, as the sizeof() operator does not work
743          in the C preprocessor. */
744    
745          case OP_TABLE_LENGTH:
746          case OP_TABLE_LENGTH +
747            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
748             (sizeof(poptable) == OP_TABLE_LENGTH)):
749          break;
750    
751  /* ========================================================================== */  /* ========================================================================== */
752        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
753        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
754        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
755          subpattern, because the possessive subpattern repeats are always handled
756          using recursive calls. Thus, it never adds any new states.
757    
758          At the end of the (sub)pattern, unless we have an empty string and
759          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
760          start of the subject, save the match data, shifting up all previous
761        matches so we always have the longest first. */        matches so we always have the longest first. */
762    
763        case OP_KET:        case OP_KET:
764        case OP_KETRMIN:        case OP_KETRMIN:
765        case OP_KETRMAX:        case OP_KETRMAX:
766          case OP_KETRPOS:
767        if (code != end_code)        if (code != end_code)
768          {          {
769          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 609  for (;;) Line 772  for (;;)
772            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
773            }            }
774          }          }
775        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
776          {          {
777          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
778            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
779              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
780          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
781          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
782          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
783            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
784            offsets[0] = current_subject - start_subject;                match_count = 0;
785            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
786            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
787              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
788            }              {
789          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
790            {              offsets[1] = (int)(ptr - start_subject);
791            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
792              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], (char *)current_subject));
793              match_count, rlevel*2-2, SP));              }
794            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
795                {
796                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
797                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
798                  match_count, rlevel*2-2, SP));
799                return match_count;
800                }
801            }            }
802          }          }
803        break;        break;
# Line 640  for (;;) Line 809  for (;;)
809        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
810        case OP_ALT:        case OP_ALT:
811        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
812        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
813        break;        break;
814    
815        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 648  for (;;) Line 817  for (;;)
817        case OP_SBRA:        case OP_SBRA:
818        do        do
819          {          {
820          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
821          code += GET(code, 1);          code += GET(code, 1);
822          }          }
823        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 657  for (;;) Line 826  for (;;)
826        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
827        case OP_CBRA:        case OP_CBRA:
828        case OP_SCBRA:        case OP_SCBRA:
829        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
830        code += GET(code, 1);        code += GET(code, 1);
831        while (*code == OP_ALT)        while (*code == OP_ALT)
832          {          {
833          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
834          code += GET(code, 1);          code += GET(code, 1);
835          }          }
836        break;        break;
# Line 672  for (;;) Line 841  for (;;)
841        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
842        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
843        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
844        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
845          break;
846    
847          /*-----------------------------------------------------------------*/
848          case OP_SKIPZERO:
849          code += 1 + GET(code, 2);
850          while (*code == OP_ALT) code += GET(code, 1);
851          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
852        break;        break;
853    
854        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
855        case OP_CIRC:        case OP_CIRC:
856        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
857          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
858        break;        break;
859    
860        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
861        case OP_EOD:        case OP_CIRCM:
862        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
863              (ptr != end_subject && WAS_NEWLINE(ptr)))
864            { ADD_ACTIVE(state_offset + 1, 0); }
865        break;        break;
866    
867        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
868        case OP_OPT:        case OP_EOD:
869        ims = code[1];        if (ptr >= end_subject)
870        ADD_ACTIVE(state_offset + 2, 0);          {
871            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
872              could_continue = TRUE;
873            else { ADD_ACTIVE(state_offset + 1, 0); }
874            }
875        break;        break;
876    
877        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 714  for (;;) Line 893  for (;;)
893    
894        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
895        case OP_ANY:        case OP_ANY:
896        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
897            {
898            if (ptr + 1 >= md->end_subject &&
899                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
900                NLBLOCK->nltype == NLTYPE_FIXED &&
901                NLBLOCK->nllen == 2 &&
902                c == NLBLOCK->nl[0])
903              {
904              could_continue = partial_newline = TRUE;
905              }
906            else
907              {
908              ADD_NEW(state_offset + 1, 0);
909              }
910            }
911          break;
912    
913          /*-----------------------------------------------------------------*/
914          case OP_ALLANY:
915          if (clen > 0)
916          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
917        break;        break;
918    
919        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
920        case OP_EODN:        case OP_EODN:
921        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
922            could_continue = TRUE;
923          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
924          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
925        break;        break;
926    
# Line 728  for (;;) Line 928  for (;;)
928        case OP_DOLL:        case OP_DOLL:
929        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
930          {          {
931          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
932              (IS_NEWLINE(ptr) &&            could_continue = TRUE;
933                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)          else if (clen == 0 ||
934                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
935                   (ptr == end_subject - md->nllen)
936              ))              ))
937            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
938            else if (ptr + 1 >= md->end_subject &&
939                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
940                     NLBLOCK->nltype == NLTYPE_FIXED &&
941                     NLBLOCK->nllen == 2 &&
942                     c == NLBLOCK->nl[0])
943              {
944              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
945                {
946                reset_could_continue = TRUE;
947                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
948                }
949              else could_continue = partial_newline = TRUE;
950              }
951            }
952          break;
953    
954          /*-----------------------------------------------------------------*/
955          case OP_DOLLM:
956          if ((md->moptions & PCRE_NOTEOL) == 0)
957            {
958            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
959              could_continue = TRUE;
960            else if (clen == 0 ||
961                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
962              { ADD_ACTIVE(state_offset + 1, 0); }
963            else if (ptr + 1 >= md->end_subject &&
964                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
965                     NLBLOCK->nltype == NLTYPE_FIXED &&
966                     NLBLOCK->nllen == 2 &&
967                     c == NLBLOCK->nl[0])
968              {
969              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
970                {
971                reset_could_continue = TRUE;
972                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
973                }
974              else could_continue = partial_newline = TRUE;
975              }
976          }          }
977        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        else if (IS_NEWLINE(ptr))
978          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
979        break;        break;
980    
# Line 765  for (;;) Line 1005  for (;;)
1005    
1006          if (ptr > start_subject)          if (ptr > start_subject)
1007            {            {
1008            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
1009  #ifdef SUPPORT_UTF8            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1010            if (utf8) BACKCHAR(temp);  #ifdef SUPPORT_UTF
1011              if (utf) { BACKCHAR(temp); }
1012  #endif  #endif
1013            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
1014    #ifdef SUPPORT_UCP
1015              if ((md->poptions & PCRE_UCP) != 0)
1016                {
1017                if (d == '_') left_word = TRUE; else
1018                  {
1019                  int cat = UCD_CATEGORY(d);
1020                  left_word = (cat == ucp_L || cat == ucp_N);
1021                  }
1022                }
1023              else
1024    #endif
1025            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1026            }            }
1027          else left_word = 0;          else left_word = FALSE;
1028    
1029          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
1030            else right_word = 0;            {
1031    #ifdef SUPPORT_UCP
1032              if ((md->poptions & PCRE_UCP) != 0)
1033                {
1034                if (c == '_') right_word = TRUE; else
1035                  {
1036                  int cat = UCD_CATEGORY(c);
1037                  right_word = (cat == ucp_L || cat == ucp_N);
1038                  }
1039                }
1040              else
1041    #endif
1042              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1043              }
1044            else right_word = FALSE;
1045    
1046          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1047            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 783  for (;;) Line 1049  for (;;)
1049        break;        break;
1050    
1051    
 #ifdef SUPPORT_UCP  
   
1052        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1053        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
1054        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
1055        */        */
1056    
1057    #ifdef SUPPORT_UCP
1058        case OP_PROP:        case OP_PROP:
1059        case OP_NOTPROP:        case OP_NOTPROP:
1060        if (clen > 0)        if (clen > 0)
1061          {          {
1062          BOOL OK;          BOOL OK;
1063          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1064          switch(code[1])          switch(code[1])
1065            {            {
1066            case PT_ANY:            case PT_ANY:
# Line 803  for (;;) Line 1068  for (;;)
1068            break;            break;
1069    
1070            case PT_LAMP:            case PT_LAMP:
1071            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1072                   prop->chartype == ucp_Lt;
1073            break;            break;
1074    
1075            case PT_GC:            case PT_GC:
1076            OK = category == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1077            break;            break;
1078    
1079            case PT_PC:            case PT_PC:
1080            OK = chartype == code[2];            OK = prop->chartype == code[2];
1081            break;            break;
1082    
1083            case PT_SC:            case PT_SC:
1084            OK = script == code[2];            OK = prop->script == code[2];
1085              break;
1086    
1087              /* These are specials for combination cases. */
1088    
1089              case PT_ALNUM:
1090              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1091                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1092              break;
1093    
1094              case PT_SPACE:    /* Perl space */
1095              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1096                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1097              break;
1098    
1099              case PT_PXSPACE:  /* POSIX space */
1100              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1101                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1102                   c == CHAR_FF || c == CHAR_CR;
1103              break;
1104    
1105              case PT_WORD:
1106              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1107                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1108                   c == CHAR_UNDERSCORE;
1109            break;            break;
1110    
1111            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 835  for (;;) Line 1125  for (;;)
1125  /* ========================================================================== */  /* ========================================================================== */
1126        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
1127        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
1128        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1129        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1130    
1131        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1132        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 845  for (;;) Line 1135  for (;;)
1135        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1136        if (clen > 0)        if (clen > 0)
1137          {          {
1138          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1139                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1140                NLBLOCK->nltype == NLTYPE_FIXED &&
1141                NLBLOCK->nllen == 2 &&
1142                c == NLBLOCK->nl[0])
1143              {
1144              could_continue = partial_newline = TRUE;
1145              }
1146            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1147              (c < 256 &&              (c < 256 &&
1148                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1149                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1150            {            {
1151            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 871  for (;;) Line 1166  for (;;)
1166        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1167        if (clen > 0)        if (clen > 0)
1168          {          {
1169          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1170                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1171                NLBLOCK->nltype == NLTYPE_FIXED &&
1172                NLBLOCK->nllen == 2 &&
1173                c == NLBLOCK->nl[0])
1174              {
1175              could_continue = partial_newline = TRUE;
1176              }
1177            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1178              (c < 256 &&              (c < 256 &&
1179                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1180                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1181            {            {
1182            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 896  for (;;) Line 1196  for (;;)
1196        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1197        if (clen > 0)        if (clen > 0)
1198          {          {
1199          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1200                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1201                NLBLOCK->nltype == NLTYPE_FIXED &&
1202                NLBLOCK->nllen == 2 &&
1203                c == NLBLOCK->nl[0])
1204              {
1205              could_continue = partial_newline = TRUE;
1206              }
1207            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208              (c < 256 &&              (c < 256 &&
1209                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1210                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211            {            {
1212            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 919  for (;;) Line 1224  for (;;)
1224        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1225        if (clen > 0)        if (clen > 0)
1226          {          {
1227          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1228                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1229                NLBLOCK->nltype == NLTYPE_FIXED &&
1230                NLBLOCK->nllen == 2 &&
1231                c == NLBLOCK->nl[0])
1232              {
1233              could_continue = partial_newline = TRUE;
1234              }
1235            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1236              (c < 256 &&              (c < 256 &&
1237                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1238                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1239            {            {
1240            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1241              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1242            else            else
1243              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1244            }            }
# Line 939  for (;;) Line 1249  for (;;)
1249        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1250        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1251        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1252        ADD_ACTIVE(state_offset + 4, 0);        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1253        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1254        if (clen > 0)        if (clen > 0)
1255          {          {
1256          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1257                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1258                NLBLOCK->nltype == NLTYPE_FIXED &&
1259                NLBLOCK->nllen == 2 &&
1260                c == NLBLOCK->nl[0])
1261              {
1262              could_continue = partial_newline = TRUE;
1263              }
1264            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265              (c < 256 &&              (c < 256 &&
1266                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1267                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268            {            {
1269            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 957  for (;;) Line 1272  for (;;)
1272              next_active_state--;              next_active_state--;
1273              }              }
1274            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1275              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1276            else            else
1277              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1278            }            }
# Line 970  for (;;) Line 1285  for (;;)
1285        argument. It keeps the code above fast for the other cases. The argument        argument. It keeps the code above fast for the other cases. The argument
1286        is in the d variable. */        is in the d variable. */
1287    
1288    #ifdef SUPPORT_UCP
1289        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
1290        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1291        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
# Line 978  for (;;) Line 1294  for (;;)
1294        if (clen > 0)        if (clen > 0)
1295          {          {
1296          BOOL OK;          BOOL OK;
1297          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1298          switch(code[2])          switch(code[2])
1299            {            {
1300            case PT_ANY:            case PT_ANY:
# Line 986  for (;;) Line 1302  for (;;)
1302            break;            break;
1303    
1304            case PT_LAMP:            case PT_LAMP:
1305            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1306                prop->chartype == ucp_Lt;
1307            break;            break;
1308    
1309            case PT_GC:            case PT_GC:
1310            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1311            break;            break;
1312    
1313            case PT_PC:            case PT_PC:
1314            OK = chartype == code[3];            OK = prop->chartype == code[3];
1315            break;            break;
1316    
1317            case PT_SC:            case PT_SC:
1318            OK = script == code[3];            OK = prop->script == code[3];
1319              break;
1320    
1321              /* These are specials for combination cases. */
1322    
1323              case PT_ALNUM:
1324              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1325                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1326              break;
1327    
1328              case PT_SPACE:    /* Perl space */
1329              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1330                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1331              break;
1332    
1333              case PT_PXSPACE:  /* POSIX space */
1334              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1335                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1336                   c == CHAR_FF || c == CHAR_CR;
1337              break;
1338    
1339              case PT_WORD:
1340              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1341                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1342                   c == CHAR_UNDERSCORE;
1343            break;            break;
1344    
1345            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1027  for (;;) Line 1368  for (;;)
1368        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1369        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1370        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1371        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1372          {          {
1373          const uschar *nptr = ptr + clen;          int lgb, rgb;
1374            const pcre_uchar *nptr = ptr + clen;
1375          int ncount = 0;          int ncount = 0;
1376          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1377            {            {
1378            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1379            next_active_state--;            next_active_state--;
1380            }            }
1381            lgb = UCD_GRAPHBREAK(c);
1382          while (nptr < end_subject)          while (nptr < end_subject)
1383            {            {
1384            int nd;            dlen = 1;
1385            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1386            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1387            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1388            ncount++;            ncount++;
1389            nptr += ndlen;            lgb = rgb;
1390              nptr += dlen;
1391            }            }
1392          count++;          count++;
1393          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1394          }          }
1395        break;        break;
1396    #endif
1397    
1398        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1399        case OP_ANYNL_EXTRA + OP_TYPEPLUS:        case OP_ANYNL_EXTRA + OP_TYPEPLUS:
# Line 1061  for (;;) Line 1406  for (;;)
1406          int ncount = 0;          int ncount = 0;
1407          switch (c)          switch (c)
1408            {            {
1409            case 0x000d:            case CHAR_VT:
1410            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            case CHAR_FF:
1411            /* Fall through */            case CHAR_NEL:
1412            case 0x000a:  #ifndef EBCDIC
           case 0x000b:  
           case 0x000c:  
           case 0x0085:  
1413            case 0x2028:            case 0x2028:
1414            case 0x2029:            case 0x2029:
1415    #endif  /* Not EBCDIC */
1416              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1417              goto ANYNL01;
1418    
1419              case CHAR_CR:
1420              if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
1421              /* Fall through */
1422    
1423              ANYNL01:
1424              case CHAR_LF:
1425            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1426              {              {
1427              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1078  for (;;) Line 1430  for (;;)
1430            count++;            count++;
1431            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, ncount);
1432            break;            break;
1433    
1434              default:
1435              break;
1436              }
1437            }
1438          break;
1439    
1440          /*-----------------------------------------------------------------*/
1441          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1442          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1443          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1444          count = current_state->count;  /* Already matched */
1445          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1446          if (clen > 0)
1447            {
1448            BOOL OK;
1449            switch (c)
1450              {
1451              case CHAR_LF:
1452              case CHAR_VT:
1453              case CHAR_FF:
1454              case CHAR_CR:
1455              case CHAR_NEL:
1456    #ifndef EBCDIC
1457              case 0x2028:
1458              case 0x2029:
1459    #endif  /* Not EBCDIC */
1460              OK = TRUE;
1461              break;
1462    
1463              default:
1464              OK = FALSE;
1465              break;
1466              }
1467    
1468            if (OK == (d == OP_VSPACE))
1469              {
1470              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1471                {
1472                active_count--;           /* Remove non-match possibility */
1473                next_active_state--;
1474                }
1475              count++;
1476              ADD_NEW_DATA(-state_offset, count, 0);
1477              }
1478            }
1479          break;
1480    
1481          /*-----------------------------------------------------------------*/
1482          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1483          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1484          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1485          count = current_state->count;  /* Already matched */
1486          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1487          if (clen > 0)
1488            {
1489            BOOL OK;
1490            switch (c)
1491              {
1492              case CHAR_HT:
1493              case CHAR_SPACE:
1494    #ifndef EBCDIC
1495              case 0xa0:      /* NBSP */
1496              case 0x1680:    /* OGHAM SPACE MARK */
1497              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1498              case 0x2000:    /* EN QUAD */
1499              case 0x2001:    /* EM QUAD */
1500              case 0x2002:    /* EN SPACE */
1501              case 0x2003:    /* EM SPACE */
1502              case 0x2004:    /* THREE-PER-EM SPACE */
1503              case 0x2005:    /* FOUR-PER-EM SPACE */
1504              case 0x2006:    /* SIX-PER-EM SPACE */
1505              case 0x2007:    /* FIGURE SPACE */
1506              case 0x2008:    /* PUNCTUATION SPACE */
1507              case 0x2009:    /* THIN SPACE */
1508              case 0x200A:    /* HAIR SPACE */
1509              case 0x202f:    /* NARROW NO-BREAK SPACE */
1510              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1511              case 0x3000:    /* IDEOGRAPHIC SPACE */
1512    #endif  /* Not EBCDIC */
1513              OK = TRUE;
1514              break;
1515    
1516            default:            default:
1517              OK = FALSE;
1518            break;            break;
1519            }            }
1520    
1521            if (OK == (d == OP_HSPACE))
1522              {
1523              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1524                {
1525                active_count--;           /* Remove non-match possibility */
1526                next_active_state--;
1527                }
1528              count++;
1529              ADD_NEW_DATA(-state_offset, count, 0);
1530              }
1531          }          }
1532        break;        break;
1533    
1534        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1535    #ifdef SUPPORT_UCP
1536        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
1537        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1538        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
# Line 1102  for (;;) Line 1550  for (;;)
1550        if (clen > 0)        if (clen > 0)
1551          {          {
1552          BOOL OK;          BOOL OK;
1553          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1554          switch(code[2])          switch(code[2])
1555            {            {
1556            case PT_ANY:            case PT_ANY:
# Line 1110  for (;;) Line 1558  for (;;)
1558            break;            break;
1559    
1560            case PT_LAMP:            case PT_LAMP:
1561            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1562                prop->chartype == ucp_Lt;
1563            break;            break;
1564    
1565            case PT_GC:            case PT_GC:
1566            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1567            break;            break;
1568    
1569            case PT_PC:            case PT_PC:
1570            OK = chartype == code[3];            OK = prop->chartype == code[3];
1571            break;            break;
1572    
1573            case PT_SC:            case PT_SC:
1574            OK = script == code[3];            OK = prop->script == code[3];
1575              break;
1576    
1577              /* These are specials for combination cases. */
1578    
1579              case PT_ALNUM:
1580              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1581                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1582              break;
1583    
1584              case PT_SPACE:    /* Perl space */
1585              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1586                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1587              break;
1588    
1589              case PT_PXSPACE:  /* POSIX space */
1590              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1591                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1592                   c == CHAR_FF || c == CHAR_CR;
1593              break;
1594    
1595              case PT_WORD:
1596              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1597                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1598                   c == CHAR_UNDERSCORE;
1599            break;            break;
1600    
1601            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1160  for (;;) Line 1633  for (;;)
1633        QS2:        QS2:
1634    
1635        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1636        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
1637          {          {
1638          const uschar *nptr = ptr + clen;          int lgb, rgb;
1639            const pcre_uchar *nptr = ptr + clen;
1640          int ncount = 0;          int ncount = 0;
1641          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1642              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
# Line 1170  for (;;) Line 1644  for (;;)
1644            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1645            next_active_state--;            next_active_state--;
1646            }            }
1647            lgb = UCD_GRAPHBREAK(c);
1648          while (nptr < end_subject)          while (nptr < end_subject)
1649            {            {
1650            int nd;            dlen = 1;
1651            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1652            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1653            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1654            ncount++;            ncount++;
1655            nptr += ndlen;            lgb = rgb;
1656              nptr += dlen;
1657            }            }
1658          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1659          }          }
1660        break;        break;
1661    #endif
1662    
1663        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1664        case OP_ANYNL_EXTRA + OP_TYPEQUERY:        case OP_ANYNL_EXTRA + OP_TYPEQUERY:
# Line 1202  for (;;) Line 1679  for (;;)
1679          int ncount = 0;          int ncount = 0;
1680          switch (c)          switch (c)
1681            {            {
1682            case 0x000d:            case CHAR_VT:
1683            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            case CHAR_FF:
1684            /* Fall through */            case CHAR_NEL:
1685            case 0x000a:  #ifndef EBCDIC
           case 0x000b:  
           case 0x000c:  
           case 0x0085:  
1686            case 0x2028:            case 0x2028:
1687            case 0x2029:            case 0x2029:
1688    #endif  /* Not EBCDIC */
1689              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1690              goto ANYNL02;
1691    
1692              case CHAR_CR:
1693              if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
1694              /* Fall through */
1695    
1696              ANYNL02:
1697              case CHAR_LF:
1698            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1699                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1700              {              {
# Line 1219  for (;;) Line 1703  for (;;)
1703              }              }
1704            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1705            break;            break;
1706    
1707            default:            default:
1708            break;            break;
1709            }            }
# Line 1226  for (;;) Line 1711  for (;;)
1711        break;        break;
1712    
1713        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1714        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1715        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1716        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1717        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:        count = 2;
1718        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        goto QS4;
1719          { ADD_ACTIVE(state_offset + 6, 0); }  
1720        count = current_state->count;  /* Number already matched */        case OP_VSPACE_EXTRA + OP_TYPESTAR:
1721          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1722          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1723          count = 0;
1724    
1725          QS4:
1726          ADD_ACTIVE(state_offset + 2, 0);
1727        if (clen > 0)        if (clen > 0)
1728          {          {
1729          BOOL OK;          BOOL OK;
1730          int category = _pcre_ucp_findprop(c, &chartype, &script);          switch (c)
         switch(code[4])  
1731            {            {
1732            case PT_ANY:            case CHAR_LF:
1733              case CHAR_VT:
1734              case CHAR_FF:
1735              case CHAR_CR:
1736              case CHAR_NEL:
1737    #ifndef EBCDIC
1738              case 0x2028:
1739              case 0x2029:
1740    #endif  /* Not EBCDIC */
1741            OK = TRUE;            OK = TRUE;
1742            break;            break;
1743    
           case PT_LAMP:  
           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;  
           break;  
   
           case PT_GC:  
           OK = category == code[5];  
           break;  
   
           case PT_PC:  
           OK = chartype == code[5];  
           break;  
   
           case PT_SC:  
           OK = script == code[5];  
           break;  
   
           /* Should never occur, but keep compilers from grumbling. */  
   
1744            default:            default:
1745            OK = codevalue != OP_PROP;            OK = FALSE;
1746            break;            break;
1747            }            }
1748            if (OK == (d == OP_VSPACE))
         if (OK == (d == OP_PROP))  
1749            {            {
1750            if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1751                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1752              {              {
1753              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1754              next_active_state--;              next_active_state--;
1755              }              }
1756            if (++count >= GET2(code, 1))            ADD_NEW_DATA(-(state_offset + count), 0, 0);
             { ADD_NEW(state_offset + 6, 0); }  
           else  
             { ADD_NEW(state_offset, count); }  
1757            }            }
1758          }          }
1759        break;        break;
1760    
1761        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1762        case OP_EXTUNI_EXTRA + OP_TYPEEXACT:        case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1763        case OP_EXTUNI_EXTRA + OP_TYPEUPTO:        case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1764        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1765        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:        count = 2;
1766        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        goto QS5;
1767          { ADD_ACTIVE(state_offset + 4, 0); }  
1768        count = current_state->count;  /* Number already matched */        case OP_HSPACE_EXTRA + OP_TYPESTAR:
1769        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1770          {        case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1771          const uschar *nptr = ptr + clen;        count = 0;
1772          int ncount = 0;  
1773          QS5:
1774          ADD_ACTIVE(state_offset + 2, 0);
1775          if (clen > 0)
1776            {
1777            BOOL OK;
1778            switch (c)
1779              {
1780              case CHAR_HT:
1781              case CHAR_SPACE:
1782    #ifndef EBCDIC
1783              case 0xa0:      /* NBSP */
1784              case 0x1680:    /* OGHAM SPACE MARK */
1785              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1786              case 0x2000:    /* EN QUAD */
1787              case 0x2001:    /* EM QUAD */
1788              case 0x2002:    /* EN SPACE */
1789              case 0x2003:    /* EM SPACE */
1790              case 0x2004:    /* THREE-PER-EM SPACE */
1791              case 0x2005:    /* FOUR-PER-EM SPACE */
1792              case 0x2006:    /* SIX-PER-EM SPACE */
1793              case 0x2007:    /* FIGURE SPACE */
1794              case 0x2008:    /* PUNCTUATION SPACE */
1795              case 0x2009:    /* THIN SPACE */
1796              case 0x200A:    /* HAIR SPACE */
1797              case 0x202f:    /* NARROW NO-BREAK SPACE */
1798              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1799              case 0x3000:    /* IDEOGRAPHIC SPACE */
1800    #endif  /* Not EBCDIC */
1801              OK = TRUE;
1802              break;
1803    
1804              default:
1805              OK = FALSE;
1806              break;
1807              }
1808    
1809            if (OK == (d == OP_HSPACE))
1810              {
1811              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1812                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1813                {
1814                active_count--;           /* Remove non-match possibility */
1815                next_active_state--;
1816                }
1817              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1818              }
1819            }
1820          break;
1821    
1822          /*-----------------------------------------------------------------*/
1823    #ifdef SUPPORT_UCP
1824          case OP_PROP_EXTRA + OP_TYPEEXACT:
1825          case OP_PROP_EXTRA + OP_TYPEUPTO:
1826          case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1827          case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1828          if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1829            { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1830          count = current_state->count;  /* Number already matched */
1831          if (clen > 0)
1832            {
1833            BOOL OK;
1834            const ucd_record * prop = GET_UCD(c);
1835            switch(code[1 + IMM2_SIZE + 1])
1836              {
1837              case PT_ANY:
1838              OK = TRUE;
1839              break;
1840    
1841              case PT_LAMP:
1842              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1843                prop->chartype == ucp_Lt;
1844              break;
1845    
1846              case PT_GC:
1847              OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1848              break;
1849    
1850              case PT_PC:
1851              OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1852              break;
1853    
1854              case PT_SC:
1855              OK = prop->script == code[1 + IMM2_SIZE + 2];
1856              break;
1857    
1858              /* These are specials for combination cases. */
1859    
1860              case PT_ALNUM:
1861              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1862                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1863              break;
1864    
1865              case PT_SPACE:    /* Perl space */
1866              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1867                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1868              break;
1869    
1870              case PT_PXSPACE:  /* POSIX space */
1871              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1872                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1873                   c == CHAR_FF || c == CHAR_CR;
1874              break;
1875    
1876              case PT_WORD:
1877              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1878                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1879                   c == CHAR_UNDERSCORE;
1880              break;
1881    
1882              /* Should never occur, but keep compilers from grumbling. */
1883    
1884              default:
1885              OK = codevalue != OP_PROP;
1886              break;
1887              }
1888    
1889            if (OK == (d == OP_PROP))
1890              {
1891              if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1892                {
1893                active_count--;           /* Remove non-match possibility */
1894                next_active_state--;
1895                }
1896              if (++count >= GET2(code, 1))
1897                { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1898              else
1899                { ADD_NEW(state_offset, count); }
1900              }
1901            }
1902          break;
1903    
1904          /*-----------------------------------------------------------------*/
1905          case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1906          case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1907          case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1908          case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1909          if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1910            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1911          count = current_state->count;  /* Number already matched */
1912          if (clen > 0)
1913            {
1914            int lgb, rgb;
1915            const pcre_uchar *nptr = ptr + clen;
1916            int ncount = 0;
1917          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1918            {            {
1919            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1920            next_active_state--;            next_active_state--;
1921            }            }
1922            lgb = UCD_GRAPHBREAK(c);
1923          while (nptr < end_subject)          while (nptr < end_subject)
1924            {            {
1925            int nd;            dlen = 1;
1926            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1927            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1928            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1929            ncount++;            ncount++;
1930            nptr += ndlen;            lgb = rgb;
1931              nptr += dlen;
1932            }            }
1933            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1934                reset_could_continue = TRUE;
1935          if (++count >= GET2(code, 1))          if (++count >= GET2(code, 1))
1936            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1937          else          else
1938            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1939          }          }
1940        break;        break;
1941    #endif
1942    
1943        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1944        case OP_ANYNL_EXTRA + OP_TYPEEXACT:        case OP_ANYNL_EXTRA + OP_TYPEEXACT:
# Line 1320  for (;;) Line 1946  for (;;)
1946        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1947        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1948        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1949          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1950        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1951        if (clen > 0)        if (clen > 0)
1952          {          {
1953          int ncount = 0;          int ncount = 0;
1954          switch (c)          switch (c)
1955            {            {
1956            case 0x000d:            case CHAR_VT:
1957            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            case CHAR_FF:
1958            /* Fall through */            case CHAR_NEL:
1959            case 0x000a:  #ifndef EBCDIC
           case 0x000b:  
           case 0x000c:  
           case 0x0085:  
1960            case 0x2028:            case 0x2028:
1961            case 0x2029:            case 0x2029:
1962    #endif  /* Not EBCDIC */
1963              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1964              goto ANYNL03;
1965    
1966              case CHAR_CR:
1967              if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
1968              /* Fall through */
1969    
1970              ANYNL03:
1971              case CHAR_LF:
1972            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1973              {              {
1974              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1975              next_active_state--;              next_active_state--;
1976              }              }
1977            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1978              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1979            else            else
1980              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, ncount); }
1981            break;            break;
1982    
1983            default:            default:
1984            break;            break;
1985            }            }
1986          }          }
1987        break;        break;
1988    
1989          /*-----------------------------------------------------------------*/
1990          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1991          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1992          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1993          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1994          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1995            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1996          count = current_state->count;  /* Number already matched */
1997          if (clen > 0)
1998            {
1999            BOOL OK;
2000            switch (c)
2001              {
2002              case CHAR_LF:
2003              case CHAR_VT:
2004              case CHAR_FF:
2005              case CHAR_CR:
2006              case CHAR_NEL:
2007    #ifndef EBCDIC
2008              case 0x2028:
2009              case 0x2029:
2010    #endif  /* Not EBCDIC */
2011              OK = TRUE;
2012              break;
2013    
2014              default:
2015              OK = FALSE;
2016              }
2017    
2018            if (OK == (d == OP_VSPACE))
2019              {
2020              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2021                {
2022                active_count--;           /* Remove non-match possibility */
2023                next_active_state--;
2024                }
2025              if (++count >= GET2(code, 1))
2026                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2027              else
2028                { ADD_NEW_DATA(-state_offset, count, 0); }
2029              }
2030            }
2031          break;
2032    
2033          /*-----------------------------------------------------------------*/
2034          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2035          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2036          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2037          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2038          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2039            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2040          count = current_state->count;  /* Number already matched */
2041          if (clen > 0)
2042            {
2043            BOOL OK;
2044            switch (c)
2045              {
2046              case CHAR_HT:
2047              case CHAR_SPACE:
2048    #ifndef EBCDIC
2049              case 0xa0:      /* NBSP */
2050              case 0x1680:    /* OGHAM SPACE MARK */
2051              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2052              case 0x2000:    /* EN QUAD */
2053              case 0x2001:    /* EM QUAD */
2054              case 0x2002:    /* EN SPACE */
2055              case 0x2003:    /* EM SPACE */
2056              case 0x2004:    /* THREE-PER-EM SPACE */
2057              case 0x2005:    /* FOUR-PER-EM SPACE */
2058              case 0x2006:    /* SIX-PER-EM SPACE */
2059              case 0x2007:    /* FIGURE SPACE */
2060              case 0x2008:    /* PUNCTUATION SPACE */
2061              case 0x2009:    /* THIN SPACE */
2062              case 0x200A:    /* HAIR SPACE */
2063              case 0x202f:    /* NARROW NO-BREAK SPACE */
2064              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2065              case 0x3000:    /* IDEOGRAPHIC SPACE */
2066    #endif  /* Not EBCDIC */
2067              OK = TRUE;
2068              break;
2069    
2070              default:
2071              OK = FALSE;
2072              break;
2073              }
2074    
2075            if (OK == (d == OP_HSPACE))
2076              {
2077              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2078                {
2079                active_count--;           /* Remove non-match possibility */
2080                next_active_state--;
2081                }
2082              if (++count >= GET2(code, 1))
2083                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2084              else
2085                { ADD_NEW_DATA(-state_offset, count, 0); }
2086              }
2087            }
2088          break;
2089    
2090  /* ========================================================================== */  /* ========================================================================== */
2091        /* These opcodes are followed by a character that is usually compared        /* These opcodes are followed by a character that is usually compared
2092        to the current subject character; it is loaded into d. We still get        to the current subject character; it is loaded into d. We still get
# Line 1364  for (;;) Line 2099  for (;;)
2099        break;        break;
2100    
2101        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2102        case OP_CHARNC:        case OP_CHARI:
2103        if (clen == 0) break;        if (clen == 0) break;
2104    
2105  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2106        if (utf8)        if (utf)
2107          {          {
2108          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2109            {            {
2110            unsigned int othercase;            unsigned int othercase;
2111            if (c < 128) othercase = fcc[c]; else            if (c < 128)
2112                othercase = fcc[c];
2113            /* If we have Unicode property support, we can use it to test the            else
2114            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2115                other case of the character. */
2116  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2117            othercase = _pcre_ucp_othercase(c);              othercase = UCD_OTHERCASE(c);
2118  #else  #else
2119            othercase = NOTACHAR;              othercase = NOTACHAR;
2120  #endif  #endif
2121    
2122            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2123            }            }
2124          }          }
2125        else        else
2126  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2127          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2128          {          {
2129          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2130              { ADD_NEW(state_offset + 2, 0); }
2131          }          }
2132        break;        break;
2133    
# Line 1404  for (;;) Line 2139  for (;;)
2139        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2140    
2141        case OP_EXTUNI:        case OP_EXTUNI:
2142        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0)
2143          {          {
2144          const uschar *nptr = ptr + clen;          int lgb, rgb;
2145            const pcre_uchar *nptr = ptr + clen;
2146          int ncount = 0;          int ncount = 0;
2147            lgb = UCD_GRAPHBREAK(c);
2148          while (nptr < end_subject)          while (nptr < end_subject)
2149            {            {
2150            int nclen = 1;            dlen = 1;
2151            GETCHARLEN(c, nptr, nclen);            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2152            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            rgb = UCD_GRAPHBREAK(d);
2153              if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2154            ncount++;            ncount++;
2155            nptr += nclen;            lgb = rgb;
2156              nptr += dlen;
2157            }            }
2158            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2159                reset_could_continue = TRUE;
2160          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2161          }          }
2162        break;        break;
# Line 1429  for (;;) Line 2170  for (;;)
2170        case OP_ANYNL:        case OP_ANYNL:
2171        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2172          {          {
2173          case 0x000a:          case CHAR_VT:
2174          case 0x000b:          case CHAR_FF:
2175          case 0x000c:          case CHAR_NEL:
2176          case 0x0085:  #ifndef EBCDIC
2177          case 0x2028:          case 0x2028:
2178          case 0x2029:          case 0x2029:
2179    #endif  /* Not EBCDIC */
2180            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2181    
2182            case CHAR_LF:
2183          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2184          break;          break;
2185          case 0x000d:  
2186          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          case CHAR_CR:
2187            if (ptr + 1 >= end_subject)
2188              {
2189              ADD_NEW(state_offset + 1, 0);
2190              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2191                reset_could_continue = TRUE;
2192              }
2193            else if (ptr[1] == CHAR_LF)
2194            {            {
2195            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2196            }            }
# Line 1451  for (;;) Line 2203  for (;;)
2203        break;        break;
2204    
2205        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2206        /* Match a negated single character. This is only used for one-byte        case OP_NOT_VSPACE:
2207        characters, that is, we know that d < 256. The character we are        if (clen > 0) switch(c)
2208        checking (c) can be multibyte. */          {
2209            case CHAR_LF:
2210            case CHAR_VT:
2211            case CHAR_FF:
2212            case CHAR_CR:
2213            case CHAR_NEL:
2214    #ifndef EBCDIC
2215            case 0x2028:
2216            case 0x2029:
2217    #endif  /* Not EBCDIC */
2218            break;
2219    
2220            default:
2221            ADD_NEW(state_offset + 1, 0);
2222            break;
2223            }
2224          break;
2225    
2226          /*-----------------------------------------------------------------*/
2227          case OP_VSPACE:
2228          if (clen > 0) switch(c)
2229            {
2230            case CHAR_LF:
2231            case CHAR_VT:
2232            case CHAR_FF:
2233            case CHAR_CR:
2234            case CHAR_NEL:
2235    #ifndef EBCDIC
2236            case 0x2028:
2237            case 0x2029:
2238    #endif  /* Not EBCDIC */
2239            ADD_NEW(state_offset + 1, 0);
2240            break;
2241    
2242            default: break;
2243            }
2244          break;
2245    
2246          /*-----------------------------------------------------------------*/
2247          case OP_NOT_HSPACE:
2248          if (clen > 0) switch(c)
2249            {
2250            case CHAR_HT:
2251            case CHAR_SPACE:
2252    #ifndef EBCDIC
2253            case 0xa0:      /* NBSP */
2254            case 0x1680:    /* OGHAM SPACE MARK */
2255            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2256            case 0x2000:    /* EN QUAD */
2257            case 0x2001:    /* EM QUAD */
2258            case 0x2002:    /* EN SPACE */
2259            case 0x2003:    /* EM SPACE */
2260            case 0x2004:    /* THREE-PER-EM SPACE */
2261            case 0x2005:    /* FOUR-PER-EM SPACE */
2262            case 0x2006:    /* SIX-PER-EM SPACE */
2263            case 0x2007:    /* FIGURE SPACE */
2264            case 0x2008:    /* PUNCTUATION SPACE */
2265            case 0x2009:    /* THIN SPACE */
2266            case 0x200A:    /* HAIR SPACE */
2267            case 0x202f:    /* NARROW NO-BREAK SPACE */
2268            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2269            case 0x3000:    /* IDEOGRAPHIC SPACE */
2270    #endif  /* Not EBCDIC */
2271            break;
2272    
2273            default:
2274            ADD_NEW(state_offset + 1, 0);
2275            break;
2276            }
2277          break;
2278    
2279          /*-----------------------------------------------------------------*/
2280          case OP_HSPACE:
2281          if (clen > 0) switch(c)
2282            {
2283            case CHAR_HT:
2284            case CHAR_SPACE:
2285    #ifndef EBCDIC
2286            case 0xa0:      /* NBSP */
2287            case 0x1680:    /* OGHAM SPACE MARK */
2288            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2289            case 0x2000:    /* EN QUAD */
2290            case 0x2001:    /* EM QUAD */
2291            case 0x2002:    /* EN SPACE */
2292            case 0x2003:    /* EM SPACE */
2293            case 0x2004:    /* THREE-PER-EM SPACE */
2294            case 0x2005:    /* FOUR-PER-EM SPACE */
2295            case 0x2006:    /* SIX-PER-EM SPACE */
2296            case 0x2007:    /* FIGURE SPACE */
2297            case 0x2008:    /* PUNCTUATION SPACE */
2298            case 0x2009:    /* THIN SPACE */
2299            case 0x200A:    /* HAIR SPACE */
2300            case 0x202f:    /* NARROW NO-BREAK SPACE */
2301            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2302            case 0x3000:    /* IDEOGRAPHIC SPACE */
2303    #endif  /* Not EBCDIC */
2304            ADD_NEW(state_offset + 1, 0);
2305            break;
2306            }
2307          break;
2308    
2309          /*-----------------------------------------------------------------*/
2310          /* Match a negated single character casefully. */
2311    
2312        case OP_NOT:        case OP_NOT:
2313          if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2314          break;
2315    
2316          /*-----------------------------------------------------------------*/
2317          /* Match a negated single character caselessly. */
2318    
2319          case OP_NOTI:
2320        if (clen > 0)        if (clen > 0)
2321          {          {
2322          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;          unsigned int otherd;
2323          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }  #ifdef SUPPORT_UTF
2324            if (utf && d >= 128)
2325              {
2326    #ifdef SUPPORT_UCP
2327              otherd = UCD_OTHERCASE(d);
2328    #endif  /* SUPPORT_UCP */
2329              }
2330            else
2331    #endif  /* SUPPORT_UTF */
2332            otherd = TABLE_GET(d, fcc, d);
2333            if (c != d && c != otherd)
2334              { ADD_NEW(state_offset + dlen + 1, 0); }
2335          }          }
2336        break;        break;
2337    
2338        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2339          case OP_PLUSI:
2340          case OP_MINPLUSI:
2341          case OP_POSPLUSI:
2342          case OP_NOTPLUSI:
2343          case OP_NOTMINPLUSI:
2344          case OP_NOTPOSPLUSI:
2345          caseless = TRUE;
2346          codevalue -= OP_STARI - OP_STAR;
2347    
2348          /* Fall through */
2349        case OP_PLUS:        case OP_PLUS:
2350        case OP_MINPLUS:        case OP_MINPLUS:
2351        case OP_POSPLUS:        case OP_POSPLUS:
# Line 1475  for (;;) Line 2357  for (;;)
2357        if (clen > 0)        if (clen > 0)
2358          {          {
2359          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2360          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2361            {            {
2362  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2363            if (utf8 && d >= 128)            if (utf && d >= 128)
2364              {              {
2365  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2366              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2367  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2368              }              }
2369            else            else
2370  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2371            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2372            }            }
2373          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2374            {            {
# Line 1503  for (;;) Line 2385  for (;;)
2385        break;        break;
2386    
2387        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2388          case OP_QUERYI:
2389          case OP_MINQUERYI:
2390          case OP_POSQUERYI:
2391          case OP_NOTQUERYI:
2392          case OP_NOTMINQUERYI:
2393          case OP_NOTPOSQUERYI:
2394          caseless = TRUE;
2395          codevalue -= OP_STARI - OP_STAR;
2396          /* Fall through */
2397        case OP_QUERY:        case OP_QUERY:
2398        case OP_MINQUERY:        case OP_MINQUERY:
2399        case OP_POSQUERY:        case OP_POSQUERY:
# Line 1513  for (;;) Line 2404  for (;;)
2404        if (clen > 0)        if (clen > 0)
2405          {          {
2406          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2407          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2408            {            {
2409  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2410            if (utf8 && d >= 128)            if (utf && d >= 128)
2411              {              {
2412  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2413              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2414  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2415              }              }
2416            else            else
2417  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2418            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2419            }            }
2420          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2421            {            {
# Line 1539  for (;;) Line 2430  for (;;)
2430        break;        break;
2431    
2432        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2433          case OP_STARI:
2434          case OP_MINSTARI:
2435          case OP_POSSTARI:
2436          case OP_NOTSTARI:
2437          case OP_NOTMINSTARI:
2438          case OP_NOTPOSSTARI:
2439          caseless = TRUE;
2440          codevalue -= OP_STARI - OP_STAR;
2441          /* Fall through */
2442        case OP_STAR:        case OP_STAR:
2443        case OP_MINSTAR:        case OP_MINSTAR:
2444        case OP_POSSTAR:        case OP_POSSTAR:
# Line 1549  for (;;) Line 2449  for (;;)
2449        if (clen > 0)        if (clen > 0)
2450          {          {
2451          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2452          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2453            {            {
2454  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2455            if (utf8 && d >= 128)            if (utf && d >= 128)
2456              {              {
2457  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2458              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2459  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2460              }              }
2461            else            else
2462  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2463            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2464            }            }
2465          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2466            {            {
# Line 1575  for (;;) Line 2475  for (;;)
2475        break;        break;
2476    
2477        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2478          case OP_EXACTI:
2479          case OP_NOTEXACTI:
2480          caseless = TRUE;
2481          codevalue -= OP_STARI - OP_STAR;
2482          /* Fall through */
2483        case OP_EXACT:        case OP_EXACT:
2484        case OP_NOTEXACT:        case OP_NOTEXACT:
2485        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2486        if (clen > 0)        if (clen > 0)
2487          {          {
2488          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2489          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2490            {            {
2491  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2492            if (utf8 && d >= 128)            if (utf && d >= 128)
2493              {              {
2494  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2495              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2496  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2497              }              }
2498            else            else
2499  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2500            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2501            }            }
2502          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2503            {            {
2504            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2505              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2506            else            else
2507              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2508            }            }
# Line 1605  for (;;) Line 2510  for (;;)
2510        break;        break;
2511    
2512        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2513          case OP_UPTOI:
2514          case OP_MINUPTOI:
2515          case OP_POSUPTOI:
2516          case OP_NOTUPTOI:
2517          case OP_NOTMINUPTOI:
2518          case OP_NOTPOSUPTOI:
2519          caseless = TRUE;
2520          codevalue -= OP_STARI - OP_STAR;
2521          /* Fall through */
2522        case OP_UPTO:        case OP_UPTO:
2523        case OP_MINUPTO:        case OP_MINUPTO:
2524        case OP_POSUPTO:        case OP_POSUPTO:
2525        case OP_NOTUPTO:        case OP_NOTUPTO:
2526        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2527        case OP_NOTPOSUPTO:        case OP_NOTPOSUPTO:
2528        ADD_ACTIVE(state_offset + dlen + 3, 0);        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2529        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2530        if (clen > 0)        if (clen > 0)
2531          {          {
2532          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2533          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2534            {            {
2535  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2536            if (utf8 && d >= 128)            if (utf && d >= 128)
2537              {              {
2538  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2539              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2540  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2541              }              }
2542            else            else
2543  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2544            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2545            }            }
2546          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2547            {            {
# Line 1637  for (;;) Line 2551  for (;;)
2551              next_active_state--;              next_active_state--;
2552              }              }
2553            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2554              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2555            else            else
2556              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2557            }            }
# Line 1654  for (;;) Line 2568  for (;;)
2568          {          {
2569          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2570          int next_state_offset;          int next_state_offset;
2571          const uschar *ecode;          const pcre_uchar *ecode;
2572    
2573          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2574          can set isinclass from it. */          can set isinclass from it. */
2575    
2576          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2577            {            {
2578            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2579            if (clen > 0)            if (clen > 0)
2580              {              {
2581              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2582                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2583              }              }
2584            }            }
2585    
# Line 1676  for (;;) Line 2590  for (;;)
2590          else          else
2591           {           {
2592           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2593           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2594           }           }
2595    
2596          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2597          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2598          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2599    
2600          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2601    
2602          switch (*ecode)          switch (*ecode)
2603            {            {
# Line 1710  for (;;) Line 2624  for (;;)
2624            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2625            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2626            if (count >= GET2(ecode, 1))            if (count >= GET2(ecode, 1))
2627              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2628            if (isinclass)            if (isinclass)
2629              {              {
2630              int max = GET2(ecode, 3);              int max = GET2(ecode, 1 + IMM2_SIZE);
2631              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2632                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2633              else              else
2634                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2635              }              }
# Line 1730  for (;;) Line 2644  for (;;)
2644    
2645  /* ========================================================================== */  /* ========================================================================== */
2646        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2647        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2648          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2649          though the other "backtracking verbs" are not supported. */
2650    
2651          case OP_FAIL:
2652          forced_fail++;    /* Count FAILs for multiple states */
2653          break;
2654    
2655        case OP_ASSERT:        case OP_ASSERT:
2656        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1740  for (;;) Line 2660  for (;;)
2660          int rc;          int rc;
2661          int local_offsets[2];          int local_offsets[2];
2662          int local_workspace[1000];          int local_workspace[1000];
2663          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2664    
2665          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2666    
# Line 1748  for (;;) Line 2668  for (;;)
2668            md,                                   /* static match data */            md,                                   /* static match data */
2669            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2670            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2671            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2672            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2673            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2674            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2675            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2676            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2677    
2678            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2679          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2680              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2681          }          }
2682        break;        break;
2683    
# Line 1768  for (;;) Line 2687  for (;;)
2687          {          {
2688          int local_offsets[1000];          int local_offsets[1000];
2689          int local_workspace[1000];          int local_workspace[1000];
2690          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2691            int condcode;
2692    
2693            /* Because of the way auto-callout works during compile, a callout item
2694            is inserted between OP_COND and an assertion condition. This does not
2695            happen for the other conditions. */
2696    
2697            if (code[LINK_SIZE+1] == OP_CALLOUT)
2698              {
2699              rrc = 0;
2700              if (PUBL(callout) != NULL)
2701                {
2702                PUBL(callout_block) cb;
2703                cb.version          = 1;   /* Version 1 of the callout block */
2704                cb.callout_number   = code[LINK_SIZE+2];
2705                cb.offset_vector    = offsets;
2706    #ifdef COMPILE_PCRE8
2707                cb.subject          = (PCRE_SPTR)start_subject;
2708    #else
2709                cb.subject          = (PCRE_SPTR16)start_subject;
2710    #endif
2711                cb.subject_length   = (int)(end_subject - start_subject);
2712                cb.start_match      = (int)(current_subject - start_subject);
2713                cb.current_position = (int)(ptr - start_subject);
2714                cb.pattern_position = GET(code, LINK_SIZE + 3);
2715                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2716                cb.capture_top      = 1;
2717                cb.capture_last     = -1;
2718                cb.callout_data     = md->callout_data;
2719                cb.mark             = NULL;   /* No (*MARK) support */
2720                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2721                }
2722              if (rrc > 0) break;                      /* Fail this thread */
2723              code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2724              }
2725    
2726            condcode = code[LINK_SIZE+1];
2727    
2728          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2729    
2730          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2731              return PCRE_ERROR_DFA_UCOND;
2732    
2733          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2734    
2735          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2736            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2737    
2738          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2739          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2740          recursed groups. */          recursed groups. */
2741    
2742          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2743            {            {
2744            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE + 2);
2745            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2746            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (md->recursive != NULL)
2747              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2748              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2749            }            }
2750    
2751          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1798  for (;;) Line 2753  for (;;)
2753          else          else
2754            {            {
2755            int rc;            int rc;
2756            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2757            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2758    
2759            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2760    
# Line 1807  for (;;) Line 2762  for (;;)
2762              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2763              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2764              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2765              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2766              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2767              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2768              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2769              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2770              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2771    
2772              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2773            if ((rc >= 0) ==            if ((rc >= 0) ==
2774                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2775              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2776            else            else
2777              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2778            }            }
2779          }          }
2780        break;        break;
# Line 1828  for (;;) Line 2782  for (;;)
2782        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2783        case OP_RECURSE:        case OP_RECURSE:
2784          {          {
2785            dfa_recursion_info *ri;
2786          int local_offsets[1000];          int local_offsets[1000];
2787          int local_workspace[1000];          int local_workspace[1000];
2788            const pcre_uchar *callpat = start_code + GET(code, 1);
2789            int recno = (callpat == md->start_code)? 0 :
2790              GET2(callpat, 1 + LINK_SIZE);
2791          int rc;          int rc;
2792    
2793          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2794            recursing + 1));  
2795            /* Check for repeating a recursion without advancing the subject
2796            pointer. This should catch convoluted mutual recursions. (Some simple
2797            cases are caught at compile time.) */
2798    
2799            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2800              if (recno == ri->group_num && ptr == ri->subject_position)
2801                return PCRE_ERROR_RECURSELOOP;
2802    
2803            /* Remember this recursion and where we started it so as to
2804            catch infinite loops. */
2805    
2806            new_recursive.group_num = recno;
2807            new_recursive.subject_position = ptr;
2808            new_recursive.prevrec = md->recursive;
2809            md->recursive = &new_recursive;
2810    
2811          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2812            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2813            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2814            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2815            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2816            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2817            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2818            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2819            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2820            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing + 1);                       /* regex recurse level */  
2821    
2822          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2823            recursing + 1, rc));  
2824            DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2825              rc));
2826    
2827          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2828    
# Line 1863  for (;;) Line 2836  for (;;)
2836            {            {
2837            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2838              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2839              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2840              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #ifdef SUPPORT_UTF
2841                if (utf)
2842                  {
2843                  const pcre_uchar *p = start_subject + local_offsets[rc];
2844                  const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2845                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2846                  }
2847    #endif
2848              if (charcount > 0)              if (charcount > 0)
2849                {                {
2850                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 1882  for (;;) Line 2860  for (;;)
2860        break;        break;
2861    
2862        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2863          case OP_BRAPOS:
2864          case OP_SBRAPOS:
2865          case OP_CBRAPOS:
2866          case OP_SCBRAPOS:
2867          case OP_BRAPOSZERO:
2868            {
2869            int charcount, matched_count;
2870            const pcre_uchar *local_ptr = ptr;
2871            BOOL allow_zero;
2872    
2873            if (codevalue == OP_BRAPOSZERO)
2874              {
2875              allow_zero = TRUE;
2876              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2877              }
2878            else allow_zero = FALSE;
2879    
2880            /* Loop to match the subpattern as many times as possible as if it were
2881            a complete pattern. */
2882    
2883            for (matched_count = 0;; matched_count++)
2884              {
2885              int local_offsets[2];
2886              int local_workspace[1000];
2887    
2888              int rc = internal_dfa_exec(
2889                md,                                   /* fixed match data */
2890                code,                                 /* this subexpression's code */
2891                local_ptr,                            /* where we currently are */
2892                (int)(ptr - start_subject),           /* start offset */
2893                local_offsets,                        /* offset vector */
2894                sizeof(local_offsets)/sizeof(int),    /* size of same */
2895                local_workspace,                      /* workspace vector */
2896                sizeof(local_workspace)/sizeof(int),  /* size of same */
2897                rlevel);                              /* function recursion level */
2898    
2899              /* Failed to match */
2900    
2901              if (rc < 0)
2902                {
2903                if (rc != PCRE_ERROR_NOMATCH) return rc;
2904                break;
2905                }
2906    
2907              /* Matched: break the loop if zero characters matched. */
2908    
2909              charcount = local_offsets[1] - local_offsets[0];
2910              if (charcount == 0) break;
2911              local_ptr += charcount;    /* Advance temporary position ptr */
2912              }
2913    
2914            /* At this point we have matched the subpattern matched_count
2915            times, and local_ptr is pointing to the character after the end of the
2916            last match. */
2917    
2918            if (matched_count > 0 || allow_zero)
2919              {
2920              const pcre_uchar *end_subpattern = code;
2921              int next_state_offset;
2922    
2923              do { end_subpattern += GET(end_subpattern, 1); }
2924                while (*end_subpattern == OP_ALT);
2925              next_state_offset =
2926                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2927    
2928              /* Optimization: if there are no more active states, and there
2929              are no new states yet set up, then skip over the subject string
2930              right here, to save looping. Otherwise, set up the new state to swing
2931              into action when the end of the matched substring is reached. */
2932    
2933              if (i + 1 >= active_count && new_count == 0)
2934                {
2935                ptr = local_ptr;
2936                clen = 0;
2937                ADD_NEW(next_state_offset, 0);
2938                }
2939              else
2940                {
2941                const pcre_uchar *p = ptr;
2942                const pcre_uchar *pp = local_ptr;
2943                charcount = (int)(pp - p);
2944    #ifdef SUPPORT_UTF
2945                if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2946    #endif
2947                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2948                }
2949              }
2950            }
2951          break;
2952    
2953          /*-----------------------------------------------------------------*/
2954        case OP_ONCE:        case OP_ONCE:
2955          case OP_ONCE_NC:
2956          {          {
2957          int local_offsets[2];          int local_offsets[2];
2958          int local_workspace[1000];          int local_workspace[1000];
# Line 1891  for (;;) Line 2961  for (;;)
2961            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2962            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2963            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2964            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2965            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2966            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2967            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2968            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2969            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2970    
2971          if (rc >= 0)          if (rc >= 0)
2972            {            {
2973            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2974            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2975            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2976    
2977            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2978              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2979            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2980                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2981    
2982            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2983            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 1916  for (;;) Line 2985  for (;;)
2985    
2986            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2987                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2988              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2989    
2990            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2991            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 1931  for (;;) Line 3000  for (;;)
3000            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
3001            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
3002            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
3003            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
3004    
3005            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
3006              {              {
# Line 1954  for (;;) Line 3023  for (;;)
3023              }              }
3024            else            else
3025              {              {
3026              const uschar *p = start_subject + local_offsets[0];  #ifdef SUPPORT_UTF
3027              const uschar *pp = start_subject + local_offsets[1];              if (utf)
3028              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;                {
3029                  const pcre_uchar *p = start_subject + local_offsets[0];
3030                  const pcre_uchar *pp = start_subject + local_offsets[1];
3031                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
3032                  }
3033    #endif
3034              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
3035              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
3036                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
3037              }              }
   
3038            }            }
3039          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
3040          }          }
# Line 1972  for (;;) Line 3045  for (;;)
3045        /* Handle callouts */        /* Handle callouts */
3046    
3047        case OP_CALLOUT:        case OP_CALLOUT:
3048        if (pcre_callout != NULL)        rrc = 0;
3049          if (PUBL(callout) != NULL)
3050          {          {
3051          int rrc;          PUBL(callout_block) cb;
         pcre_callout_block cb;  
3052          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
3053          cb.callout_number   = code[1];          cb.callout_number   = code[1];
3054          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
3055    #ifdef COMPILE_PCRE8
3056          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
3057          cb.subject_length   = end_subject - start_subject;  #else
3058          cb.start_match      = current_subject - start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
3059          cb.current_position = ptr - start_subject;  #endif
3060            cb.subject_length   = (int)(end_subject - start_subject);
3061            cb.start_match      = (int)(current_subject - start_subject);
3062            cb.current_position = (int)(ptr - start_subject);
3063          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
3064          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
3065          cb.capture_top      = 1;          cb.capture_top      = 1;
3066          cb.capture_last     = -1;          cb.capture_last     = -1;
3067          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
3068          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
3069          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3070          }          }
3071          if (rrc == 0)
3072            { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3073        break;        break;
3074    
3075    
# Line 2006  for (;;) Line 3085  for (;;)
3085    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
3086    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
3087    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
3088    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
3089    
3090      The "forced_fail" variable counts the number of (*F) encountered for the
3091      character. If it is equal to the original active_count (saved in
3092      workspace[1]) it means that (*F) was found on every active state. In this
3093      case we don't want to give a partial match.
3094    
3095      The "could_continue" variable is true if a state could have continued but
3096      for the fact that the end of the subject was reached. */
3097    
3098    if (new_count <= 0)    if (new_count <= 0)
3099      {      {
3100      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
3101          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on, and */
3102          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
3103          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
3104          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3105            ||                                           /* or... */
3106            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3107             match_count < 0)                            /* no matches */
3108            ) &&                                         /* And... */
3109            (
3110            partial_newline ||                           /* Either partial NL */
3111              (                                          /* or ... */
3112              ptr >= end_subject &&                /* End of subject and */
3113              ptr > md->start_used_ptr)            /* Inspected non-empty string */
3114              )
3115            )
3116        {        {
3117        if (offsetcount >= 2)        if (offsetcount >= 2)
3118          {          {
3119          offsets[0] = current_subject - start_subject;          offsets[0] = (int)(md->start_used_ptr - start_subject);
3120          offsets[1] = end_subject - start_subject;          offsets[1] = (int)(end_subject - start_subject);
3121          }          }
3122        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
3123        }        }
# Line 2073  Returns:          > 0 => number of match Line 3171  Returns:          > 0 => number of match
3171                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3172  */  */
3173    
3174  PCRE_EXP_DEFN int  #ifdef COMPILE_PCRE8
3175    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3176  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3177    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3178    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3179    #else
3180    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3181    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3182      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3183      int offsetcount, int *workspace, int wscount)
3184    #endif
3185  {  {
3186  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3187  dfa_match_data match_block;  dfa_match_data match_block;
3188  dfa_match_data *md = &match_block;  dfa_match_data *md = &match_block;
3189  BOOL utf8, anchored, startline, firstline;  BOOL utf, anchored, startline, firstline;
3190  const uschar *current_subject, *end_subject, *lcc;  const pcre_uchar *current_subject, *end_subject;
   
 pcre_study_data internal_study;  
3191  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3192    
3193  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3194  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3195  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3196  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3197  int first_byte = -1;  pcre_uchar first_char = 0;
3198  int req_byte = -1;  pcre_uchar first_char2 = 0;
3199  int req_byte2 = -1;  pcre_uchar req_char = 0;
3200    pcre_uchar req_char2 = 0;
3201  int newline;  int newline;
3202    
3203  /* Plausibility checks */  /* Plausibility checks */
# Line 2104  if (re == NULL || subject == NULL || wor Line 3207  if (re == NULL || subject == NULL || wor
3207     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3208  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3209  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3210    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3211    
3212    /* Check that the first field in the block is the magic number. If it is not,
3213    return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3214    REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3215    means that the pattern is likely compiled with different endianness. */
3216    
3217    if (re->magic_number != MAGIC_NUMBER)
3218      return re->magic_number == REVERSED_MAGIC_NUMBER?
3219        PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3220    if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3221    
3222    /* If restarting after a partial match, do some sanity checks on the contents
3223    of the workspace. */
3224    
3225    if ((options & PCRE_DFA_RESTART) != 0)
3226      {
3227      if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3228        workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3229          return PCRE_ERROR_DFA_BADRESTART;
3230      }
3231    
3232  /* We need to find the pointer to any study data before we test for byte  /* Set up study, callout, and table data */
 flipping, so we scan the extra_data block first. This may set two fields in the  
 match block, so we must initialize them beforehand. However, the other fields  
 in the match block must not be set until after the byte flipping. */  
3233    
3234  md->tables = re->tables;  md->tables = re->tables;
3235  md->callout_data = NULL;  md->callout_data = NULL;
# Line 2127  if (extra_data != NULL) Line 3248  if (extra_data != NULL)
3248      md->tables = extra_data->tables;      md->tables = extra_data->tables;
3249    }    }
3250    
 /* Check that the first field in the block is the magic number. If it is not,  
 test for a regex that was compiled on a host of opposite endianness. If this is  
 the case, flipped values are put in internal_re and internal_study if there was  
 study data too. */  
   
 if (re->magic_number != MAGIC_NUMBER)  
   {  
   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);  
   if (re == NULL) return PCRE_ERROR_BADMAGIC;  
   if (study != NULL) study = &internal_study;  
   }  
   
3251  /* Set some local values */  /* Set some local values */
3252    
3253  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3254  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3255  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3256    
3257  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3258  utf8 = (re->options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3259    utf = (re->options & PCRE_UTF8) != 0;
3260  #else  #else
3261  utf8 = FALSE;  utf = FALSE;
3262  #endif  #endif
3263    
3264  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
# Line 2156  anchored = (options & (PCRE_ANCHORED|PCR Line 3266  anchored = (options & (PCRE_ANCHORED|PCR
3266    
3267  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3268    
3269  md->start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3270      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3271  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3272  md->end_subject = end_subject;  md->end_subject = end_subject;
3273    md->start_offset = start_offset;
3274  md->moptions = options;  md->moptions = options;
3275  md->poptions = re->options;  md->poptions = re->options;
3276    
3277    /* If the BSR option is not set at match time, copy what was set
3278    at compile time. */
3279    
3280    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3281      {
3282      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3283        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3284    #ifdef BSR_ANYCRLF
3285      else md->moptions |= PCRE_BSR_ANYCRLF;
3286    #endif
3287      }
3288    
3289  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
3290  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
3291    
# Line 2170  switch ((((options & PCRE_NEWLINE_BITS) Line 3293  switch ((((options & PCRE_NEWLINE_BITS)
3293           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
3294    {    {
3295    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
3296    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3297    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3298    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
3299         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3300    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
3301      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3302    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
3303    }    }
3304    
3305  if (newline < 0)  if (newline == -2)
3306      {
3307      md->nltype = NLTYPE_ANYCRLF;
3308      }
3309    else if (newline < 0)
3310    {    {
3311    md->nltype = NLTYPE_ANY;    md->nltype = NLTYPE_ANY;
3312    }    }
# Line 2201  else Line 3329  else
3329  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3330  back the character offset. */  back the character offset. */
3331    
3332  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3333  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3334    {    {
3335    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3336      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3337    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3338      {      {
3339      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3340        {        {
3341        tb &= 0xc0;        offsets[0] = erroroffset;
3342        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3343        }        }
3344        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3345          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3346      }      }
3347      if (start_offset > 0 && start_offset < length &&
3348            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3349        return PCRE_ERROR_BADUTF8_OFFSET;
3350    }    }
3351  #endif  #endif
3352    
# Line 2222  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3354  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3354  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3355  in other programs later. */  in other programs later. */
3356    
3357  if (md->tables == NULL) md->tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3358    
3359  /* The lower casing table and the "must be at the start of a line" flag are  /* The "must be at the start of a line" flags are used in a loop when finding
3360  used in a loop when finding where to start. */  where to start. */
3361    
3362  lcc = md->tables + lcc_offset;  startline = (re->flags & PCRE_STARTLINE) != 0;
 startline = (re->options & PCRE_STARTLINE) != 0;  
3363  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3364    
3365  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2239  studied, there may be a bitmap of possib Line 3370  studied, there may be a bitmap of possib
3370    
3371  if (!anchored)  if (!anchored)
3372    {    {
3373    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3374      {      {
3375      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3376      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = (pcre_uchar)(re->first_char);
3377        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3378          {
3379          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3380    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3381          if (utf && first_char > 127)
3382            first_char2 = UCD_OTHERCASE(first_char);
3383    #endif
3384          }
3385      }      }
3386    else    else
3387      {      {
3388      if (startline && study != NULL &&      if (!startline && study != NULL &&
3389           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3390        start_bits = study->start_bits;        start_bits = study->start_bits;
3391      }      }
3392    }    }
# Line 2256  if (!anchored) Line 3394  if (!anchored)
3394  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
3395  character" set. */  character" set. */
3396    
3397  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3398    {    {
3399    req_byte = re->req_byte & 255;    has_req_char = TRUE;
3400    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_char = req_char2 = (pcre_uchar)(re->req_char);
3401    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3402        {
3403        req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3404    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3405        if (utf && req_char > 127)
3406          req_char2 = UCD_OTHERCASE(req_char);
3407    #endif
3408        }
3409    }    }
3410    
3411  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
3412  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
3413  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
3414    
3415  for (;;)  for (;;)
3416    {    {
# Line 2274  for (;;) Line 3418  for (;;)
3418    
3419    if ((options & PCRE_DFA_RESTART) == 0)    if ((options & PCRE_DFA_RESTART) == 0)
3420      {      {
3421      const uschar *save_end_subject = end_subject;      const pcre_uchar *save_end_subject = end_subject;
3422    
3423      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
3424      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
3425      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
3426      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
3427    
3428      if (firstline)      if (firstline)
3429        {        {
3430        const uschar *t = current_subject;        PCRE_PUCHAR t = current_subject;
3431    #ifdef SUPPORT_UTF
3432          if (utf)
3433            {
3434            while (t < md->end_subject && !IS_NEWLINE(t))
3435              {
3436              t++;
3437              ACROSSCHAR(t < end_subject, *t, t++);
3438              }
3439            }
3440          else
3441    #endif
3442        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3443        end_subject = t;        end_subject = t;
3444        }        }
3445    
3446      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
3447        starting point is not found. However, there is an option that disables
3448        these, for testing and for ensuring that all callouts do actually occur.
3449        The option can be set in the regex by (*NO_START_OPT) or passed in
3450        match-time options. */
3451    
3452        if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3453        {        {
3454        if (first_byte_caseless)        /* Advance to a known first char. */
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
3455    
3456      /* Or to just after a linebreak for a multiline match if possible */        if (has_first_char)
3457            {
3458            if (first_char != first_char2)
3459              while (current_subject < end_subject &&
3460                  *current_subject != first_char && *current_subject != first_char2)
3461                current_subject++;
3462            else
3463              while (current_subject < end_subject &&
3464                     *current_subject != first_char)
3465                current_subject++;
3466            }
3467    
3468      else if (startline)        /* Or to just after a linebreak for a multiline match if possible */
3469        {  
3470        if (current_subject > md->start_subject + start_offset)        else if (startline)
3471          {          {
3472          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (current_subject > md->start_subject + start_offset)
3473            current_subject++;            {
3474    #ifdef SUPPORT_UTF
3475              if (utf)
3476                {
3477                while (current_subject < end_subject &&
3478                       !WAS_NEWLINE(current_subject))
3479                  {
3480                  current_subject++;
3481                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3482                    current_subject++);
3483                  }
3484                }
3485              else
3486    #endif
3487              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3488                current_subject++;
3489    
3490          /* If we have just passed a CR and the newline option is ANY, and we            /* If we have just passed a CR and the newline option is ANY or
3491          are now at a LF, advance the match position by one more character. */            ANYCRLF, and we are now at a LF, advance the match position by one
3492              more character. */
3493    
3494          if (current_subject[-1] == '\r' &&            if (current_subject[-1] == CHAR_CR &&
3495               md->nltype == NLTYPE_ANY &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3496               current_subject < end_subject &&                 current_subject < end_subject &&
3497               *current_subject == '\n')                 *current_subject == CHAR_NL)
3498            current_subject++;              current_subject++;
3499              }
3500          }          }
       }  
3501    
3502      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
3503    
3504      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
3505          {          {
3506          register unsigned int c = *current_subject;          while (current_subject < end_subject)
3507          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
3508              register unsigned int c = *current_subject;
3509    #ifndef COMPILE_PCRE8
3510              if (c > 255) c = 255;
3511    #endif
3512              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3513                {
3514                current_subject++;
3515    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3516                /* In non 8-bit mode, the iteration will stop for
3517                characters > 255 at the beginning or not stop at all. */
3518                if (utf)
3519                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3520                    current_subject++);
3521    #endif
3522                }
3523            else break;            else break;
3524              }
3525          }          }
3526        }        }
3527    
3528      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3529    
3530      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
   
   /* If req_byte is set, we know that that character must appear in the subject  
   for the match to succeed. If the first character is set, req_byte must be  
   later in the subject; otherwise the test starts at the match point. This  
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested.  
   */  
   
   if (req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
3531    
3532      /* We don't need to repeat the search if we haven't yet reached the      /* The following two optimizations are disabled for partial matching or if
3533      place we found it at last time. */      disabling is explicitly requested (and of course, by the test above, this
3534        code is not obeyed when restarting after a partial match). */
3535    
3536      if (p > req_byte_ptr)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3537            (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3538        {        {
3539        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3540          {        is a lower bound; no actual string of that length may actually match the
3541          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3542            {        bytes to avoid spending too much time in this optimization. */
3543            register int pp = *p++;  
3544            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3545            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3546          }          return PCRE_ERROR_NOMATCH;
3547        else  
3548          /* If req_char is set, we know that that character must appear in the
3549          subject for the match to succeed. If the first character is set, req_char
3550          must be later in the subject; otherwise the test starts at the match
3551          point. This optimization can save a huge amount of work in patterns with
3552          nested unlimited repeats that aren't going to match. Writing separate
3553          code for cased/caseless versions makes it go faster, as does using an
3554          autoincrement and backing off on a match.
3555    
3556          HOWEVER: when the subject string is very, very long, searching to its end
3557          can take a long time, and give bad performance on quite ordinary
3558          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3559          string... so we don't do this when the string is sufficiently long. */
3560    
3561          if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3562          {          {
3563          while (p < end_subject)          register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3564    
3565            /* We don't need to repeat the search if we haven't yet reached the
3566            place we found it at last time. */
3567    
3568            if (p > req_char_ptr)
3569            {            {
3570            if (*p++ == req_byte) { p--; break; }            if (req_char != req_char2)
3571            }              {
3572          }              while (p < end_subject)
3573                  {
3574                  register int pp = *p++;
3575                  if (pp == req_char || pp == req_char2) { p--; break; }
3576                  }
3577                }
3578              else
3579                {
3580                while (p < end_subject)
3581                  {
3582                  if (*p++ == req_char) { p--; break; }
3583                  }
3584                }
3585    
3586        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3587        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3588    
3589        if (p >= end_subject) break;            if (p >= end_subject) break;
3590    
3591        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3592        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3593        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3594    
3595        req_byte_ptr = p;            req_char_ptr = p;
3596              }
3597            }
3598        }        }
3599      }      }   /* End of optimizations that are done when not restarting */
3600    
3601    /* OK, now we can do the business */    /* OK, now we can do the business */
3602    
3603      md->start_used_ptr = current_subject;
3604      md->recursive = NULL;
3605    
3606    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3607      md,                                /* fixed match data */      md,                                /* fixed match data */
3608      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2404  for (;;) Line 3612  for (;;)
3612      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3613      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3614      wscount,                           /* size of same */      wscount,                           /* size of same */
3615      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3616    
3617    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3618    on only if not anchored. */    on only if not anchored. */
# Line 2418  for (;;) Line 3624  for (;;)
3624    
3625    if (firstline && IS_NEWLINE(current_subject)) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3626    current_subject++;    current_subject++;
3627    if (utf8)  #ifdef SUPPORT_UTF
3628      if (utf)
3629      {      {
3630      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      ACROSSCHAR(current_subject < end_subject, *current_subject,
3631        current_subject++;        current_subject++);
3632      }      }
3633    #endif
3634    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3635    
3636    /* If we have just passed a CR and the newline option is CRLF or ANY, and we    /* If we have just passed a CR and we are now at a LF, and the pattern does
3637    are now at a LF, advance the match position by one more character. */    not contain any explicit matches for \r or \n, and the newline option is CRLF
3638      or ANY or ANYCRLF, advance the match position by one more character. */
3639    if (current_subject[-1] == '\r' &&  
3640         (md->nltype == NLTYPE_ANY || md->nllen == 2) &&    if (current_subject[-1] == CHAR_CR &&
3641         current_subject < end_subject &&        current_subject < end_subject &&
3642         *current_subject == '\n')        *current_subject == CHAR_NL &&
3643          (re->flags & PCRE_HASCRORLF) == 0 &&
3644            (md->nltype == NLTYPE_ANY ||
3645             md->nltype == NLTYPE_ANYCRLF ||
3646             md->nllen == 2))
3647      current_subject++;      current_subject++;
3648    
3649    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.145  
changed lines
  Added in v.1033

  ViewVC Help
Powered by ViewVC 1.1.5