/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 85 by nigel, Sat Feb 24 21:41:13 2007 UTC revision 1364 by ph10, Sat Oct 5 15:45:11 2013 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2013 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 37  POSSIBILITY OF SUCH DAMAGE. Line 38  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
39  */  */
40    
   
41  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
42  alternative matching function that uses a DFA algorithm. This is NOT Perl-  alternative matching function that uses a sort of DFA algorithm (not a true
43  compatible, but it has advantages in certain applications. */  FSM). This is NOT Perl-compatible, but it has advantages in certain
44    applications. */
45    
46    
47    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48    the performance of his patterns greatly. I could not use it as it stood, as it
49    was not thread safe, and made assumptions about pattern sizes. Also, it caused
50    test 7 to loop, and test 9 to crash with a segfault.
51    
52    The issue is the check for duplicate states, which is done by a simple linear
53    search up the state list. (Grep for "duplicate" below to find the code.) For
54    many patterns, there will never be many states active at one time, so a simple
55    linear search is fine. In patterns that have many active states, it might be a
56    bottleneck. The suggested code used an indexing scheme to remember which states
57    had previously been used for each character, and avoided the linear search when
58    it knew there was no chance of a duplicate. This was implemented when adding
59    states to the state lists.
60    
61    I wrote some thread-safe, not-limited code to try something similar at the time
62    of checking for duplicates (instead of when adding states), using index vectors
63    on the stack. It did give a 13% improvement with one specially constructed
64    pattern for certain subject strings, but on other strings and on many of the
65    simpler patterns in the test suite it did worse. The major problem, I think,
66    was the extra time to initialize the index. This had to be done for each call
67    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68    only once - I suspect this was the cause of the problems with the tests.)
69    
70    Overall, I concluded that the gains in some cases did not outweigh the losses
71    in others, so I abandoned this code. */
72    
73    
74    
75    #ifdef HAVE_CONFIG_H
76    #include "config.h"
77    #endif
78    
79    #define NLBLOCK md             /* Block containing newline information */
80    #define PSSTART start_subject  /* Field containing processed string start */
81    #define PSEND   end_subject    /* Field containing processed string end */
82    
83  #include "pcre_internal.h"  #include "pcre_internal.h"
84    
# Line 51  compatible, but it has advantages in cer Line 88  compatible, but it has advantages in cer
88  #define SP "                   "  #define SP "                   "
89    
90    
   
91  /*************************************************  /*************************************************
92  *      Code parameters and static tables         *  *      Code parameters and static tables         *
93  *************************************************/  *************************************************/
94    
95  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96  into others, under special conditions. A gap of 10 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
97  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
98    never stored, so we push them well clear of the normal opcodes. */
99  #define OP_PROP_EXTRA    (EXTRACT_BASIC_MAX+1)  
100  #define OP_EXTUNI_EXTRA  (EXTRACT_BASIC_MAX+11)  #define OP_PROP_EXTRA       300
101    #define OP_EXTUNI_EXTRA     320
102    #define OP_ANYNL_EXTRA      340
103    #define OP_HSPACE_EXTRA     360
104    #define OP_VSPACE_EXTRA     380
105    
106    
107  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
108  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
109  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
110  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111  small value. */  small value. Non-zero values in the table are the offsets from the opcode where
112    the character is to be found. ***NOTE*** If the start of this table is
113    modified, the three tables that follow must also be modified. */
114    
115  static uschar coptable[] = {  static const pcre_uint8 coptable[] = {
116    0,                             /* End                                    */    0,                             /* End                                    */
117    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118    0, 0,                          /* Any, Anybyte                           */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0,                          /* \P, \p                                 */
121      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122      0,                             /* \X                                     */
123      0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
124    1,                             /* Char                                   */    1,                             /* Char                                   */
125    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
126    1,                             /* not                                    */    1,                             /* not                                    */
127      1,                             /* noti                                   */
128    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
129    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131      1+IMM2_SIZE,                   /* exact                                  */
132      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135      1+IMM2_SIZE,                   /* exact I                                */
136      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
138    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140      1+IMM2_SIZE,                   /* NOT exact                              */
141      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144      1+IMM2_SIZE,                   /* NOT exact I                            */
145      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
147    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149      1+IMM2_SIZE,                   /* Type exact                             */
150      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
152    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 95  static uschar coptable[] = { Line 155  static uschar coptable[] = {
155    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
156    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
157    0,                             /* REF                                    */    0,                             /* REF                                    */
158      0,                             /* REFI                                   */
159      0,                             /* DNREF                                  */
160      0,                             /* DNREFI                                 */
161    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
162    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
163    0,                             /* Alt                                    */    0,                             /* Alt                                    */
164    0,                             /* Ket                                    */    0,                             /* Ket                                    */
165    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
166    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
167      0,                             /* KetRpos                                */
168      0,                             /* Reverse                                */
169    0,                             /* Assert                                 */    0,                             /* Assert                                 */
170    0,                             /* Assert not                             */    0,                             /* Assert not                             */
171    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
172    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
173      0, 0,                          /* ONCE, ONCE_NC                          */
174      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
175      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
176      0, 0,                          /* CREF, NCREF                            */
177      0, 0,                          /* RREF, NRREF                            */
178      0,                             /* DEF                                    */
179      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
180      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
181      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
182      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
183      0, 0                           /* CLOSE, SKIPZERO                        */
184    };
185    
186    /* This table identifies those opcodes that inspect a character. It is used to
187    remember the fact that a character could have been inspected when the end of
188    the subject is reached. ***NOTE*** If the start of this table is modified, the
189    two tables that follow must also be modified. */
190    
191    static const pcre_uint8 poptable[] = {
192      0,                             /* End                                    */
193      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
194      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
195      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
196      1, 1,                          /* \P, \p                                 */
197      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
198      1,                             /* \X                                     */
199      0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
200      1,                             /* Char                                   */
201      1,                             /* Chari                                  */
202      1,                             /* not                                    */
203      1,                             /* noti                                   */
204      /* Positive single-char repeats                                          */
205      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
206      1, 1, 1,                       /* upto, minupto, exact                   */
207      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
208      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
209      1, 1, 1,                       /* upto I, minupto I, exact I             */
210      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
211      /* Negative single-char repeats - only for chars < 256                   */
212      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
213      1, 1, 1,                       /* NOT upto, minupto, exact               */
214      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
215      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
216      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
217      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
218      /* Positive type repeats                                                 */
219      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
220      1, 1, 1,                       /* Type upto, minupto, exact              */
221      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
222      /* Character class & ref repeats                                         */
223      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
224      1, 1,                          /* CRRANGE, CRMINRANGE                    */
225      1,                             /* CLASS                                  */
226      1,                             /* NCLASS                                 */
227      1,                             /* XCLASS - variable length               */
228      0,                             /* REF                                    */
229      0,                             /* REFI                                   */
230      0,                             /* DNREF                                  */
231      0,                             /* DNREFI                                 */
232      0,                             /* RECURSE                                */
233      0,                             /* CALLOUT                                */
234      0,                             /* Alt                                    */
235      0,                             /* Ket                                    */
236      0,                             /* KetRmax                                */
237      0,                             /* KetRmin                                */
238      0,                             /* KetRpos                                */
239    0,                             /* Reverse                                */    0,                             /* Reverse                                */
240    0,                             /* Once                                   */    0,                             /* Assert                                 */
241    0,                             /* COND                                   */    0,                             /* Assert not                             */
242    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
243    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0,                             /* Assert behind not                      */
244    0,                             /* BRANUMBER                              */    0, 0,                          /* ONCE, ONCE_NC                          */
245    0                              /* BRA                                    */    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
246      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
247      0, 0,                          /* CREF, NCREF                            */
248      0, 0,                          /* RREF, NRREF                            */
249      0,                             /* DEF                                    */
250      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
251      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
252      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
253      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
254      0, 0                           /* CLOSE, SKIPZERO                        */
255  };  };
256    
257  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
258  and \w */  and \w */
259    
260  static uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
261    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
262    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
263    ctype_space, ctype_space,    ctype_space, ctype_space,
264    ctype_word,  ctype_word,    ctype_word,  ctype_word,
265    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
266  };  };
267    
268  static uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
269    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
270    ctype_digit, 0,    ctype_digit, 0,
271    ctype_space, 0,    ctype_space, 0,
272    ctype_word,  0,    ctype_word,  0,
273    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
274  };  };
275    
276    
# Line 142  these structures in, is a vector of ints Line 282  these structures in, is a vector of ints
282  typedef struct stateblock {  typedef struct stateblock {
283    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
284    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
285    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
286  } stateblock;  } stateblock;
287    
288  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
289    
290    
291  #ifdef DEBUG  #ifdef PCRE_DEBUG
292  /*************************************************  /*************************************************
293  *             Print character string             *  *             Print character string             *
294  *************************************************/  *************************************************/
# Line 165  Returns:       nothing Line 304  Returns:       nothing
304  */  */
305    
306  static void  static void
307  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
308  {  {
309  int c;  pcre_uint32 c;
310  while (length-- > 0)  while (length-- > 0)
311    {    {
312    if (isprint(c = *(p++)))    if (isprint(c = *(p++)))
313      fprintf(f, "%c", c);      fprintf(f, "%c", c);
314    else    else
315      fprintf(f, "\\x%02x", c);      fprintf(f, "\\x{%02x}", c);
316    }    }
317  }  }
318  #endif  #endif
# Line 198  Arguments: Line 337  Arguments:
337    offsetcount       size of same    offsetcount       size of same
338    workspace         vector of workspace    workspace         vector of workspace
339    wscount           size of same    wscount           size of same
   ims               the current ims flags  
340    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
341    
342  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
343                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
344                       -1 => failed to match                       -1 => failed to match
345                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
346    
# Line 215  for the current character, one for the f Line 352  for the current character, one for the f
352      { \      { \
353      next_active_state->offset = (x); \      next_active_state->offset = (x); \
354      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
355      next_active_state++; \      next_active_state++; \
356      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
357      } \      } \
# Line 226  for the current character, one for the f Line 362  for the current character, one for the f
362      { \      { \
363      next_active_state->offset = (x); \      next_active_state->offset = (x); \
364      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
365      next_active_state->data   = (z); \      next_active_state->data   = (z); \
366      next_active_state++; \      next_active_state++; \
367      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 238  for the current character, one for the f Line 373  for the current character, one for the f
373      { \      { \
374      next_new_state->offset = (x); \      next_new_state->offset = (x); \
375      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
376      next_new_state++; \      next_new_state++; \
377      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
378      } \      } \
# Line 249  for the current character, one for the f Line 383  for the current character, one for the f
383      { \      { \
384      next_new_state->offset = (x); \      next_new_state->offset = (x); \
385      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
386      next_new_state->data   = (z); \      next_new_state->data   = (z); \
387      next_new_state++; \      next_new_state++; \
388      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
389          (x), (y), (z), __LINE__)); \
390      } \      } \
391    else return PCRE_ERROR_DFA_WSSIZE    else return PCRE_ERROR_DFA_WSSIZE
392    
# Line 261  for the current character, one for the f Line 395  for the current character, one for the f
395  static int  static int
396  internal_dfa_exec(  internal_dfa_exec(
397    dfa_match_data *md,    dfa_match_data *md,
398    const uschar *this_start_code,    const pcre_uchar *this_start_code,
399    const uschar *current_subject,    const pcre_uchar *current_subject,
400    int start_offset,    int start_offset,
401    int *offsets,    int *offsets,
402    int offsetcount,    int offsetcount,
403    int *workspace,    int *workspace,
404    int wscount,    int wscount,
405    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
406  {  {
407  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
408  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
409    
410  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
411  const uschar *ptr;  const pcre_uchar *ptr;
412  const uschar *end_code;  const pcre_uchar *end_code, *first_op;
413    
414    dfa_recursion_info new_recursive;
415    
416  int active_count, new_count, match_count;  int active_count, new_count, match_count;
417    
418  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
419  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
420    
421  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
422  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
423  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
424    
425    #ifdef SUPPORT_UTF
426    BOOL utf = (md->poptions & PCRE_UTF8) != 0;
427    #else
428    BOOL utf = FALSE;
429    #endif
430    
431  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL reset_could_continue = FALSE;
432    
433  rlevel++;  rlevel++;
434  offsetcount &= (-2);  offsetcount &= (-2);
# Line 298  wscount = (wscount - (wscount % (INTS_PE Line 438  wscount = (wscount - (wscount % (INTS_PE
438            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
439    
440  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
441    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
442    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
443    
444  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
445  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 311  active_states = (stateblock *)(workspace Line 451  active_states = (stateblock *)(workspace
451  next_new_state = new_states = active_states + wscount;  next_new_state = new_states = active_states + wscount;
452  new_count = 0;  new_count = 0;
453    
454    first_op = this_start_code + 1 + LINK_SIZE +
455      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
456        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
457        ? IMM2_SIZE:0);
458    
459  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
460  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
461  makes it possible to use this function recursively, when we want to stop at a  makes it possible to use this function recursively, when we want to stop at a
# Line 320  If the first opcode in the first alterna Line 465  If the first opcode in the first alterna
465  a backward assertion. In that case, we have to find out the maximum amount to  a backward assertion. In that case, we have to find out the maximum amount to
466  move back, and set up each alternative appropriately. */  move back, and set up each alternative appropriately. */
467    
468  if (this_start_code[1+LINK_SIZE] == OP_REVERSE)  if (*first_op == OP_REVERSE)
469    {    {
470    int max_back = 0;    int max_back = 0;
471    int gone_back;    int gone_back;
# Line 337  if (this_start_code[1+LINK_SIZE] == OP_R Line 482  if (this_start_code[1+LINK_SIZE] == OP_R
482    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
483    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
484    
485  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
486    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
487    
488    if (utf8)    if (utf)
489      {      {
490      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
491        {        {
492        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
493        current_subject--;        current_subject--;
494        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
495        }        }
496      }      }
497    else    else
# Line 358  if (this_start_code[1+LINK_SIZE] == OP_R Line 501  if (this_start_code[1+LINK_SIZE] == OP_R
501    
502      {      {
503      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
504        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
505      current_subject -= gone_back;      current_subject -= gone_back;
506      }      }
507    
508      /* Save the earliest consulted character */
509    
510      if (current_subject < md->start_used_ptr)
511        md->start_used_ptr = current_subject;
512    
513    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
514    
515    end_code = this_start_code;    end_code = this_start_code;
# Line 370  if (this_start_code[1+LINK_SIZE] == OP_R Line 518  if (this_start_code[1+LINK_SIZE] == OP_R
518      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
519      if (back <= gone_back)      if (back <= gone_back)
520        {        {
521        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
522        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
523        }        }
524      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 402  else Line 550  else
550    
551    else    else
552      {      {
553        int length = 1 + LINK_SIZE +
554          ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
555            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
556            ? IMM2_SIZE:0);
557      do      do
558        {        {
559        ADD_NEW(end_code - start_code + 1 + LINK_SIZE, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
560        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
561          length = 1 + LINK_SIZE;
562        }        }
563      while (*end_code == OP_ALT);      while (*end_code == OP_ALT);
564      }      }
# Line 413  else Line 566  else
566    
567  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
568    
569  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
570    
571  /* Loop for scanning the subject */  /* Loop for scanning the subject */
572    
# Line 421  ptr = current_subject; Line 574  ptr = current_subject;
574  for (;;)  for (;;)
575    {    {
576    int i, j;    int i, j;
577    int c, d, clen, dlen;    int clen, dlen;
578      pcre_uint32 c, d;
579      int forced_fail = 0;
580      BOOL partial_newline = FALSE;
581      BOOL could_continue = reset_could_continue;
582      reset_could_continue = FALSE;
583    
584    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
585    new state list. */    new state list. */
# Line 435  for (;;) Line 593  for (;;)
593    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
594    workspace[1] = active_count;    workspace[1] = active_count;
595    
596  #ifdef DEBUG  #ifdef PCRE_DEBUG
597    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
598    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
599    printf("\"\n");    printf("\"\n");
600    
601    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 457  for (;;) Line 615  for (;;)
615    
616    if (ptr < end_subject)    if (ptr < end_subject)
617      {      {
618      clen = 1;      clen = 1;        /* Number of data items in the character */
619  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
620      if (utf8) { GETCHARLEN(c, ptr, clen); } else      GETCHARLENTEST(c, ptr, clen);
621  #endif  /* SUPPORT_UTF8 */  #else
622      c = *ptr;      c = *ptr;
623    #endif  /* SUPPORT_UTF */
624      }      }
625    else    else
626      {      {
627      clen = 0;    /* At end subject */      clen = 0;        /* This indicates the end of the subject */
628      c = -1;      c = NOTACHAR;    /* This value should never actually be used */
629      }      }
630    
631    /* Scan up the active states and act on each one. The result of an action    /* Scan up the active states and act on each one. The result of an action
# Line 477  for (;;) Line 636  for (;;)
636    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
637      {      {
638      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
639      const uschar *code;      BOOL caseless = FALSE;
640        const pcre_uchar *code;
641      int state_offset = current_state->offset;      int state_offset = current_state->offset;
642      int count, codevalue;      int codevalue, rrc;
643      int chartype, othercase;      int count;
644    
645  #ifdef DEBUG  #ifdef PCRE_DEBUG
646      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
647      if (c < 0) printf("-1\n");      if (clen == 0) printf("EOL\n");
648        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
649          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
650  #endif  #endif
651    
     /* This variable is referred to implicitly in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
652      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
653      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
654      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
655        state, arrange for it to be passed on. */
656    
657      if (state_offset < 0)      if (state_offset < 0)
658        {        {
# Line 504  for (;;) Line 661  for (;;)
661          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
662          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
663            current_state->data - 1);            current_state->data - 1);
664            if (could_continue) reset_could_continue = TRUE;
665          continue;          continue;
666          }          }
667        else        else
# Line 512  for (;;) Line 670  for (;;)
670          }          }
671        }        }
672    
673      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
674        See the note at the head of this module about the possibility of improving
675        performance here. */
676    
677      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
678        {        {
# Line 528  for (;;) Line 688  for (;;)
688    
689      code = start_code + state_offset;      code = start_code + state_offset;
690      codevalue = *code;      codevalue = *code;
691      if (codevalue >= OP_BRA) codevalue = OP_BRA; /* All brackets are equal */  
692        /* If this opcode inspects a character, but we are at the end of the
693        subject, remember the fact for use when testing for a partial match. */
694    
695        if (clen == 0 && poptable[codevalue] != 0)
696          could_continue = TRUE;
697    
698      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
699      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
# Line 536  for (;;) Line 701  for (;;)
701      permitted.      permitted.
702    
703      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
704      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long because
705      Unfortunately, we have to take special action to deal with  \P, \p, and      the values are small. We have to take special action to deal with  \P, \p,
706      \X in this case. To keep the other cases fast, convert these ones to new      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
707      opcodes. */      these ones to new opcodes. */
708    
709      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
710        {        {
711        dlen = 1;        dlen = 1;
712  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
713        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
714  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
715        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
716        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
717          {          {
718          if (d == OP_ANYBYTE) return PCRE_ERROR_DFA_UITEM;          switch(d)
719          if (d >= OP_NOTPROP)            {
720            codevalue += (d == OP_EXTUNI)? OP_EXTUNI_EXTRA : OP_PROP_EXTRA;            case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
721              case OP_NOTPROP:
722              case OP_PROP: codevalue += OP_PROP_EXTRA; break;
723              case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
724              case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
725              case OP_NOT_HSPACE:
726              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
727              case OP_NOT_VSPACE:
728              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
729              default: break;
730              }
731          }          }
732        }        }
733      else      else
734        {        {
735        dlen = 0;         /* Not strictly necessary, but compilers moan */        dlen = 0;         /* Not strictly necessary, but compilers moan */
736        d = -1;           /* if these variables are not set. */        d = NOTACHAR;     /* if these variables are not set. */
737        }        }
738    
739    
# Line 566  for (;;) Line 741  for (;;)
741    
742      switch (codevalue)      switch (codevalue)
743        {        {
744    /* ========================================================================== */
745          /* These cases are never obeyed. This is a fudge that causes a compile-
746          time error if the vectors coptable or poptable, which are indexed by
747          opcode, are not the correct length. It seems to be the only way to do
748          such a check at compile time, as the sizeof() operator does not work
749          in the C preprocessor. */
750    
751          case OP_TABLE_LENGTH:
752          case OP_TABLE_LENGTH +
753            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
754             (sizeof(poptable) == OP_TABLE_LENGTH)):
755          break;
756    
757  /* ========================================================================== */  /* ========================================================================== */
758        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
759        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
760        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
761          subpattern, because the possessive subpattern repeats are always handled
762          using recursive calls. Thus, it never adds any new states.
763    
764          At the end of the (sub)pattern, unless we have an empty string and
765          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
766          start of the subject, save the match data, shifting up all previous
767        matches so we always have the longest first. */        matches so we always have the longest first. */
768    
769        case OP_KET:        case OP_KET:
770        case OP_KETRMIN:        case OP_KETRMIN:
771        case OP_KETRMAX:        case OP_KETRMAX:
772          case OP_KETRPOS:
773        if (code != end_code)        if (code != end_code)
774          {          {
775          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 584  for (;;) Line 778  for (;;)
778            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
779            }            }
780          }          }
781        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
782          {          {
783          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
784            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
785              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
786          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
787          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
788          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
789            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
790            offsets[0] = current_subject - start_subject;                match_count = 0;
791            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
792            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
793              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
794            }              {
795          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
796            {              offsets[1] = (int)(ptr - start_subject);
797            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
798              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], (char *)current_subject));
799              match_count, rlevel*2-2, SP));              }
800            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
801                {
802                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
803                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
804                  match_count, rlevel*2-2, SP));
805                return match_count;
806                }
807            }            }
808          }          }
809        break;        break;
# Line 615  for (;;) Line 815  for (;;)
815        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
816        case OP_ALT:        case OP_ALT:
817        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
818        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
819        break;        break;
820    
821        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
822        case OP_BRA:        case OP_BRA:
823          case OP_SBRA:
824        do        do
825          {          {
826          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
827          code += GET(code, 1);          code += GET(code, 1);
828          }          }
829        while (*code == OP_ALT);        while (*code == OP_ALT);
830        break;        break;
831    
832        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
833          case OP_CBRA:
834          case OP_SCBRA:
835          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
836          code += GET(code, 1);
837          while (*code == OP_ALT)
838            {
839            ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
840            code += GET(code, 1);
841            }
842          break;
843    
844          /*-----------------------------------------------------------------*/
845        case OP_BRAZERO:        case OP_BRAZERO:
846        case OP_BRAMINZERO:        case OP_BRAMINZERO:
847        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
848        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
849        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
850        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
851        break;        break;
852    
853        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
854        case OP_BRANUMBER:        case OP_SKIPZERO:
855        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);        code += 1 + GET(code, 2);
856          while (*code == OP_ALT) code += GET(code, 1);
857          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
858        break;        break;
859    
860        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
861        case OP_CIRC:        case OP_CIRC:
862        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE))  
863          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
864        break;        break;
865    
866        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
867        case OP_EOD:        case OP_CIRCM:
868        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
869              (ptr != end_subject && WAS_NEWLINE(ptr)))
870            { ADD_ACTIVE(state_offset + 1, 0); }
871        break;        break;
872    
873        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
874        case OP_OPT:        case OP_EOD:
875        ims = code[1];        if (ptr >= end_subject)
876        ADD_ACTIVE(state_offset + 2, 0);          {
877            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
878              could_continue = TRUE;
879            else { ADD_ACTIVE(state_offset + 1, 0); }
880            }
881        break;        break;
882    
883        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 679  for (;;) Line 899  for (;;)
899    
900        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
901        case OP_ANY:        case OP_ANY:
902        if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0))        if (clen > 0 && !IS_NEWLINE(ptr))
903            {
904            if (ptr + 1 >= md->end_subject &&
905                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
906                NLBLOCK->nltype == NLTYPE_FIXED &&
907                NLBLOCK->nllen == 2 &&
908                c == NLBLOCK->nl[0])
909              {
910              could_continue = partial_newline = TRUE;
911              }
912            else
913              {
914              ADD_NEW(state_offset + 1, 0);
915              }
916            }
917          break;
918    
919          /*-----------------------------------------------------------------*/
920          case OP_ALLANY:
921          if (clen > 0)
922          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
923        break;        break;
924    
925        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
926        case OP_EODN:        case OP_EODN:
927        if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
928            could_continue = TRUE;
929          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
930          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
931        break;        break;
932    
# Line 693  for (;;) Line 934  for (;;)
934        case OP_DOLL:        case OP_DOLL:
935        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
936          {          {
937          if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
938                                  (ims & PCRE_MULTILINE) != 0)))            could_continue = TRUE;
939            else if (clen == 0 ||
940                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
941                   (ptr == end_subject - md->nllen)
942                ))
943              { ADD_ACTIVE(state_offset + 1, 0); }
944            else if (ptr + 1 >= md->end_subject &&
945                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
946                     NLBLOCK->nltype == NLTYPE_FIXED &&
947                     NLBLOCK->nllen == 2 &&
948                     c == NLBLOCK->nl[0])
949              {
950              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
951                {
952                reset_could_continue = TRUE;
953                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
954                }
955              else could_continue = partial_newline = TRUE;
956              }
957            }
958          break;
959    
960          /*-----------------------------------------------------------------*/
961          case OP_DOLLM:
962          if ((md->moptions & PCRE_NOTEOL) == 0)
963            {
964            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
965              could_continue = TRUE;
966            else if (clen == 0 ||
967                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
968            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
969            else if (ptr + 1 >= md->end_subject &&
970                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
971                     NLBLOCK->nltype == NLTYPE_FIXED &&
972                     NLBLOCK->nllen == 2 &&
973                     c == NLBLOCK->nl[0])
974              {
975              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
976                {
977                reset_could_continue = TRUE;
978                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
979                }
980              else could_continue = partial_newline = TRUE;
981              }
982          }          }
983        else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0)        else if (IS_NEWLINE(ptr))
984          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
985        break;        break;
986    
# Line 728  for (;;) Line 1011  for (;;)
1011    
1012          if (ptr > start_subject)          if (ptr > start_subject)
1013            {            {
1014            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
1015  #ifdef SUPPORT_UTF8            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1016            if (utf8) BACKCHAR(temp);  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1017              if (utf) { BACKCHAR(temp); }
1018  #endif  #endif
1019            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
1020    #ifdef SUPPORT_UCP
1021              if ((md->poptions & PCRE_UCP) != 0)
1022                {
1023                if (d == '_') left_word = TRUE; else
1024                  {
1025                  int cat = UCD_CATEGORY(d);
1026                  left_word = (cat == ucp_L || cat == ucp_N);
1027                  }
1028                }
1029              else
1030    #endif
1031            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1032            }            }
1033          else left_word = 0;          else left_word = FALSE;
1034    
1035          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
1036            else right_word = 0;            {
1037    #ifdef SUPPORT_UCP
1038              if ((md->poptions & PCRE_UCP) != 0)
1039                {
1040                if (c == '_') right_word = TRUE; else
1041                  {
1042                  int cat = UCD_CATEGORY(c);
1043                  right_word = (cat == ucp_L || cat == ucp_N);
1044                  }
1045                }
1046              else
1047    #endif
1048              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1049              }
1050            else right_word = FALSE;
1051    
1052          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1053            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 746  for (;;) Line 1055  for (;;)
1055        break;        break;
1056    
1057    
 #ifdef SUPPORT_UCP  
   
1058        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1059        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
1060        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
1061        */        */
1062    
1063    #ifdef SUPPORT_UCP
1064        case OP_PROP:        case OP_PROP:
1065        case OP_NOTPROP:        case OP_NOTPROP:
1066        if (clen > 0)        if (clen > 0)
1067          {          {
1068          int rqdtype, category;          BOOL OK;
1069          category = _pcre_ucp_findchar(c, &chartype, &othercase);          const pcre_uint32 *cp;
1070          rqdtype = code[1];          const ucd_record * prop = GET_UCD(c);
1071          if (rqdtype >= 128)          switch(code[1])
           {  
           if ((rqdtype - 128 == category) == (codevalue == OP_PROP))  
             { ADD_NEW(state_offset + 2, 0); }  
           }  
         else  
1072            {            {
1073            if ((rqdtype == chartype) == (codevalue == OP_PROP))            case PT_ANY:
1074              { ADD_NEW(state_offset + 2, 0); }            OK = TRUE;
1075              break;
1076    
1077              case PT_LAMP:
1078              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1079                   prop->chartype == ucp_Lt;
1080              break;
1081    
1082              case PT_GC:
1083              OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1084              break;
1085    
1086              case PT_PC:
1087              OK = prop->chartype == code[2];
1088              break;
1089    
1090              case PT_SC:
1091              OK = prop->script == code[2];
1092              break;
1093    
1094              /* These are specials for combination cases. */
1095    
1096              case PT_ALNUM:
1097              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1098                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1099              break;
1100    
1101              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1102              which means that Perl space and POSIX space are now identical. PCRE
1103              was changed at release 8.34. */
1104    
1105              case PT_SPACE:    /* Perl space */
1106              case PT_PXSPACE:  /* POSIX space */
1107              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1108                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1109                   c == CHAR_FF || c == CHAR_CR;
1110              break;
1111    
1112              case PT_WORD:
1113              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1114                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1115                   c == CHAR_UNDERSCORE;
1116              break;
1117    
1118              case PT_CLIST:
1119              cp = PRIV(ucd_caseless_sets) + code[2];
1120              for (;;)
1121                {
1122                if (c < *cp) { OK = FALSE; break; }
1123                if (c == *cp++) { OK = TRUE; break; }
1124                }
1125              break;
1126    
1127              case PT_UCNC:
1128              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1129                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1130                   c >= 0xe000;
1131              break;
1132    
1133              /* Should never occur, but keep compilers from grumbling. */
1134    
1135              default:
1136              OK = codevalue != OP_PROP;
1137              break;
1138            }            }
1139    
1140            if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1141          }          }
1142        break;        break;
1143  #endif  #endif
# Line 779  for (;;) Line 1147  for (;;)
1147  /* ========================================================================== */  /* ========================================================================== */
1148        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
1149        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
1150        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1151        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1152    
1153        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1154        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
1155          case OP_TYPEPOSPLUS:
1156        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1157        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1158        if (clen > 0)        if (clen > 0)
1159          {          {
1160          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1161                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1162                NLBLOCK->nltype == NLTYPE_FIXED &&
1163                NLBLOCK->nllen == 2 &&
1164                c == NLBLOCK->nl[0])
1165              {
1166              could_continue = partial_newline = TRUE;
1167              }
1168            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1169              (c < 256 &&              (c < 256 &&
1170                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1171                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1172            {            {
1173              if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1174                {
1175                active_count--;            /* Remove non-match possibility */
1176                next_active_state--;
1177                }
1178            count++;            count++;
1179            ADD_NEW(state_offset, count);            ADD_NEW(state_offset, count);
1180            }            }
# Line 802  for (;;) Line 1184  for (;;)
1184        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1185        case OP_TYPEQUERY:        case OP_TYPEQUERY:
1186        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
1187          case OP_TYPEPOSQUERY:
1188        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1189        if (clen > 0)        if (clen > 0)
1190          {          {
1191          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1192                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1193                NLBLOCK->nltype == NLTYPE_FIXED &&
1194                NLBLOCK->nllen == 2 &&
1195                c == NLBLOCK->nl[0])
1196              {
1197              could_continue = partial_newline = TRUE;
1198              }
1199            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1200              (c < 256 &&              (c < 256 &&
1201                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1202                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1203            {            {
1204              if (codevalue == OP_TYPEPOSQUERY)
1205                {
1206                active_count--;            /* Remove non-match possibility */
1207                next_active_state--;
1208                }
1209            ADD_NEW(state_offset + 2, 0);            ADD_NEW(state_offset + 2, 0);
1210            }            }
1211          }          }
# Line 818  for (;;) Line 1214  for (;;)
1214        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1215        case OP_TYPESTAR:        case OP_TYPESTAR:
1216        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
1217          case OP_TYPEPOSSTAR:
1218        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1219        if (clen > 0)        if (clen > 0)
1220          {          {
1221          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1222                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1223                NLBLOCK->nltype == NLTYPE_FIXED &&
1224                NLBLOCK->nllen == 2 &&
1225                c == NLBLOCK->nl[0])
1226              {
1227              could_continue = partial_newline = TRUE;
1228              }
1229            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1230              (c < 256 &&              (c < 256 &&
1231                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1232                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1233            {            {
1234              if (codevalue == OP_TYPEPOSSTAR)
1235                {
1236                active_count--;            /* Remove non-match possibility */
1237                next_active_state--;
1238                }
1239            ADD_NEW(state_offset, 0);            ADD_NEW(state_offset, 0);
1240            }            }
1241          }          }
# Line 833  for (;;) Line 1243  for (;;)
1243    
1244        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1245        case OP_TYPEEXACT:        case OP_TYPEEXACT:
1246          count = current_state->count;  /* Number already matched */
1247          if (clen > 0)
1248            {
1249            if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1250                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1251                NLBLOCK->nltype == NLTYPE_FIXED &&
1252                NLBLOCK->nllen == 2 &&
1253                c == NLBLOCK->nl[0])
1254              {
1255              could_continue = partial_newline = TRUE;
1256              }
1257            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1258                (c < 256 &&
1259                  (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1260                  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1261              {
1262              if (++count >= (int)GET2(code, 1))
1263                { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1264              else
1265                { ADD_NEW(state_offset, count); }
1266              }
1267            }
1268          break;
1269    
1270          /*-----------------------------------------------------------------*/
1271        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1272        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1273        if (codevalue != OP_TYPEEXACT)        case OP_TYPEPOSUPTO:
1274          { ADD_ACTIVE(state_offset + 4, 0); }        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1275        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1276        if (clen > 0)        if (clen > 0)
1277          {          {
1278          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1279                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1280                NLBLOCK->nltype == NLTYPE_FIXED &&
1281                NLBLOCK->nllen == 2 &&
1282                c == NLBLOCK->nl[0])
1283              {
1284              could_continue = partial_newline = TRUE;
1285              }
1286            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1287              (c < 256 &&              (c < 256 &&
1288                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1289                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1290            {            {
1291            if (++count >= GET2(code, 1))            if (codevalue == OP_TYPEPOSUPTO)
1292              { ADD_NEW(state_offset + 4, 0); }              {
1293                active_count--;           /* Remove non-match possibility */
1294                next_active_state--;
1295                }
1296              if (++count >= (int)GET2(code, 1))
1297                { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1298            else            else
1299              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1300            }            }
# Line 855  for (;;) Line 1303  for (;;)
1303    
1304  /* ========================================================================== */  /* ========================================================================== */
1305        /* These are virtual opcodes that are used when something like        /* These are virtual opcodes that are used when something like
1306        OP_TYPEPLUS has OP_PROP, OP_NOTPROP, or OP_EXTUNI as its argument. It        OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1307        keeps the code above fast for the other cases. The argument is in the        argument. It keeps the code above fast for the other cases. The argument
1308        d variable. */        is in the d variable. */
1309    
1310    #ifdef SUPPORT_UCP
1311        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
1312        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1313          case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1314        count = current_state->count;           /* Already matched */        count = current_state->count;           /* Already matched */
1315        if (count > 0) { ADD_ACTIVE(state_offset + 3, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1316        if (clen > 0)        if (clen > 0)
1317          {          {
1318          int category = _pcre_ucp_findchar(c, &chartype, &othercase);          BOOL OK;
1319          int rqdtype = code[2];          const pcre_uint32 *cp;
1320          if ((d == OP_PROP) ==          const ucd_record * prop = GET_UCD(c);
1321              (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))          switch(code[2])
1322            { count++; ADD_NEW(state_offset, count); }            {
1323              case PT_ANY:
1324              OK = TRUE;
1325              break;
1326    
1327              case PT_LAMP:
1328              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1329                prop->chartype == ucp_Lt;
1330              break;
1331    
1332              case PT_GC:
1333              OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1334              break;
1335    
1336              case PT_PC:
1337              OK = prop->chartype == code[3];
1338              break;
1339    
1340              case PT_SC:
1341              OK = prop->script == code[3];
1342              break;
1343    
1344              /* These are specials for combination cases. */
1345    
1346              case PT_ALNUM:
1347              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1348                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1349              break;
1350    
1351              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1352              which means that Perl space and POSIX space are now identical. PCRE
1353              was changed at release 8.34. */
1354    
1355              case PT_SPACE:    /* Perl space */
1356              case PT_PXSPACE:  /* POSIX space */
1357              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1358                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1359                   c == CHAR_FF || c == CHAR_CR;
1360              break;
1361    
1362              case PT_WORD:
1363              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1364                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1365                   c == CHAR_UNDERSCORE;
1366              break;
1367    
1368              case PT_CLIST:
1369              cp = PRIV(ucd_caseless_sets) + code[3];
1370              for (;;)
1371                {
1372                if (c < *cp) { OK = FALSE; break; }
1373                if (c == *cp++) { OK = TRUE; break; }
1374                }
1375              break;
1376    
1377              case PT_UCNC:
1378              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1379                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1380                   c >= 0xe000;
1381              break;
1382    
1383              /* Should never occur, but keep compilers from grumbling. */
1384    
1385              default:
1386              OK = codevalue != OP_PROP;
1387              break;
1388              }
1389    
1390            if (OK == (d == OP_PROP))
1391              {
1392              if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1393                {
1394                active_count--;           /* Remove non-match possibility */
1395                next_active_state--;
1396                }
1397              count++;
1398              ADD_NEW(state_offset, count);
1399              }
1400          }          }
1401        break;        break;
1402    
1403        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1404        case OP_EXTUNI_EXTRA + OP_TYPEPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1405        case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1406          case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1407        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1408        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1409        if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)        if (clen > 0)
1410          {          {
1411          const uschar *nptr = ptr + clen;          int lgb, rgb;
1412            const pcre_uchar *nptr = ptr + clen;
1413          int ncount = 0;          int ncount = 0;
1414            if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1415              {
1416              active_count--;           /* Remove non-match possibility */
1417              next_active_state--;
1418              }
1419            lgb = UCD_GRAPHBREAK(c);
1420          while (nptr < end_subject)          while (nptr < end_subject)
1421            {            {
1422            int nd;            dlen = 1;
1423            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1424            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1425            if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1426            ncount++;            ncount++;
1427            nptr += ndlen;            lgb = rgb;
1428              nptr += dlen;
1429            }            }
1430          count++;          count++;
1431          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1432          }          }
1433        break;        break;
1434    #endif
1435    
1436        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1437        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1438        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1439        count = 3;        case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1440        goto QS1;        count = current_state->count;  /* Already matched */
1441          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
       case OP_PROP_EXTRA + OP_TYPESTAR:  
       case OP_PROP_EXTRA + OP_TYPEMINSTAR:  
       count = 0;  
   
       QS1:  
   
       ADD_ACTIVE(state_offset + 3, 0);  
1442        if (clen > 0)        if (clen > 0)
1443          {          {
         int category = _pcre_ucp_findchar(c, &chartype, &othercase);  
         int rqdtype = code[2];  
         if ((d == OP_PROP) ==  
             (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))  
           { ADD_NEW(state_offset + count, 0); }  
         }  
       break;  
   
       /*-----------------------------------------------------------------*/  
       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:  
       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:  
       count = 2;  
       goto QS2;  
   
       case OP_EXTUNI_EXTRA + OP_TYPESTAR:  
       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:  
       count = 0;  
   
       QS2:  
   
       ADD_ACTIVE(state_offset + 2, 0);  
       if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)  
         {  
         const uschar *nptr = ptr + clen;  
1444          int ncount = 0;          int ncount = 0;
1445          while (nptr < end_subject)          switch (c)
1446            {            {
1447            int nd;            case CHAR_VT:
1448            int ndlen = 1;            case CHAR_FF:
1449            GETCHARLEN(nd, nptr, ndlen);            case CHAR_NEL:
1450            if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;  #ifndef EBCDIC
1451            ncount++;            case 0x2028:
1452            nptr += ndlen;            case 0x2029:
1453    #endif  /* Not EBCDIC */
1454              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1455              goto ANYNL01;
1456    
1457              case CHAR_CR:
1458              if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1459              /* Fall through */
1460    
1461              ANYNL01:
1462              case CHAR_LF:
1463              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1464                {
1465                active_count--;           /* Remove non-match possibility */
1466                next_active_state--;
1467                }
1468              count++;
1469              ADD_NEW_DATA(-state_offset, count, ncount);
1470              break;
1471    
1472              default:
1473              break;
1474            }            }
         ADD_NEW_DATA(-(state_offset + count), 0, ncount);  
1475          }          }
1476        break;        break;
1477    
1478        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1479        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1480        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1481        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1482          count = current_state->count;  /* Already matched */
1483          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1484          if (clen > 0)
1485            {
1486            BOOL OK;
1487            switch (c)
1488              {
1489              VSPACE_CASES:
1490              OK = TRUE;
1491              break;
1492    
1493              default:
1494              OK = FALSE;
1495              break;
1496              }
1497    
1498            if (OK == (d == OP_VSPACE))
1499              {
1500              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1501                {
1502                active_count--;           /* Remove non-match possibility */
1503                next_active_state--;
1504                }
1505              count++;
1506              ADD_NEW_DATA(-state_offset, count, 0);
1507              }
1508            }
1509          break;
1510    
1511          /*-----------------------------------------------------------------*/
1512          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1513          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1514          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1515          count = current_state->count;  /* Already matched */
1516          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1517          if (clen > 0)
1518            {
1519            BOOL OK;
1520            switch (c)
1521              {
1522              HSPACE_CASES:
1523              OK = TRUE;
1524              break;
1525    
1526              default:
1527              OK = FALSE;
1528              break;
1529              }
1530    
1531            if (OK == (d == OP_HSPACE))
1532              {
1533              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1534                {
1535                active_count--;           /* Remove non-match possibility */
1536                next_active_state--;
1537                }
1538              count++;
1539              ADD_NEW_DATA(-state_offset, count, 0);
1540              }
1541            }
1542          break;
1543    
1544          /*-----------------------------------------------------------------*/
1545    #ifdef SUPPORT_UCP
1546          case OP_PROP_EXTRA + OP_TYPEQUERY:
1547          case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1548          case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1549          count = 4;
1550          goto QS1;
1551    
1552          case OP_PROP_EXTRA + OP_TYPESTAR:
1553          case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1554          case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1555          count = 0;
1556    
1557          QS1:
1558    
1559          ADD_ACTIVE(state_offset + 4, 0);
1560          if (clen > 0)
1561            {
1562            BOOL OK;
1563            const pcre_uint32 *cp;
1564            const ucd_record * prop = GET_UCD(c);
1565            switch(code[2])
1566              {
1567              case PT_ANY:
1568              OK = TRUE;
1569              break;
1570    
1571              case PT_LAMP:
1572              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1573                prop->chartype == ucp_Lt;
1574              break;
1575    
1576              case PT_GC:
1577              OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1578              break;
1579    
1580              case PT_PC:
1581              OK = prop->chartype == code[3];
1582              break;
1583    
1584              case PT_SC:
1585              OK = prop->script == code[3];
1586              break;
1587    
1588              /* These are specials for combination cases. */
1589    
1590              case PT_ALNUM:
1591              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1592                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1593              break;
1594    
1595              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1596              which means that Perl space and POSIX space are now identical. PCRE
1597              was changed at release 8.34. */
1598    
1599              case PT_SPACE:    /* Perl space */
1600              case PT_PXSPACE:  /* POSIX space */
1601              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1602                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1603                   c == CHAR_FF || c == CHAR_CR;
1604              break;
1605    
1606              case PT_WORD:
1607              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1608                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1609                   c == CHAR_UNDERSCORE;
1610              break;
1611    
1612              case PT_CLIST:
1613              cp = PRIV(ucd_caseless_sets) + code[3];
1614              for (;;)
1615                {
1616                if (c < *cp) { OK = FALSE; break; }
1617                if (c == *cp++) { OK = TRUE; break; }
1618                }
1619              break;
1620    
1621              case PT_UCNC:
1622              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1623                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1624                   c >= 0xe000;
1625              break;
1626    
1627              /* Should never occur, but keep compilers from grumbling. */
1628    
1629              default:
1630              OK = codevalue != OP_PROP;
1631              break;
1632              }
1633    
1634            if (OK == (d == OP_PROP))
1635              {
1636              if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1637                  codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1638                {
1639                active_count--;           /* Remove non-match possibility */
1640                next_active_state--;
1641                }
1642              ADD_NEW(state_offset + count, 0);
1643              }
1644            }
1645          break;
1646    
1647          /*-----------------------------------------------------------------*/
1648          case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1649          case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1650          case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1651          count = 2;
1652          goto QS2;
1653    
1654          case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1655          case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1656          case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1657          count = 0;
1658    
1659          QS2:
1660    
1661          ADD_ACTIVE(state_offset + 2, 0);
1662          if (clen > 0)
1663            {
1664            int lgb, rgb;
1665            const pcre_uchar *nptr = ptr + clen;
1666            int ncount = 0;
1667            if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1668                codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1669              {
1670              active_count--;           /* Remove non-match possibility */
1671              next_active_state--;
1672              }
1673            lgb = UCD_GRAPHBREAK(c);
1674            while (nptr < end_subject)
1675              {
1676              dlen = 1;
1677              if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1678              rgb = UCD_GRAPHBREAK(d);
1679              if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1680              ncount++;
1681              lgb = rgb;
1682              nptr += dlen;
1683              }
1684            ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1685            }
1686          break;
1687    #endif
1688    
1689          /*-----------------------------------------------------------------*/
1690          case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1691          case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1692          case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1693          count = 2;
1694          goto QS3;
1695    
1696          case OP_ANYNL_EXTRA + OP_TYPESTAR:
1697          case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1698          case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1699          count = 0;
1700    
1701          QS3:
1702          ADD_ACTIVE(state_offset + 2, 0);
1703          if (clen > 0)
1704            {
1705            int ncount = 0;
1706            switch (c)
1707              {
1708              case CHAR_VT:
1709              case CHAR_FF:
1710              case CHAR_NEL:
1711    #ifndef EBCDIC
1712              case 0x2028:
1713              case 0x2029:
1714    #endif  /* Not EBCDIC */
1715              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1716              goto ANYNL02;
1717    
1718              case CHAR_CR:
1719              if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1720              /* Fall through */
1721    
1722              ANYNL02:
1723              case CHAR_LF:
1724              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1725                  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1726                {
1727                active_count--;           /* Remove non-match possibility */
1728                next_active_state--;
1729                }
1730              ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1731              break;
1732    
1733              default:
1734              break;
1735              }
1736            }
1737          break;
1738    
1739          /*-----------------------------------------------------------------*/
1740          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1741          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1742          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1743          count = 2;
1744          goto QS4;
1745    
1746          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1747          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1748          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1749          count = 0;
1750    
1751          QS4:
1752          ADD_ACTIVE(state_offset + 2, 0);
1753          if (clen > 0)
1754            {
1755            BOOL OK;
1756            switch (c)
1757              {
1758              VSPACE_CASES:
1759              OK = TRUE;
1760              break;
1761    
1762              default:
1763              OK = FALSE;
1764              break;
1765              }
1766            if (OK == (d == OP_VSPACE))
1767              {
1768              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1769                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1770                {
1771                active_count--;           /* Remove non-match possibility */
1772                next_active_state--;
1773                }
1774              ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1775              }
1776            }
1777          break;
1778    
1779          /*-----------------------------------------------------------------*/
1780          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1781          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1782          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1783          count = 2;
1784          goto QS5;
1785    
1786          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1787          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1788          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1789          count = 0;
1790    
1791          QS5:
1792          ADD_ACTIVE(state_offset + 2, 0);
1793          if (clen > 0)
1794            {
1795            BOOL OK;
1796            switch (c)
1797              {
1798              HSPACE_CASES:
1799              OK = TRUE;
1800              break;
1801    
1802              default:
1803              OK = FALSE;
1804              break;
1805              }
1806    
1807            if (OK == (d == OP_HSPACE))
1808              {
1809              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1810                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1811                {
1812                active_count--;           /* Remove non-match possibility */
1813                next_active_state--;
1814                }
1815              ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1816              }
1817            }
1818          break;
1819    
1820          /*-----------------------------------------------------------------*/
1821    #ifdef SUPPORT_UCP
1822          case OP_PROP_EXTRA + OP_TYPEEXACT:
1823          case OP_PROP_EXTRA + OP_TYPEUPTO:
1824          case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1825          case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1826        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1827          { ADD_ACTIVE(state_offset + 5, 0); }          { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1828        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1829        if (clen > 0)        if (clen > 0)
1830          {          {
1831          int category = _pcre_ucp_findchar(c, &chartype, &othercase);          BOOL OK;
1832          int rqdtype = code[4];          const pcre_uint32 *cp;
1833          if ((d == OP_PROP) ==          const ucd_record * prop = GET_UCD(c);
1834              (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))          switch(code[1 + IMM2_SIZE + 1])
1835            {            {
1836            if (++count >= GET2(code, 1))            case PT_ANY:
1837              { ADD_NEW(state_offset + 5, 0); }            OK = TRUE;
1838              break;
1839    
1840              case PT_LAMP:
1841              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1842                prop->chartype == ucp_Lt;
1843              break;
1844    
1845              case PT_GC:
1846              OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1847              break;
1848    
1849              case PT_PC:
1850              OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1851              break;
1852    
1853              case PT_SC:
1854              OK = prop->script == code[1 + IMM2_SIZE + 2];
1855              break;
1856    
1857              /* These are specials for combination cases. */
1858    
1859              case PT_ALNUM:
1860              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1861                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1862              break;
1863    
1864              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1865              which means that Perl space and POSIX space are now identical. PCRE
1866              was changed at release 8.34. */
1867    
1868              case PT_SPACE:    /* Perl space */
1869              case PT_PXSPACE:  /* POSIX space */
1870              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1871                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1872                   c == CHAR_FF || c == CHAR_CR;
1873              break;
1874    
1875              case PT_WORD:
1876              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1877                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1878                   c == CHAR_UNDERSCORE;
1879              break;
1880    
1881              case PT_CLIST:
1882              cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1883              for (;;)
1884                {
1885                if (c < *cp) { OK = FALSE; break; }
1886                if (c == *cp++) { OK = TRUE; break; }
1887                }
1888              break;
1889    
1890              case PT_UCNC:
1891              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1892                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1893                   c >= 0xe000;
1894              break;
1895    
1896              /* Should never occur, but keep compilers from grumbling. */
1897    
1898              default:
1899              OK = codevalue != OP_PROP;
1900              break;
1901              }
1902    
1903            if (OK == (d == OP_PROP))
1904              {
1905              if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1906                {
1907                active_count--;           /* Remove non-match possibility */
1908                next_active_state--;
1909                }
1910              if (++count >= (int)GET2(code, 1))
1911                { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1912            else            else
1913              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1914            }            }
# Line 975  for (;;) Line 1919  for (;;)
1919        case OP_EXTUNI_EXTRA + OP_TYPEEXACT:        case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1920        case OP_EXTUNI_EXTRA + OP_TYPEUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1921        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1922          case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1923        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1924          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1925        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1926        if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)        if (clen > 0)
1927          {          {
1928          const uschar *nptr = ptr + clen;          int lgb, rgb;
1929            const pcre_uchar *nptr = ptr + clen;
1930          int ncount = 0;          int ncount = 0;
1931            if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1932              {
1933              active_count--;           /* Remove non-match possibility */
1934              next_active_state--;
1935              }
1936            lgb = UCD_GRAPHBREAK(c);
1937          while (nptr < end_subject)          while (nptr < end_subject)
1938            {            {
1939            int nd;            dlen = 1;
1940            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1941            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1942            if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1943            ncount++;            ncount++;
1944            nptr += ndlen;            lgb = rgb;
1945              nptr += dlen;
1946            }            }
1947          if (++count >= GET2(code, 1))          if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1948            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              reset_could_continue = TRUE;
1949            if (++count >= (int)GET2(code, 1))
1950              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1951          else          else
1952            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1953          }          }
1954        break;        break;
1955    #endif
1956    
1957          /*-----------------------------------------------------------------*/
1958          case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1959          case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1960          case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1961          case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1962          if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1963            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1964          count = current_state->count;  /* Number already matched */
1965          if (clen > 0)
1966            {
1967            int ncount = 0;
1968            switch (c)
1969              {
1970              case CHAR_VT:
1971              case CHAR_FF:
1972              case CHAR_NEL:
1973    #ifndef EBCDIC
1974              case 0x2028:
1975              case 0x2029:
1976    #endif  /* Not EBCDIC */
1977              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1978              goto ANYNL03;
1979    
1980              case CHAR_CR:
1981              if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1982              /* Fall through */
1983    
1984              ANYNL03:
1985              case CHAR_LF:
1986              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1987                {
1988                active_count--;           /* Remove non-match possibility */
1989                next_active_state--;
1990                }
1991              if (++count >= (int)GET2(code, 1))
1992                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1993              else
1994                { ADD_NEW_DATA(-state_offset, count, ncount); }
1995              break;
1996    
1997              default:
1998              break;
1999              }
2000            }
2001          break;
2002    
2003          /*-----------------------------------------------------------------*/
2004          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2005          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2006          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2007          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2008          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2009            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2010          count = current_state->count;  /* Number already matched */
2011          if (clen > 0)
2012            {
2013            BOOL OK;
2014            switch (c)
2015              {
2016              VSPACE_CASES:
2017              OK = TRUE;
2018              break;
2019    
2020              default:
2021              OK = FALSE;
2022              }
2023    
2024            if (OK == (d == OP_VSPACE))
2025              {
2026              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2027                {
2028                active_count--;           /* Remove non-match possibility */
2029                next_active_state--;
2030                }
2031              if (++count >= (int)GET2(code, 1))
2032                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2033              else
2034                { ADD_NEW_DATA(-state_offset, count, 0); }
2035              }
2036            }
2037          break;
2038    
2039          /*-----------------------------------------------------------------*/
2040          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2041          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2042          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2043          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2044          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2045            { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2046          count = current_state->count;  /* Number already matched */
2047          if (clen > 0)
2048            {
2049            BOOL OK;
2050            switch (c)
2051              {
2052              HSPACE_CASES:
2053              OK = TRUE;
2054              break;
2055    
2056              default:
2057              OK = FALSE;
2058              break;
2059              }
2060    
2061            if (OK == (d == OP_HSPACE))
2062              {
2063              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2064                {
2065                active_count--;           /* Remove non-match possibility */
2066                next_active_state--;
2067                }
2068              if (++count >= (int)GET2(code, 1))
2069                { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2070              else
2071                { ADD_NEW_DATA(-state_offset, count, 0); }
2072              }
2073            }
2074          break;
2075    
2076  /* ========================================================================== */  /* ========================================================================== */
2077        /* These opcodes are followed by a character that is usually compared        /* These opcodes are followed by a character that is usually compared
# Line 1010  for (;;) Line 2085  for (;;)
2085        break;        break;
2086    
2087        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2088        case OP_CHARNC:        case OP_CHARI:
2089        if (clen == 0) break;        if (clen == 0) break;
2090    
2091  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2092        if (utf8)        if (utf)
2093          {          {
2094          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2095            {            {
2096            if (c < 128) othercase = fcc[c]; else            unsigned int othercase;
2097              if (c < 128)
2098            /* If we have Unicode property support, we can use it to test the              othercase = fcc[c];
2099            other case of the character, if there is one. The result of            else
2100            _pcre_ucp_findchar() is < 0 if the char isn't found, and othercase is              /* If we have Unicode property support, we can use it to test the
2101            returned as zero if there isn't another case. */              other case of the character. */
   
2102  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2103            if (_pcre_ucp_findchar(c, &chartype, &othercase) < 0)              othercase = UCD_OTHERCASE(c);
2104    #else
2105                othercase = NOTACHAR;
2106  #endif  #endif
             othercase = -1;  
2107    
2108            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2109            }            }
2110          }          }
2111        else        else
2112  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2113          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2114          {          {
2115          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2116              { ADD_NEW(state_offset + 2, 0); }
2117          }          }
2118        break;        break;
2119    
# Line 1049  for (;;) Line 2124  for (;;)
2124        Find out how many characters to skip, and then set up a negative state        Find out how many characters to skip, and then set up a negative state
2125        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2126    
2127        case OP_EXTUNI:        case OP_EXTUNI:
2128        if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)        if (clen > 0)
2129            {
2130            int lgb, rgb;
2131            const pcre_uchar *nptr = ptr + clen;
2132            int ncount = 0;
2133            lgb = UCD_GRAPHBREAK(c);
2134            while (nptr < end_subject)
2135              {
2136              dlen = 1;
2137              if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2138              rgb = UCD_GRAPHBREAK(d);
2139              if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2140              ncount++;
2141              lgb = rgb;
2142              nptr += dlen;
2143              }
2144            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2145                reset_could_continue = TRUE;
2146            ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2147            }
2148          break;
2149    #endif
2150    
2151          /*-----------------------------------------------------------------*/
2152          /* This is tricky, like EXTUNI, because it too can match more than one
2153          character (when CR is followed by LF). In this case, set up a negative
2154          state to wait for one character to pass before continuing. */
2155    
2156          case OP_ANYNL:
2157          if (clen > 0) switch(c)
2158            {
2159            case CHAR_VT:
2160            case CHAR_FF:
2161            case CHAR_NEL:
2162    #ifndef EBCDIC
2163            case 0x2028:
2164            case 0x2029:
2165    #endif  /* Not EBCDIC */
2166            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2167    
2168            case CHAR_LF:
2169            ADD_NEW(state_offset + 1, 0);
2170            break;
2171    
2172            case CHAR_CR:
2173            if (ptr + 1 >= end_subject)
2174              {
2175              ADD_NEW(state_offset + 1, 0);
2176              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2177                reset_could_continue = TRUE;
2178              }
2179            else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2180              {
2181              ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2182              }
2183            else
2184              {
2185              ADD_NEW(state_offset + 1, 0);
2186              }
2187            break;
2188            }
2189          break;
2190    
2191          /*-----------------------------------------------------------------*/
2192          case OP_NOT_VSPACE:
2193          if (clen > 0) switch(c)
2194            {
2195            VSPACE_CASES:
2196            break;
2197    
2198            default:
2199            ADD_NEW(state_offset + 1, 0);
2200            break;
2201            }
2202          break;
2203    
2204          /*-----------------------------------------------------------------*/
2205          case OP_VSPACE:
2206          if (clen > 0) switch(c)
2207            {
2208            VSPACE_CASES:
2209            ADD_NEW(state_offset + 1, 0);
2210            break;
2211    
2212            default:
2213            break;
2214            }
2215          break;
2216    
2217          /*-----------------------------------------------------------------*/
2218          case OP_NOT_HSPACE:
2219          if (clen > 0) switch(c)
2220            {
2221            HSPACE_CASES:
2222            break;
2223    
2224            default:
2225            ADD_NEW(state_offset + 1, 0);
2226            break;
2227            }
2228          break;
2229    
2230          /*-----------------------------------------------------------------*/
2231          case OP_HSPACE:
2232          if (clen > 0) switch(c)
2233          {          {
2234          const uschar *nptr = ptr + clen;          HSPACE_CASES:
2235          int ncount = 0;          ADD_NEW(state_offset + 1, 0);
2236          while (nptr < end_subject)          break;
2237            {  
2238            int nclen = 1;          default:
2239            GETCHARLEN(c, nptr, nclen);          break;
           if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) break;  
           ncount++;  
           nptr += nclen;  
           }  
         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);  
2240          }          }
2241        break;        break;
 #endif  
2242    
2243        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2244        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character casefully. */
       characters, that is, we know that d < 256. The character we are  
       checking (c) can be multibyte. */  
2245    
2246        case OP_NOT:        case OP_NOT:
2247          if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2248          break;
2249    
2250          /*-----------------------------------------------------------------*/
2251          /* Match a negated single character caselessly. */
2252    
2253          case OP_NOTI:
2254        if (clen > 0)        if (clen > 0)
2255          {          {
2256          int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;          unsigned int otherd;
2257          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }  #ifdef SUPPORT_UTF
2258            if (utf && d >= 128)
2259              {
2260    #ifdef SUPPORT_UCP
2261              otherd = UCD_OTHERCASE(d);
2262    #endif  /* SUPPORT_UCP */
2263              }
2264            else
2265    #endif  /* SUPPORT_UTF */
2266            otherd = TABLE_GET(d, fcc, d);
2267            if (c != d && c != otherd)
2268              { ADD_NEW(state_offset + dlen + 1, 0); }
2269          }          }
2270        break;        break;
2271    
2272        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2273          case OP_PLUSI:
2274          case OP_MINPLUSI:
2275          case OP_POSPLUSI:
2276          case OP_NOTPLUSI:
2277          case OP_NOTMINPLUSI:
2278          case OP_NOTPOSPLUSI:
2279          caseless = TRUE;
2280          codevalue -= OP_STARI - OP_STAR;
2281    
2282          /* Fall through */
2283        case OP_PLUS:        case OP_PLUS:
2284        case OP_MINPLUS:        case OP_MINPLUS:
2285          case OP_POSPLUS:
2286        case OP_NOTPLUS:        case OP_NOTPLUS:
2287        case OP_NOTMINPLUS:        case OP_NOTMINPLUS:
2288          case OP_NOTPOSPLUS:
2289        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
2290        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2291        if (clen > 0)        if (clen > 0)
2292          {          {
2293          int otherd = -1;          pcre_uint32 otherd = NOTACHAR;
2294          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2295            {            {
2296  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2297            if (utf8 && c >= 128)            if (utf && d >= 128)
2298              {              {
2299  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2300              if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;              otherd = UCD_OTHERCASE(d);
2301  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2302              }              }
2303            else            else
2304  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2305            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2306            }            }
2307          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2308            { count++; ADD_NEW(state_offset, count); }            {
2309              if (count > 0 &&
2310                  (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2311                {
2312                active_count--;             /* Remove non-match possibility */
2313                next_active_state--;
2314                }
2315              count++;
2316              ADD_NEW(state_offset, count);
2317              }
2318          }          }
2319        break;        break;
2320    
2321        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2322          case OP_QUERYI:
2323          case OP_MINQUERYI:
2324          case OP_POSQUERYI:
2325          case OP_NOTQUERYI:
2326          case OP_NOTMINQUERYI:
2327          case OP_NOTPOSQUERYI:
2328          caseless = TRUE;
2329          codevalue -= OP_STARI - OP_STAR;
2330          /* Fall through */
2331        case OP_QUERY:        case OP_QUERY:
2332        case OP_MINQUERY:        case OP_MINQUERY:
2333          case OP_POSQUERY:
2334        case OP_NOTQUERY:        case OP_NOTQUERY:
2335        case OP_NOTMINQUERY:        case OP_NOTMINQUERY:
2336          case OP_NOTPOSQUERY:
2337        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2338        if (clen > 0)        if (clen > 0)
2339          {          {
2340          int otherd = -1;          pcre_uint32 otherd = NOTACHAR;
2341          if ((ims && PCRE_CASELESS) != 0)          if (caseless)
2342            {            {
2343  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2344            if (utf8 && c >= 128)            if (utf && d >= 128)
2345              {              {
2346  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2347              if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;              otherd = UCD_OTHERCASE(d);
2348  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2349              }              }
2350            else            else
2351  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2352            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2353            }            }
2354          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2355            { ADD_NEW(state_offset + dlen + 1, 0); }            {
2356              if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2357                {
2358                active_count--;            /* Remove non-match possibility */
2359                next_active_state--;
2360                }
2361              ADD_NEW(state_offset + dlen + 1, 0);
2362              }
2363          }          }
2364        break;        break;
2365    
2366        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2367          case OP_STARI:
2368          case OP_MINSTARI:
2369          case OP_POSSTARI:
2370          case OP_NOTSTARI:
2371          case OP_NOTMINSTARI:
2372          case OP_NOTPOSSTARI:
2373          caseless = TRUE;
2374          codevalue -= OP_STARI - OP_STAR;
2375          /* Fall through */
2376        case OP_STAR:        case OP_STAR:
2377        case OP_MINSTAR:        case OP_MINSTAR:
2378          case OP_POSSTAR:
2379        case OP_NOTSTAR:        case OP_NOTSTAR:
2380        case OP_NOTMINSTAR:        case OP_NOTMINSTAR:
2381          case OP_NOTPOSSTAR:
2382        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2383        if (clen > 0)        if (clen > 0)
2384          {          {
2385          int otherd = -1;          pcre_uint32 otherd = NOTACHAR;
2386          if ((ims && PCRE_CASELESS) != 0)          if (caseless)
2387            {            {
2388  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2389            if (utf8 && c >= 128)            if (utf && d >= 128)
2390              {              {
2391  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2392              if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;              otherd = UCD_OTHERCASE(d);
2393  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2394              }              }
2395            else            else
2396  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2397            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2398            }            }
2399          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2400            { ADD_NEW(state_offset, 0); }            {
2401              if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2402                {
2403                active_count--;            /* Remove non-match possibility */
2404                next_active_state--;
2405                }
2406              ADD_NEW(state_offset, 0);
2407              }
2408          }          }
2409        break;        break;
2410    
2411        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2412          case OP_EXACTI:
2413          case OP_NOTEXACTI:
2414          caseless = TRUE;
2415          codevalue -= OP_STARI - OP_STAR;
2416          /* Fall through */
2417        case OP_EXACT:        case OP_EXACT:
2418          case OP_NOTEXACT:
2419          count = current_state->count;  /* Number already matched */
2420          if (clen > 0)
2421            {
2422            pcre_uint32 otherd = NOTACHAR;
2423            if (caseless)
2424              {
2425    #ifdef SUPPORT_UTF
2426              if (utf && d >= 128)
2427                {
2428    #ifdef SUPPORT_UCP
2429                otherd = UCD_OTHERCASE(d);
2430    #endif  /* SUPPORT_UCP */
2431                }
2432              else
2433    #endif  /* SUPPORT_UTF */
2434              otherd = TABLE_GET(d, fcc, d);
2435              }
2436            if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2437              {
2438              if (++count >= (int)GET2(code, 1))
2439                { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2440              else
2441                { ADD_NEW(state_offset, count); }
2442              }
2443            }
2444          break;
2445    
2446          /*-----------------------------------------------------------------*/
2447          case OP_UPTOI:
2448          case OP_MINUPTOI:
2449          case OP_POSUPTOI:
2450          case OP_NOTUPTOI:
2451          case OP_NOTMINUPTOI:
2452          case OP_NOTPOSUPTOI:
2453          caseless = TRUE;
2454          codevalue -= OP_STARI - OP_STAR;
2455          /* Fall through */
2456        case OP_UPTO:        case OP_UPTO:
2457        case OP_MINUPTO:        case OP_MINUPTO:
2458        case OP_NOTEXACT:        case OP_POSUPTO:
2459        case OP_NOTUPTO:        case OP_NOTUPTO:
2460        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2461        if (codevalue != OP_EXACT && codevalue != OP_NOTEXACT)        case OP_NOTPOSUPTO:
2462          { ADD_ACTIVE(state_offset + dlen + 3, 0); }        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2463        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2464        if (clen > 0)        if (clen > 0)
2465          {          {
2466          int otherd = -1;          pcre_uint32 otherd = NOTACHAR;
2467          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2468            {            {
2469  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2470            if (utf8 && c >= 128)            if (utf && d >= 128)
2471              {              {
2472  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2473              if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;              otherd = UCD_OTHERCASE(d);
2474  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2475              }              }
2476            else            else
2477  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2478            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2479            }            }
2480          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2481            {            {
2482            if (++count >= GET2(code, 1))            if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2483              { ADD_NEW(state_offset + dlen + 3, 0); }              {
2484                active_count--;             /* Remove non-match possibility */
2485                next_active_state--;
2486                }
2487              if (++count >= (int)GET2(code, 1))
2488                { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2489            else            else
2490              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2491            }            }
# Line 1208  for (;;) Line 2502  for (;;)
2502          {          {
2503          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2504          int next_state_offset;          int next_state_offset;
2505          const uschar *ecode;          const pcre_uchar *ecode;
2506    
2507          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2508          can set isinclass from it. */          can set isinclass from it. */
2509    
2510          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2511            {            {
2512            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2513            if (clen > 0)            if (clen > 0)
2514              {              {
2515              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2516                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2517              }              }
2518            }            }
2519    
# Line 1230  for (;;) Line 2524  for (;;)
2524          else          else
2525           {           {
2526           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2527           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2528           }           }
2529    
2530          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2531          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2532          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2533    
2534          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2535    
2536          switch (*ecode)          switch (*ecode)
2537            {            {
# Line 1263  for (;;) Line 2557  for (;;)
2557            case OP_CRRANGE:            case OP_CRRANGE:
2558            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2559            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2560            if (count >= GET2(ecode, 1))            if (count >= (int)GET2(ecode, 1))
2561              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2562            if (isinclass)            if (isinclass)
2563              {              {
2564              if (++count >= GET2(ecode, 3))              int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2565                { ADD_NEW(next_state_offset + 5, 0); }              if (++count >= max && max != 0)   /* Max 0 => no limit */
2566                  { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2567              else              else
2568                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2569              }              }
# Line 1283  for (;;) Line 2578  for (;;)
2578    
2579  /* ========================================================================== */  /* ========================================================================== */
2580        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2581        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2582          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2583          though the other "backtracking verbs" are not supported. */
2584    
2585          case OP_FAIL:
2586          forced_fail++;    /* Count FAILs for multiple states */
2587          break;
2588    
2589        case OP_ASSERT:        case OP_ASSERT:
2590        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1293  for (;;) Line 2594  for (;;)
2594          int rc;          int rc;
2595          int local_offsets[2];          int local_offsets[2];
2596          int local_workspace[1000];          int local_workspace[1000];
2597          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2598    
2599          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2600    
# Line 1301  for (;;) Line 2602  for (;;)
2602            md,                                   /* static match data */            md,                                   /* static match data */
2603            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2604            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2605            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2606            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2607            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2608            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2609            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2610            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2611    
2612            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2613          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2614              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2615          }          }
2616        break;        break;
2617    
2618        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2619        case OP_COND:        case OP_COND:
2620          case OP_SCOND:
2621          {          {
2622          int local_offsets[1000];          int local_offsets[1000];
2623          int local_workspace[1000];          int local_workspace[1000];
2624          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2625            int condcode;
2626    
2627          /* The only supported version of OP_CREF is for the value 0xffff, which          /* Because of the way auto-callout works during compile, a callout item
2628          means "test if in a recursion". */          is inserted between OP_COND and an assertion condition. This does not
2629            happen for the other conditions. */
2630    
2631          if (condcode == OP_CREF)          if (code[LINK_SIZE+1] == OP_CALLOUT)
2632            {            {
2633            int value = GET2(code, LINK_SIZE+2);            rrc = 0;
2634            if (value != 0xffff) return PCRE_ERROR_DFA_UCOND;            if (PUBL(callout) != NULL)
2635            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }              {
2636              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              PUBL(callout_block) cb;
2637                cb.version          = 1;   /* Version 1 of the callout block */
2638                cb.callout_number   = code[LINK_SIZE+2];
2639                cb.offset_vector    = offsets;
2640    #if defined COMPILE_PCRE8
2641                cb.subject          = (PCRE_SPTR)start_subject;
2642    #elif defined COMPILE_PCRE16
2643                cb.subject          = (PCRE_SPTR16)start_subject;
2644    #elif defined COMPILE_PCRE32
2645                cb.subject          = (PCRE_SPTR32)start_subject;
2646    #endif
2647                cb.subject_length   = (int)(end_subject - start_subject);
2648                cb.start_match      = (int)(current_subject - start_subject);
2649                cb.current_position = (int)(ptr - start_subject);
2650                cb.pattern_position = GET(code, LINK_SIZE + 3);
2651                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2652                cb.capture_top      = 1;
2653                cb.capture_last     = -1;
2654                cb.callout_data     = md->callout_data;
2655                cb.mark             = NULL;   /* No (*MARK) support */
2656                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2657                }
2658              if (rrc > 0) break;                      /* Fail this thread */
2659              code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2660              }
2661    
2662            condcode = code[LINK_SIZE+1];
2663    
2664            /* Back reference conditions are not supported */
2665    
2666            if (condcode == OP_CREF || condcode == OP_NCREF)
2667              return PCRE_ERROR_DFA_UCOND;
2668    
2669            /* The DEFINE condition is always false */
2670    
2671            if (condcode == OP_DEF)
2672              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2673    
2674            /* The only supported version of OP_RREF is for the value RREF_ANY,
2675            which means "test if in any recursion". We can't test for specifically
2676            recursed groups. */
2677    
2678            else if (condcode == OP_RREF || condcode == OP_NRREF)
2679              {
2680              int value = GET2(code, LINK_SIZE + 2);
2681              if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2682              if (md->recursive != NULL)
2683                { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2684              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2685            }            }
2686    
2687          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1338  for (;;) Line 2689  for (;;)
2689          else          else
2690            {            {
2691            int rc;            int rc;
2692            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2693            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2694    
2695            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2696    
# Line 1347  for (;;) Line 2698  for (;;)
2698              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2699              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2700              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2701              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2702              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2703              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2704              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2705              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2706              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2707    
2708              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2709            if ((rc >= 0) ==            if ((rc >= 0) ==
2710                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2711              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2712            else            else
2713              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2714            }            }
2715          }          }
2716        break;        break;
# Line 1368  for (;;) Line 2718  for (;;)
2718        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2719        case OP_RECURSE:        case OP_RECURSE:
2720          {          {
2721            dfa_recursion_info *ri;
2722          int local_offsets[1000];          int local_offsets[1000];
2723          int local_workspace[1000];          int local_workspace[1000];
2724            const pcre_uchar *callpat = start_code + GET(code, 1);
2725            int recno = (callpat == md->start_code)? 0 :
2726              GET2(callpat, 1 + LINK_SIZE);
2727          int rc;          int rc;
2728    
2729          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2730            recursing + 1));  
2731            /* Check for repeating a recursion without advancing the subject
2732            pointer. This should catch convoluted mutual recursions. (Some simple
2733            cases are caught at compile time.) */
2734    
2735            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2736              if (recno == ri->group_num && ptr == ri->subject_position)
2737                return PCRE_ERROR_RECURSELOOP;
2738    
2739            /* Remember this recursion and where we started it so as to
2740            catch infinite loops. */
2741    
2742            new_recursive.group_num = recno;
2743            new_recursive.subject_position = ptr;
2744            new_recursive.prevrec = md->recursive;
2745            md->recursive = &new_recursive;
2746    
2747          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2748            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2749            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2750            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2751            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2752            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2753            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2754            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2755            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2756            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
2757            rlevel,                               /* function recursion level */  
2758            recursing + 1);                       /* regex recurse level */          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2759    
2760          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2761            recursing + 1, rc));            rc));
2762    
2763          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2764    
# Line 1403  for (;;) Line 2772  for (;;)
2772            {            {
2773            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2774              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2775              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2776              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2777                if (utf)
2778                  {
2779                  const pcre_uchar *p = start_subject + local_offsets[rc];
2780                  const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2781                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2782                  }
2783    #endif
2784              if (charcount > 0)              if (charcount > 0)
2785                {                {
2786                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 1422  for (;;) Line 2796  for (;;)
2796        break;        break;
2797    
2798        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2799          case OP_BRAPOS:
2800          case OP_SBRAPOS:
2801          case OP_CBRAPOS:
2802          case OP_SCBRAPOS:
2803          case OP_BRAPOSZERO:
2804            {
2805            int charcount, matched_count;
2806            const pcre_uchar *local_ptr = ptr;
2807            BOOL allow_zero;
2808    
2809            if (codevalue == OP_BRAPOSZERO)
2810              {
2811              allow_zero = TRUE;
2812              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2813              }
2814            else allow_zero = FALSE;
2815    
2816            /* Loop to match the subpattern as many times as possible as if it were
2817            a complete pattern. */
2818    
2819            for (matched_count = 0;; matched_count++)
2820              {
2821              int local_offsets[2];
2822              int local_workspace[1000];
2823    
2824              int rc = internal_dfa_exec(
2825                md,                                   /* fixed match data */
2826                code,                                 /* this subexpression's code */
2827                local_ptr,                            /* where we currently are */
2828                (int)(ptr - start_subject),           /* start offset */
2829                local_offsets,                        /* offset vector */
2830                sizeof(local_offsets)/sizeof(int),    /* size of same */
2831                local_workspace,                      /* workspace vector */
2832                sizeof(local_workspace)/sizeof(int),  /* size of same */
2833                rlevel);                              /* function recursion level */
2834    
2835              /* Failed to match */
2836    
2837              if (rc < 0)
2838                {
2839                if (rc != PCRE_ERROR_NOMATCH) return rc;
2840                break;
2841                }
2842    
2843              /* Matched: break the loop if zero characters matched. */
2844    
2845              charcount = local_offsets[1] - local_offsets[0];
2846              if (charcount == 0) break;
2847              local_ptr += charcount;    /* Advance temporary position ptr */
2848              }
2849    
2850            /* At this point we have matched the subpattern matched_count
2851            times, and local_ptr is pointing to the character after the end of the
2852            last match. */
2853    
2854            if (matched_count > 0 || allow_zero)
2855              {
2856              const pcre_uchar *end_subpattern = code;
2857              int next_state_offset;
2858    
2859              do { end_subpattern += GET(end_subpattern, 1); }
2860                while (*end_subpattern == OP_ALT);
2861              next_state_offset =
2862                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2863    
2864              /* Optimization: if there are no more active states, and there
2865              are no new states yet set up, then skip over the subject string
2866              right here, to save looping. Otherwise, set up the new state to swing
2867              into action when the end of the matched substring is reached. */
2868    
2869              if (i + 1 >= active_count && new_count == 0)
2870                {
2871                ptr = local_ptr;
2872                clen = 0;
2873                ADD_NEW(next_state_offset, 0);
2874                }
2875              else
2876                {
2877                const pcre_uchar *p = ptr;
2878                const pcre_uchar *pp = local_ptr;
2879                charcount = (int)(pp - p);
2880    #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2881                if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2882    #endif
2883                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2884                }
2885              }
2886            }
2887          break;
2888    
2889          /*-----------------------------------------------------------------*/
2890        case OP_ONCE:        case OP_ONCE:
2891          case OP_ONCE_NC:
2892          {          {
2893          int local_offsets[2];          int local_offsets[2];
2894          int local_workspace[1000];          int local_workspace[1000];
# Line 1431  for (;;) Line 2897  for (;;)
2897            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2898            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2899            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2900            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2901            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2902            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2903            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2904            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2905            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2906    
2907          if (rc >= 0)          if (rc >= 0)
2908            {            {
2909            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2910            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2911            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2912    
2913            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2914              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2915            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2916                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2917    
2918            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2919            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 1456  for (;;) Line 2921  for (;;)
2921    
2922            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2923                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2924              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2925    
2926            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2927            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 1471  for (;;) Line 2936  for (;;)
2936            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2937            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2938            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2939            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2940    
2941            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2942              {              {
# Line 1494  for (;;) Line 2959  for (;;)
2959              }              }
2960            else            else
2961              {              {
2962              const uschar *p = start_subject + local_offsets[0];  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2963              const uschar *pp = start_subject + local_offsets[1];              if (utf)
2964              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;                {
2965                  const pcre_uchar *p = start_subject + local_offsets[0];
2966                  const pcre_uchar *pp = start_subject + local_offsets[1];
2967                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2968                  }
2969    #endif
2970              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2971              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2972                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2973              }              }
   
2974            }            }
2975          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2976          }          }
# Line 1512  for (;;) Line 2981  for (;;)
2981        /* Handle callouts */        /* Handle callouts */
2982    
2983        case OP_CALLOUT:        case OP_CALLOUT:
2984        if (pcre_callout != NULL)        rrc = 0;
2985          if (PUBL(callout) != NULL)
2986          {          {
2987          int rrc;          PUBL(callout_block) cb;
         pcre_callout_block cb;  
2988          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2989          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2990          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2991          cb.subject          = (char *)start_subject;  #if defined COMPILE_PCRE8
2992          cb.subject_length   = end_subject - start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2993          cb.start_match      = current_subject - start_subject;  #elif defined COMPILE_PCRE16
2994          cb.current_position = ptr - start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
2995    #elif defined COMPILE_PCRE32
2996            cb.subject          = (PCRE_SPTR32)start_subject;
2997    #endif
2998            cb.subject_length   = (int)(end_subject - start_subject);
2999            cb.start_match      = (int)(current_subject - start_subject);
3000            cb.current_position = (int)(ptr - start_subject);
3001          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
3002          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
3003          cb.capture_top      = 1;          cb.capture_top      = 1;
3004          cb.capture_last     = -1;          cb.capture_last     = -1;
3005          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
3006          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
3007          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3008          }          }
3009          if (rrc == 0)
3010            { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3011        break;        break;
3012    
3013    
# Line 1546  for (;;) Line 3023  for (;;)
3023    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
3024    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
3025    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
3026    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
3027    
3028      The "forced_fail" variable counts the number of (*F) encountered for the
3029      character. If it is equal to the original active_count (saved in
3030      workspace[1]) it means that (*F) was found on every active state. In this
3031      case we don't want to give a partial match.
3032    
3033      The "could_continue" variable is true if a state could have continued but
3034      for the fact that the end of the subject was reached. */
3035    
3036    if (new_count <= 0)    if (new_count <= 0)
3037      {      {
3038      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
3039          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on, and */
3040          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
3041          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
3042          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3043        {          ||                                           /* or... */
3044        if (offsetcount >= 2)          ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3045          {           match_count < 0)                            /* no matches */
3046          offsets[0] = current_subject - start_subject;          ) &&                                         /* And... */
3047          offsets[1] = end_subject - start_subject;          (
3048          }          partial_newline ||                           /* Either partial NL */
3049              (                                          /* or ... */
3050              ptr >= end_subject &&                /* End of subject and */
3051              ptr > md->start_used_ptr)            /* Inspected non-empty string */
3052              )
3053            )
3054        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
       }  
   
3055      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3056        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3057        rlevel*2-2, SP));        rlevel*2-2, SP));
3058      return match_count;      break;        /* In effect, "return", but see the comment below */
3059      }      }
3060    
3061    /* One or more states are active for the next character. */    /* One or more states are active for the next character. */
# Line 1575  for (;;) Line 3063  for (;;)
3063    ptr += clen;    /* Advance to next subject character */    ptr += clen;    /* Advance to next subject character */
3064    }               /* Loop to move along the subject string */    }               /* Loop to move along the subject string */
3065    
3066  /* Control never gets here, but we must keep the compiler happy. */  /* Control gets here from "break" a few lines above. We do it this way because
3067    if we use "return" above, we have compiler trouble. Some compilers warn if
3068    there's nothing here because they think the function doesn't return a value. On
3069    the other hand, if we put a dummy statement here, some more clever compilers
3070    complain that it can't be reached. Sigh. */
3071    
3072  DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n"  return match_count;
   "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP));  
 return PCRE_ERROR_NOMATCH;  
3073  }  }
3074    
3075    
# Line 1595  is not anchored. Line 3085  is not anchored.
3085    
3086  Arguments:  Arguments:
3087    argument_re     points to the compiled expression    argument_re     points to the compiled expression
3088    extra_data      points to extra data or is NULL (not currently used)    extra_data      points to extra data or is NULL
3089    subject         points to the subject string    subject         points to the subject string
3090    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
3091    start_offset    where to start in the subject string    start_offset    where to start in the subject string
# Line 1611  Returns:          > 0 => number of match Line 3101  Returns:          > 0 => number of match
3101                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3102  */  */
3103    
3104  PCRE_EXPORT int  #if defined COMPILE_PCRE8
3105    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3106  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3107    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3108    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3109    #elif defined COMPILE_PCRE16
3110    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3111    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3112      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3113      int offsetcount, int *workspace, int wscount)
3114    #elif defined COMPILE_PCRE32
3115    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3116    pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3117      PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3118      int offsetcount, int *workspace, int wscount)
3119    #endif
3120  {  {
3121  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3122  dfa_match_data match_block;  dfa_match_data match_block;
3123  BOOL utf8, anchored, startline, firstline;  dfa_match_data *md = &match_block;
3124  const uschar *current_subject, *end_subject, *lcc;  BOOL utf, anchored, startline, firstline;
3125    const pcre_uchar *current_subject, *end_subject;
 pcre_study_data internal_study;  
3126  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3127    
3128  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3129  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3130  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3131  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3132  int first_byte = -1;  pcre_uchar first_char = 0;
3133  int req_byte = -1;  pcre_uchar first_char2 = 0;
3134  int req_byte2 = -1;  pcre_uchar req_char = 0;
3135    pcre_uchar req_char2 = 0;
3136    int newline;
3137    
3138  /* Plausibility checks */  /* Plausibility checks */
3139    
# Line 1640  if (re == NULL || subject == NULL || wor Line 3142  if (re == NULL || subject == NULL || wor
3142     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3143  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3144  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3145    if (length < 0) return PCRE_ERROR_BADLENGTH;
3146    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3147    
3148    /* Check that the first field in the block is the magic number. If it is not,
3149    return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3150    REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3151    means that the pattern was probably compiled with a different endianness. */
3152    
3153    if (re->magic_number != MAGIC_NUMBER)
3154      return re->magic_number == REVERSED_MAGIC_NUMBER?
3155        PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3156    if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3157    
3158    /* If restarting after a partial match, do some sanity checks on the contents
3159    of the workspace. */
3160    
3161  /* We need to find the pointer to any study data before we test for byte  if ((options & PCRE_DFA_RESTART) != 0)
3162  flipping, so we scan the extra_data block first. This may set two fields in the    {
3163  match block, so we must initialize them beforehand. However, the other fields    if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3164  in the match block must not be set until after the byte flipping. */      workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3165          return PCRE_ERROR_DFA_BADRESTART;
3166      }
3167    
3168  match_block.tables = re->tables;  /* Set up study, callout, and table data */
3169  match_block.callout_data = NULL;  
3170    md->tables = re->tables;
3171    md->callout_data = NULL;
3172    
3173  if (extra_data != NULL)  if (extra_data != NULL)
3174    {    {
# Line 1655  if (extra_data != NULL) Line 3176  if (extra_data != NULL)
3176    if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)    if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3177      study = (const pcre_study_data *)extra_data->study_data;      study = (const pcre_study_data *)extra_data->study_data;
3178    if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;    if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3179      if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3180        return PCRE_ERROR_DFA_UMLIMIT;
3181    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3182      match_block.callout_data = extra_data->callout_data;      md->callout_data = extra_data->callout_data;
3183    if ((flags & PCRE_EXTRA_TABLES) != 0)    if ((flags & PCRE_EXTRA_TABLES) != 0)
3184      match_block.tables = extra_data->tables;      md->tables = extra_data->tables;
   }  
   
 /* Check that the first field in the block is the magic number. If it is not,  
 test for a regex that was compiled on a host of opposite endianness. If this is  
 the case, flipped values are put in internal_re and internal_study if there was  
 study data too. */  
   
 if (re->magic_number != MAGIC_NUMBER)  
   {  
   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);  
   if (re == NULL) return PCRE_ERROR_BADMAGIC;  
   if (study != NULL) study = &internal_study;  
3185    }    }
3186    
3187  /* Set some local values */  /* Set some local values */
3188    
3189  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3190  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3191  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3192    
3193    #ifdef SUPPORT_UTF
3194    /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3195    utf = (re->options & PCRE_UTF8) != 0;
3196    #else
3197    utf = FALSE;
3198    #endif
3199    
3200  utf8 = (re->options & PCRE_UTF8) != 0;  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3201  anchored = (options & PCRE_ANCHORED) != 0 || (re->options & PCRE_ANCHORED) != 0;    (re->options & PCRE_ANCHORED) != 0;
3202    
3203  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3204    
3205  match_block.start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3206      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3207  match_block.start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3208  match_block.end_subject = end_subject;  md->end_subject = end_subject;
3209  match_block.moptions = options;  md->start_offset = start_offset;
3210  match_block.poptions = re->options;  md->moptions = options;
3211    md->poptions = re->options;
3212    
3213    /* If the BSR option is not set at match time, copy what was set
3214    at compile time. */
3215    
3216    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3217      {
3218      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3219        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3220    #ifdef BSR_ANYCRLF
3221      else md->moptions |= PCRE_BSR_ANYCRLF;
3222    #endif
3223      }
3224    
3225    /* Handle different types of newline. The three bits give eight cases. If
3226    nothing is set at run time, whatever was used at compile time applies. */
3227    
3228    switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3229             PCRE_NEWLINE_BITS)
3230      {
3231      case 0: newline = NEWLINE; break;   /* Compile-time default */
3232      case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3233      case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3234      case PCRE_NEWLINE_CR+
3235           PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3236      case PCRE_NEWLINE_ANY: newline = -1; break;
3237      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3238      default: return PCRE_ERROR_BADNEWLINE;
3239      }
3240    
3241    if (newline == -2)
3242      {
3243      md->nltype = NLTYPE_ANYCRLF;
3244      }
3245    else if (newline < 0)
3246      {
3247      md->nltype = NLTYPE_ANY;
3248      }
3249    else
3250      {
3251      md->nltype = NLTYPE_FIXED;
3252      if (newline > 255)
3253        {
3254        md->nllen = 2;
3255        md->nl[0] = (newline >> 8) & 255;
3256        md->nl[1] = newline & 255;
3257        }
3258      else
3259        {
3260        md->nllen = 1;
3261        md->nl[0] = newline;
3262        }
3263      }
3264    
3265  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3266  back the character offset. */  back the character offset. */
3267    
3268  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3269  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3270    {    {
3271    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3272      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3273    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3274      {      {
3275      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3276        {        {
3277        tb &= 0xc0;        offsets[0] = erroroffset;
3278        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3279        }        }
3280    #if defined COMPILE_PCRE8
3281        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3282          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3283    #elif defined COMPILE_PCRE16
3284        return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3285          PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3286    #elif defined COMPILE_PCRE32
3287        return PCRE_ERROR_BADUTF32;
3288    #endif
3289      }      }
3290    #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3291      if (start_offset > 0 && start_offset < length &&
3292            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3293        return PCRE_ERROR_BADUTF8_OFFSET;
3294    #endif
3295    }    }
3296  #endif  #endif
3297    
# Line 1715  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3299  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3299  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3300  in other programs later. */  in other programs later. */
3301    
3302  if (match_block.tables == NULL) match_block.tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3303    
3304  /* The lower casing table and the "must be at the start of a line" flag are  /* The "must be at the start of a line" flags are used in a loop when finding
3305  used in a loop when finding where to start. */  where to start. */
3306    
3307  lcc = match_block.tables + lcc_offset;  startline = (re->flags & PCRE_STARTLINE) != 0;
 startline = (re->options & PCRE_STARTLINE) != 0;  
3308  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3309    
3310  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 1732  studied, there may be a bitmap of possib Line 3315  studied, there may be a bitmap of possib
3315    
3316  if (!anchored)  if (!anchored)
3317    {    {
3318    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3319      {      {
3320      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3321      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = (pcre_uchar)(re->first_char);
3322        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3323          {
3324          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3325    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3326          if (utf && first_char > 127)
3327            first_char2 = UCD_OTHERCASE(first_char);
3328    #endif
3329          }
3330      }      }
3331    else    else
3332      {      {
3333      if (startline && study != NULL &&      if (!startline && study != NULL &&
3334           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3335        start_bits = study->start_bits;        start_bits = study->start_bits;
3336      }      }
3337    }    }
# Line 1749  if (!anchored) Line 3339  if (!anchored)
3339  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
3340  character" set. */  character" set. */