/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 85 by nigel, Sat Feb 24 21:41:13 2007 UTC revision 723 by ph10, Sat Oct 8 15:55:23 2011 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2011 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 39  POSSIBILITY OF SUCH DAMAGE. Line 40  POSSIBILITY OF SUCH DAMAGE.
40    
41    
42  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
43  alternative matching function that uses a DFA algorithm. This is NOT Perl-  alternative matching function that uses a sort of DFA algorithm (not a true
44  compatible, but it has advantages in certain applications. */  FSM). This is NOT Perl-compatible, but it has advantages in certain
45    applications. */
46    
47    
48    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49    the performance of his patterns greatly. I could not use it as it stood, as it
50    was not thread safe, and made assumptions about pattern sizes. Also, it caused
51    test 7 to loop, and test 9 to crash with a segfault.
52    
53    The issue is the check for duplicate states, which is done by a simple linear
54    search up the state list. (Grep for "duplicate" below to find the code.) For
55    many patterns, there will never be many states active at one time, so a simple
56    linear search is fine. In patterns that have many active states, it might be a
57    bottleneck. The suggested code used an indexing scheme to remember which states
58    had previously been used for each character, and avoided the linear search when
59    it knew there was no chance of a duplicate. This was implemented when adding
60    states to the state lists.
61    
62    I wrote some thread-safe, not-limited code to try something similar at the time
63    of checking for duplicates (instead of when adding states), using index vectors
64    on the stack. It did give a 13% improvement with one specially constructed
65    pattern for certain subject strings, but on other strings and on many of the
66    simpler patterns in the test suite it did worse. The major problem, I think,
67    was the extra time to initialize the index. This had to be done for each call
68    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69    only once - I suspect this was the cause of the problems with the tests.)
70    
71    Overall, I concluded that the gains in some cases did not outweigh the losses
72    in others, so I abandoned this code. */
73    
74    
75    
76    #ifdef HAVE_CONFIG_H
77    #include "config.h"
78    #endif
79    
80    #define NLBLOCK md             /* Block containing newline information */
81    #define PSSTART start_subject  /* Field containing processed string start */
82    #define PSEND   end_subject    /* Field containing processed string end */
83    
84  #include "pcre_internal.h"  #include "pcre_internal.h"
85    
# Line 51  compatible, but it has advantages in cer Line 89  compatible, but it has advantages in cer
89  #define SP "                   "  #define SP "                   "
90    
91    
   
92  /*************************************************  /*************************************************
93  *      Code parameters and static tables         *  *      Code parameters and static tables         *
94  *************************************************/  *************************************************/
95    
96  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97  into others, under special conditions. A gap of 10 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
98  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
99    never stored, so we push them well clear of the normal opcodes. */
100  #define OP_PROP_EXTRA    (EXTRACT_BASIC_MAX+1)  
101  #define OP_EXTUNI_EXTRA  (EXTRACT_BASIC_MAX+11)  #define OP_PROP_EXTRA       300
102    #define OP_EXTUNI_EXTRA     320
103    #define OP_ANYNL_EXTRA      340
104    #define OP_HSPACE_EXTRA     360
105    #define OP_VSPACE_EXTRA     380
106    
107    
108  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
109  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
110  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
111  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112  small value. */  small value. Non-zero values in the table are the offsets from the opcode where
113    the character is to be found. ***NOTE*** If the start of this table is
114    modified, the three tables that follow must also be modified. */
115    
116  static uschar coptable[] = {  static const uschar coptable[] = {
117    0,                             /* End                                    */    0,                             /* End                                    */
118    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119    0, 0,                          /* Any, Anybyte                           */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0,                          /* \P, \p                                 */
122      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123      0,                             /* \X                                     */
124      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
125    1,                             /* Char                                   */    1,                             /* Char                                   */
126    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
127    1,                             /* not                                    */    1,                             /* not                                    */
128      1,                             /* noti                                   */
129    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
130    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131    3, 3, 3,                       /* upto, minupto, exact                   */    3, 3, 3,                       /* upto, minupto, exact                   */
132      1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      3, 3, 3,                       /* upto I, minupto I, exact I             */
135      1, 1, 1, 3,                    /* *+I, ++I, ?+I, upto+I                  */
136    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
137    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
138    3, 3, 3,                       /* NOT upto, minupto, exact               */    3, 3, 3,                       /* NOT upto, minupto, exact               */
139      1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
140      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
141      3, 3, 3,                       /* NOT upto I, minupto I, exact I         */
142      1, 1, 1, 3,                    /* NOT *+I, ++I, ?+I, upto+I              */
143    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
144    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
145    3, 3, 3,                       /* Type upto, minupto, exact              */    3, 3, 3,                       /* Type upto, minupto, exact              */
146      1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
147    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
148    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
149    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 95  static uschar coptable[] = { Line 151  static uschar coptable[] = {
151    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
152    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
153    0,                             /* REF                                    */    0,                             /* REF                                    */
154      0,                             /* REFI                                   */
155    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
156    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
157    0,                             /* Alt                                    */    0,                             /* Alt                                    */
158    0,                             /* Ket                                    */    0,                             /* Ket                                    */
159    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
160    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
161      0,                             /* KetRpos                                */
162      0,                             /* Reverse                                */
163    0,                             /* Assert                                 */    0,                             /* Assert                                 */
164    0,                             /* Assert not                             */    0,                             /* Assert not                             */
165    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
166    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
167      0, 0,                          /* ONCE, ONCE_NC                          */
168      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
169      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
170      0, 0,                          /* CREF, NCREF                            */
171      0, 0,                          /* RREF, NRREF                            */
172      0,                             /* DEF                                    */
173      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
174      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
175      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
176      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
177      0, 0                           /* CLOSE, SKIPZERO  */
178    };
179    
180    /* This table identifies those opcodes that inspect a character. It is used to
181    remember the fact that a character could have been inspected when the end of
182    the subject is reached. ***NOTE*** If the start of this table is modified, the
183    two tables that follow must also be modified. */
184    
185    static const uschar poptable[] = {
186      0,                             /* End                                    */
187      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
188      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
189      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
190      1, 1,                          /* \P, \p                                 */
191      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
192      1,                             /* \X                                     */
193      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
194      1,                             /* Char                                   */
195      1,                             /* Chari                                  */
196      1,                             /* not                                    */
197      1,                             /* noti                                   */
198      /* Positive single-char repeats                                          */
199      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
200      1, 1, 1,                       /* upto, minupto, exact                   */
201      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
202      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
203      1, 1, 1,                       /* upto I, minupto I, exact I             */
204      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
205      /* Negative single-char repeats - only for chars < 256                   */
206      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
207      1, 1, 1,                       /* NOT upto, minupto, exact               */
208      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
209      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
210      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
211      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
212      /* Positive type repeats                                                 */
213      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
214      1, 1, 1,                       /* Type upto, minupto, exact              */
215      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
216      /* Character class & ref repeats                                         */
217      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
218      1, 1,                          /* CRRANGE, CRMINRANGE                    */
219      1,                             /* CLASS                                  */
220      1,                             /* NCLASS                                 */
221      1,                             /* XCLASS - variable length               */
222      0,                             /* REF                                    */
223      0,                             /* REFI                                   */
224      0,                             /* RECURSE                                */
225      0,                             /* CALLOUT                                */
226      0,                             /* Alt                                    */
227      0,                             /* Ket                                    */
228      0,                             /* KetRmax                                */
229      0,                             /* KetRmin                                */
230      0,                             /* KetRpos                                */
231    0,                             /* Reverse                                */    0,                             /* Reverse                                */
232    0,                             /* Once                                   */    0,                             /* Assert                                 */
233    0,                             /* COND                                   */    0,                             /* Assert not                             */
234    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
235    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0,                             /* Assert behind not                      */
236    0,                             /* BRANUMBER                              */    0, 0,                          /* ONCE, ONCE_NC                          */
237    0                              /* BRA                                    */    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
238      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
239      0, 0,                          /* CREF, NCREF                            */
240      0, 0,                          /* RREF, NRREF                            */
241      0,                             /* DEF                                    */
242      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
243      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
244      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
245      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
246      0, 0                           /* CLOSE, SKIPZERO                        */
247  };  };
248    
249  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
250  and \w */  and \w */
251    
252  static uschar toptable1[] = {  static const uschar toptable1[] = {
253    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
254    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
255    ctype_space, ctype_space,    ctype_space, ctype_space,
256    ctype_word,  ctype_word,    ctype_word,  ctype_word,
257    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
258  };  };
259    
260  static uschar toptable2[] = {  static const uschar toptable2[] = {
261    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
262    ctype_digit, 0,    ctype_digit, 0,
263    ctype_space, 0,    ctype_space, 0,
264    ctype_word,  0,    ctype_word,  0,
265    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
266  };  };
267    
268    
# Line 142  these structures in, is a vector of ints Line 274  these structures in, is a vector of ints
274  typedef struct stateblock {  typedef struct stateblock {
275    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
276    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
277    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
278  } stateblock;  } stateblock;
279    
280  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
281    
282    
283  #ifdef DEBUG  #ifdef PCRE_DEBUG
284  /*************************************************  /*************************************************
285  *             Print character string             *  *             Print character string             *
286  *************************************************/  *************************************************/
# Line 198  Arguments: Line 329  Arguments:
329    offsetcount       size of same    offsetcount       size of same
330    workspace         vector of workspace    workspace         vector of workspace
331    wscount           size of same    wscount           size of same
   ims               the current ims flags  
332    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
333    
334  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
335                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
336                       -1 => failed to match                       -1 => failed to match
337                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
338    
# Line 215  for the current character, one for the f Line 344  for the current character, one for the f
344      { \      { \
345      next_active_state->offset = (x); \      next_active_state->offset = (x); \
346      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
347      next_active_state++; \      next_active_state++; \
348      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
349      } \      } \
# Line 226  for the current character, one for the f Line 354  for the current character, one for the f
354      { \      { \
355      next_active_state->offset = (x); \      next_active_state->offset = (x); \
356      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
357      next_active_state->data   = (z); \      next_active_state->data   = (z); \
358      next_active_state++; \      next_active_state++; \
359      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 238  for the current character, one for the f Line 365  for the current character, one for the f
365      { \      { \
366      next_new_state->offset = (x); \      next_new_state->offset = (x); \
367      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
368      next_new_state++; \      next_new_state++; \
369      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
370      } \      } \
# Line 249  for the current character, one for the f Line 375  for the current character, one for the f
375      { \      { \
376      next_new_state->offset = (x); \      next_new_state->offset = (x); \
377      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
378      next_new_state->data   = (z); \      next_new_state->data   = (z); \
379      next_new_state++; \      next_new_state++; \
380      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 268  internal_dfa_exec( Line 393  internal_dfa_exec(
393    int offsetcount,    int offsetcount,
394    int *workspace,    int *workspace,
395    int wscount,    int wscount,
396    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
397  {  {
398  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
399  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
400    
401  const uschar *ctypes, *lcc, *fcc;  const uschar *ctypes, *lcc, *fcc;
402  const uschar *ptr;  const uschar *ptr;
403  const uschar *end_code;  const uschar *end_code, *first_op;
404    
405    dfa_recursion_info new_recursive;
406    
407  int active_count, new_count, match_count;  int active_count, new_count, match_count;
408    
# Line 288  const uschar *start_subject = md->start_ Line 413  const uschar *start_subject = md->start_
413  const uschar *end_subject = md->end_subject;  const uschar *end_subject = md->end_subject;
414  const uschar *start_code = md->start_code;  const uschar *start_code = md->start_code;
415    
416    #ifdef SUPPORT_UTF8
417  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
418    #else
419    BOOL utf8 = FALSE;
420    #endif
421    
422  rlevel++;  rlevel++;
423  offsetcount &= (-2);  offsetcount &= (-2);
# Line 298  wscount = (wscount - (wscount % (INTS_PE Line 427  wscount = (wscount - (wscount % (INTS_PE
427            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
428    
429  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
430    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
431    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
432    
433  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
434  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 311  active_states = (stateblock *)(workspace Line 440  active_states = (stateblock *)(workspace
440  next_new_state = new_states = active_states + wscount;  next_new_state = new_states = active_states + wscount;
441  new_count = 0;  new_count = 0;
442    
443    first_op = this_start_code + 1 + LINK_SIZE +
444      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
445        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
446    
447  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
448  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
449  makes it possible to use this function recursively, when we want to stop at a  makes it possible to use this function recursively, when we want to stop at a
# Line 320  If the first opcode in the first alterna Line 453  If the first opcode in the first alterna
453  a backward assertion. In that case, we have to find out the maximum amount to  a backward assertion. In that case, we have to find out the maximum amount to
454  move back, and set up each alternative appropriately. */  move back, and set up each alternative appropriately. */
455    
456  if (this_start_code[1+LINK_SIZE] == OP_REVERSE)  if (*first_op == OP_REVERSE)
457    {    {
458    int max_back = 0;    int max_back = 0;
459    int gone_back;    int gone_back;
# Line 358  if (this_start_code[1+LINK_SIZE] == OP_R Line 491  if (this_start_code[1+LINK_SIZE] == OP_R
491    
492      {      {
493      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
494        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
495      current_subject -= gone_back;      current_subject -= gone_back;
496      }      }
497    
498      /* Save the earliest consulted character */
499    
500      if (current_subject < md->start_used_ptr)
501        md->start_used_ptr = current_subject;
502    
503    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
504    
505    end_code = this_start_code;    end_code = this_start_code;
# Line 370  if (this_start_code[1+LINK_SIZE] == OP_R Line 508  if (this_start_code[1+LINK_SIZE] == OP_R
508      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
509      if (back <= gone_back)      if (back <= gone_back)
510        {        {
511        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
512        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
513        }        }
514      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 402  else Line 540  else
540    
541    else    else
542      {      {
543        int length = 1 + LINK_SIZE +
544          ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
545            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
546            2:0);
547      do      do
548        {        {
549        ADD_NEW(end_code - start_code + 1 + LINK_SIZE, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
550        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
551          length = 1 + LINK_SIZE;
552        }        }
553      while (*end_code == OP_ALT);      while (*end_code == OP_ALT);
554      }      }
# Line 421  ptr = current_subject; Line 564  ptr = current_subject;
564  for (;;)  for (;;)
565    {    {
566    int i, j;    int i, j;
567    int c, d, clen, dlen;    int clen, dlen;
568      unsigned int c, d;
569      int forced_fail = 0;
570      BOOL could_continue = FALSE;
571    
572    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
573    new state list. */    new state list. */
# Line 435  for (;;) Line 581  for (;;)
581    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
582    workspace[1] = active_count;    workspace[1] = active_count;
583    
584  #ifdef DEBUG  #ifdef PCRE_DEBUG
585    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
586    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars((uschar *)ptr, strlen((char *)ptr), stdout);
587    printf("\"\n");    printf("\"\n");
# Line 457  for (;;) Line 603  for (;;)
603    
604    if (ptr < end_subject)    if (ptr < end_subject)
605      {      {
606      clen = 1;      clen = 1;        /* Number of bytes in the character */
607  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
608      if (utf8) { GETCHARLEN(c, ptr, clen); } else      if (utf8) { GETCHARLEN(c, ptr, clen); } else
609  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
# Line 465  for (;;) Line 611  for (;;)
611      }      }
612    else    else
613      {      {
614      clen = 0;    /* At end subject */      clen = 0;        /* This indicates the end of the subject */
615      c = -1;      c = NOTACHAR;    /* This value should never actually be used */
616      }      }
617    
618    /* Scan up the active states and act on each one. The result of an action    /* Scan up the active states and act on each one. The result of an action
# Line 477  for (;;) Line 623  for (;;)
623    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
624      {      {
625      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
626        BOOL caseless = FALSE;
627      const uschar *code;      const uschar *code;
628      int state_offset = current_state->offset;      int state_offset = current_state->offset;
629      int count, codevalue;      int count, codevalue, rrc;
     int chartype, othercase;  
630    
631  #ifdef DEBUG  #ifdef PCRE_DEBUG
632      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
633      if (c < 0) printf("-1\n");      if (clen == 0) printf("EOL\n");
634        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
635          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
636  #endif  #endif
637    
     /* This variable is referred to implicitly in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
638      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
639      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
640      been skipped". */      been skipped". */
# Line 512  for (;;) Line 654  for (;;)
654          }          }
655        }        }
656    
657      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
658        See the note at the head of this module about the possibility of improving
659        performance here. */
660    
661      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
662        {        {
# Line 528  for (;;) Line 672  for (;;)
672    
673      code = start_code + state_offset;      code = start_code + state_offset;
674      codevalue = *code;      codevalue = *code;
675      if (codevalue >= OP_BRA) codevalue = OP_BRA; /* All brackets are equal */  
676        /* If this opcode inspects a character, but we are at the end of the
677        subject, remember the fact for use when testing for a partial match. */
678    
679        if (clen == 0 && poptable[codevalue] != 0)
680          could_continue = TRUE;
681    
682      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
683      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
# Line 536  for (;;) Line 685  for (;;)
685      permitted.      permitted.
686    
687      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
688      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long. We
689      Unfortunately, we have to take special action to deal with  \P, \p, and      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
690      \X in this case. To keep the other cases fast, convert these ones to new      this case. To keep the other cases fast, convert these ones to new opcodes.
691      opcodes. */      */
692    
693      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
694        {        {
# Line 550  for (;;) Line 699  for (;;)
699        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
700        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
701          {          {
702          if (d == OP_ANYBYTE) return PCRE_ERROR_DFA_UITEM;          switch(d)
703          if (d >= OP_NOTPROP)            {
704            codevalue += (d == OP_EXTUNI)? OP_EXTUNI_EXTRA : OP_PROP_EXTRA;            case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
705              case OP_NOTPROP:
706              case OP_PROP: codevalue += OP_PROP_EXTRA; break;
707              case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
708              case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
709              case OP_NOT_HSPACE:
710              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
711              case OP_NOT_VSPACE:
712              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
713              default: break;
714              }
715          }          }
716        }        }
717      else      else
718        {        {
719        dlen = 0;         /* Not strictly necessary, but compilers moan */        dlen = 0;         /* Not strictly necessary, but compilers moan */
720        d = -1;           /* if these variables are not set. */        d = NOTACHAR;     /* if these variables are not set. */
721        }        }
722    
723    
# Line 566  for (;;) Line 725  for (;;)
725    
726      switch (codevalue)      switch (codevalue)
727        {        {
728    /* ========================================================================== */
729          /* These cases are never obeyed. This is a fudge that causes a compile-
730          time error if the vectors coptable or poptable, which are indexed by
731          opcode, are not the correct length. It seems to be the only way to do
732          such a check at compile time, as the sizeof() operator does not work
733          in the C preprocessor. */
734    
735          case OP_TABLE_LENGTH:
736          case OP_TABLE_LENGTH +
737            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
738             (sizeof(poptable) == OP_TABLE_LENGTH)):
739          break;
740    
741  /* ========================================================================== */  /* ========================================================================== */
742        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
743        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
744        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
745          subpattern, because the possessive subpattern repeats are always handled
746          using recursive calls. Thus, it never adds any new states.
747    
748          At the end of the (sub)pattern, unless we have an empty string and
749          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
750          start of the subject, save the match data, shifting up all previous
751        matches so we always have the longest first. */        matches so we always have the longest first. */
752    
753        case OP_KET:        case OP_KET:
754        case OP_KETRMIN:        case OP_KETRMIN:
755        case OP_KETRMAX:        case OP_KETRMAX:
756          case OP_KETRPOS:
757        if (code != end_code)        if (code != end_code)
758          {          {
759          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 584  for (;;) Line 762  for (;;)
762            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
763            }            }
764          }          }
765        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
766          {          {
767          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
768            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
769              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
770          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
771          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
772          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
773            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
774            offsets[0] = current_subject - start_subject;                match_count = 0;
775            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
776            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
777              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
778            }              {
779          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
780            {              offsets[1] = (int)(ptr - start_subject);
781            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
782              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], current_subject));
783              match_count, rlevel*2-2, SP));              }
784            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
785                {
786                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
787                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
788                  match_count, rlevel*2-2, SP));
789                return match_count;
790                }
791            }            }
792          }          }
793        break;        break;
# Line 615  for (;;) Line 799  for (;;)
799        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
800        case OP_ALT:        case OP_ALT:
801        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
802        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
803        break;        break;
804    
805        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
806        case OP_BRA:        case OP_BRA:
807          case OP_SBRA:
808        do        do
809          {          {
810          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
811          code += GET(code, 1);          code += GET(code, 1);
812          }          }
813        while (*code == OP_ALT);        while (*code == OP_ALT);
814        break;        break;
815    
816        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
817          case OP_CBRA:
818          case OP_SCBRA:
819          ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE),  0);
820          code += GET(code, 1);
821          while (*code == OP_ALT)
822            {
823            ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
824            code += GET(code, 1);
825            }
826          break;
827    
828          /*-----------------------------------------------------------------*/
829        case OP_BRAZERO:        case OP_BRAZERO:
830        case OP_BRAMINZERO:        case OP_BRAMINZERO:
831        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
832        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
833        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
834        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
835        break;        break;
836    
837        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
838        case OP_BRANUMBER:        case OP_SKIPZERO:
839        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);        code += 1 + GET(code, 2);
840          while (*code == OP_ALT) code += GET(code, 1);
841          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
842        break;        break;
843    
844        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
845        case OP_CIRC:        case OP_CIRC:
846        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE))  
847          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
848        break;        break;
849    
850        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
851        case OP_EOD:        case OP_CIRCM:
852        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
853              (ptr != end_subject && WAS_NEWLINE(ptr)))
854            { ADD_ACTIVE(state_offset + 1, 0); }
855        break;        break;
856    
857        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
858        case OP_OPT:        case OP_EOD:
859        ims = code[1];        if (ptr >= end_subject)
860        ADD_ACTIVE(state_offset + 2, 0);          {
861            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
862              could_continue = TRUE;
863            else { ADD_ACTIVE(state_offset + 1, 0); }
864            }
865        break;        break;
866    
867        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 679  for (;;) Line 883  for (;;)
883    
884        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
885        case OP_ANY:        case OP_ANY:
886        if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0))        if (clen > 0 && !IS_NEWLINE(ptr))
887            { ADD_NEW(state_offset + 1, 0); }
888          break;
889    
890          /*-----------------------------------------------------------------*/
891          case OP_ALLANY:
892          if (clen > 0)
893          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
894        break;        break;
895    
896        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
897        case OP_EODN:        case OP_EODN:
898        if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
899            could_continue = TRUE;
900          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
901          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
902        break;        break;
903    
# Line 693  for (;;) Line 905  for (;;)
905        case OP_DOLL:        case OP_DOLL:
906        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
907          {          {
908          if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
909                                  (ims & PCRE_MULTILINE) != 0)))            could_continue = TRUE;
910            else if (clen == 0 ||
911                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
912                   (ptr == end_subject - md->nllen)
913                ))
914              { ADD_ACTIVE(state_offset + 1, 0); }
915            }
916          break;
917    
918          /*-----------------------------------------------------------------*/
919          case OP_DOLLM:
920          if ((md->moptions & PCRE_NOTEOL) == 0)
921            {
922            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
923              could_continue = TRUE;
924            else if (clen == 0 ||
925                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
926            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
927          }          }
928        else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0)        else if (IS_NEWLINE(ptr))
929          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
930        break;        break;
931    
# Line 729  for (;;) Line 957  for (;;)
957          if (ptr > start_subject)          if (ptr > start_subject)
958            {            {
959            const uschar *temp = ptr - 1;            const uschar *temp = ptr - 1;
960              if (temp < md->start_used_ptr) md->start_used_ptr = temp;
961  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
962            if (utf8) BACKCHAR(temp);            if (utf8) BACKCHAR(temp);
963  #endif  #endif
964            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
965    #ifdef SUPPORT_UCP
966              if ((md->poptions & PCRE_UCP) != 0)
967                {
968                if (d == '_') left_word = TRUE; else
969                  {
970                  int cat = UCD_CATEGORY(d);
971                  left_word = (cat == ucp_L || cat == ucp_N);
972                  }
973                }
974              else
975    #endif
976            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
977            }            }
978          else left_word = 0;          else left_word = FALSE;
979    
980          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
981            else right_word = 0;            {
982    #ifdef SUPPORT_UCP
983              if ((md->poptions & PCRE_UCP) != 0)
984                {
985                if (c == '_') right_word = TRUE; else
986                  {
987                  int cat = UCD_CATEGORY(c);
988                  right_word = (cat == ucp_L || cat == ucp_N);
989                  }
990                }
991              else
992    #endif
993              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
994              }
995            else right_word = FALSE;
996    
997          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
998            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 746  for (;;) Line 1000  for (;;)
1000        break;        break;
1001    
1002    
 #ifdef SUPPORT_UCP  
   
1003        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1004        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
1005        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
1006        */        */
1007    
1008    #ifdef SUPPORT_UCP
1009        case OP_PROP:        case OP_PROP:
1010        case OP_NOTPROP:        case OP_NOTPROP:
1011        if (clen > 0)        if (clen > 0)
1012          {          {
1013          int rqdtype, category;          BOOL OK;
1014          category = _pcre_ucp_findchar(c, &chartype, &othercase);          const ucd_record * prop = GET_UCD(c);
1015          rqdtype = code[1];          switch(code[1])
         if (rqdtype >= 128)  
1016            {            {
1017            if ((rqdtype - 128 == category) == (codevalue == OP_PROP))            case PT_ANY:
1018              { ADD_NEW(state_offset + 2, 0); }            OK = TRUE;
1019            }            break;
1020          else  
1021            {            case PT_LAMP:
1022            if ((rqdtype == chartype) == (codevalue == OP_PROP))            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1023              { ADD_NEW(state_offset + 2, 0); }                 prop->chartype == ucp_Lt;
1024              break;
1025    
1026              case PT_GC:
1027              OK = _pcre_ucp_gentype[prop->chartype] == code[2];
1028              break;
1029    
1030              case PT_PC:
1031              OK = prop->chartype == code[2];
1032              break;
1033    
1034              case PT_SC:
1035              OK = prop->script == code[2];
1036              break;
1037    
1038              /* These are specials for combination cases. */
1039    
1040              case PT_ALNUM:
1041              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1042                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1043              break;
1044    
1045              case PT_SPACE:    /* Perl space */
1046              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1047                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1048              break;
1049    
1050              case PT_PXSPACE:  /* POSIX space */
1051              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1052                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1053                   c == CHAR_FF || c == CHAR_CR;
1054              break;
1055    
1056              case PT_WORD:
1057              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1058                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1059                   c == CHAR_UNDERSCORE;
1060              break;
1061    
1062              /* Should never occur, but keep compilers from grumbling. */
1063    
1064              default:
1065              OK = codevalue != OP_PROP;
1066              break;
1067            }            }
1068    
1069            if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1070          }          }
1071        break;        break;
1072  #endif  #endif
# Line 779  for (;;) Line 1076  for (;;)
1076  /* ========================================================================== */  /* ========================================================================== */
1077        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
1078        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
1079        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1080        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1081    
1082        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1083        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
1084          case OP_TYPEPOSPLUS:
1085        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1086        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1087        if (clen > 0)        if (clen > 0)
1088          {          {
1089          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1090              (c < 256 &&              (c < 256 &&
1091                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1092                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1093            {            {
1094              if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1095                {
1096                active_count--;            /* Remove non-match possibility */
1097                next_active_state--;
1098                }
1099            count++;            count++;
1100            ADD_NEW(state_offset, count);            ADD_NEW(state_offset, count);
1101            }            }
# Line 802  for (;;) Line 1105  for (;;)
1105        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1106        case OP_TYPEQUERY:        case OP_TYPEQUERY:
1107        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
1108          case OP_TYPEPOSQUERY:
1109        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1110        if (clen > 0)        if (clen > 0)
1111          {          {
1112          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1113              (c < 256 &&              (c < 256 &&
1114                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1115                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1116            {            {
1117              if (codevalue == OP_TYPEPOSQUERY)
1118                {
1119                active_count--;            /* Remove non-match possibility */
1120                next_active_state--;
1121                }
1122            ADD_NEW(state_offset + 2, 0);            ADD_NEW(state_offset + 2, 0);
1123            }            }
1124          }          }
# Line 818  for (;;) Line 1127  for (;;)
1127        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1128        case OP_TYPESTAR:        case OP_TYPESTAR:
1129        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
1130          case OP_TYPEPOSSTAR:
1131        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1132        if (clen > 0)        if (clen > 0)
1133          {          {
1134          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1135              (c < 256 &&              (c < 256 &&
1136                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1137                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1138            {            {
1139              if (codevalue == OP_TYPEPOSSTAR)
1140                {
1141                active_count--;            /* Remove non-match possibility */
1142                next_active_state--;
1143                }
1144            ADD_NEW(state_offset, 0);            ADD_NEW(state_offset, 0);
1145            }            }
1146          }          }
# Line 833  for (;;) Line 1148  for (;;)
1148    
1149        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1150        case OP_TYPEEXACT:        case OP_TYPEEXACT:
1151          count = current_state->count;  /* Number already matched */
1152          if (clen > 0)
1153            {
1154            if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1155                (c < 256 &&
1156                  (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1157                  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1158              {
1159              if (++count >= GET2(code, 1))
1160                { ADD_NEW(state_offset + 4, 0); }
1161              else
1162                { ADD_NEW(state_offset, count); }
1163              }
1164            }
1165          break;
1166    
1167          /*-----------------------------------------------------------------*/
1168        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1169        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1170        if (codevalue != OP_TYPEEXACT)        case OP_TYPEPOSUPTO:
1171          { ADD_ACTIVE(state_offset + 4, 0); }        ADD_ACTIVE(state_offset + 4, 0);
1172        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1173        if (clen > 0)        if (clen > 0)
1174          {          {
1175          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1176              (c < 256 &&              (c < 256 &&
1177                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1178                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1179            {            {
1180              if (codevalue == OP_TYPEPOSUPTO)
1181                {
1182                active_count--;           /* Remove non-match possibility */
1183                next_active_state--;
1184                }
1185            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1186              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 4, 0); }
1187            else            else
# Line 855  for (;;) Line 1192  for (;;)
1192    
1193  /* ========================================================================== */  /* ========================================================================== */
1194        /* These are virtual opcodes that are used when something like        /* These are virtual opcodes that are used when something like
1195        OP_TYPEPLUS has OP_PROP, OP_NOTPROP, or OP_EXTUNI as its argument. It        OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1196        keeps the code above fast for the other cases. The argument is in the        argument. It keeps the code above fast for the other cases. The argument
1197        d variable. */        is in the d variable. */
1198    
1199    #ifdef SUPPORT_UCP
1200        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
1201        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1202          case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1203        count = current_state->count;           /* Already matched */        count = current_state->count;           /* Already matched */
1204        if (count > 0) { ADD_ACTIVE(state_offset + 3, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1205        if (clen > 0)        if (clen > 0)
1206          {          {
1207          int category = _pcre_ucp_findchar(c, &chartype, &othercase);          BOOL OK;
1208          int rqdtype = code[2];          const ucd_record * prop = GET_UCD(c);
1209          if ((d == OP_PROP) ==          switch(code[2])
1210              (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))            {
1211            { count++; ADD_NEW(state_offset, count); }            case PT_ANY:
1212              OK = TRUE;
1213              break;
1214    
1215              case PT_LAMP:
1216              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1217                prop->chartype == ucp_Lt;
1218              break;
1219    
1220              case PT_GC:
1221              OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1222              break;
1223    
1224              case PT_PC:
1225              OK = prop->chartype == code[3];
1226              break;
1227    
1228              case PT_SC:
1229              OK = prop->script == code[3];
1230              break;
1231    
1232              /* These are specials for combination cases. */
1233    
1234              case PT_ALNUM:
1235              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1236                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1237              break;
1238    
1239              case PT_SPACE:    /* Perl space */
1240              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1241                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1242              break;
1243    
1244              case PT_PXSPACE:  /* POSIX space */
1245              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1246                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1247                   c == CHAR_FF || c == CHAR_CR;
1248              break;
1249    
1250              case PT_WORD:
1251              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1252                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1253                   c == CHAR_UNDERSCORE;
1254              break;
1255    
1256              /* Should never occur, but keep compilers from grumbling. */
1257    
1258              default:
1259              OK = codevalue != OP_PROP;
1260              break;
1261              }
1262    
1263            if (OK == (d == OP_PROP))
1264              {
1265              if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1266                {
1267                active_count--;           /* Remove non-match possibility */
1268                next_active_state--;
1269                }
1270              count++;
1271              ADD_NEW(state_offset, count);
1272              }
1273          }          }
1274        break;        break;
1275    
1276        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1277        case OP_EXTUNI_EXTRA + OP_TYPEPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1278        case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1279          case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1280        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1281        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1282        if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1283          {          {
1284          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1285          int ncount = 0;          int ncount = 0;
1286            if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1287              {
1288              active_count--;           /* Remove non-match possibility */
1289              next_active_state--;
1290              }
1291          while (nptr < end_subject)          while (nptr < end_subject)
1292            {            {
1293            int nd;            int nd;
1294            int ndlen = 1;            int ndlen = 1;
1295            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1296            if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1297            ncount++;            ncount++;
1298            nptr += ndlen;            nptr += ndlen;
1299            }            }
# Line 895  for (;;) Line 1301  for (;;)
1301          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1302          }          }
1303        break;        break;
1304    #endif
1305    
1306        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1307        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1308        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1309        count = 3;        case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1310        goto QS1;        count = current_state->count;  /* Already matched */
1311          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
       case OP_PROP_EXTRA + OP_TYPESTAR:  
       case OP_PROP_EXTRA + OP_TYPEMINSTAR:  
       count = 0;  
   
       QS1:  
   
       ADD_ACTIVE(state_offset + 3, 0);  
1312        if (clen > 0)        if (clen > 0)
1313          {          {
1314          int category = _pcre_ucp_findchar(c, &chartype, &othercase);          int ncount = 0;
1315          int rqdtype = code[2];          switch (c)
1316          if ((d == OP_PROP) ==            {
1317              (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))            case 0x000b:
1318            { ADD_NEW(state_offset + count, 0); }            case 0x000c:
1319              case 0x0085:
1320              case 0x2028:
1321              case 0x2029:
1322              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323              goto ANYNL01;
1324    
1325              case 0x000d:
1326              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327              /* Fall through */
1328    
1329              ANYNL01:
1330              case 0x000a:
1331              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1332                {
1333                active_count--;           /* Remove non-match possibility */
1334                next_active_state--;
1335                }
1336              count++;
1337              ADD_NEW_DATA(-state_offset, count, ncount);
1338              break;
1339    
1340              default:
1341              break;
1342              }
1343          }          }
1344        break;        break;
1345    
1346        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1347        case OP_EXTUNI_EXTRA + OP_TYPEQUERY:        case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1348        case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:        case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1349        count = 2;        case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1350        goto QS2;        count = current_state->count;  /* Already matched */
1351          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1352        case OP_EXTUNI_EXTRA + OP_TYPESTAR:        if (clen > 0)
1353        case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:          {
1354        count = 0;          BOOL OK;
1355            switch (c)
1356              {
1357              case 0x000a:
1358              case 0x000b:
1359              case 0x000c:
1360              case 0x000d:
1361              case 0x0085:
1362              case 0x2028:
1363              case 0x2029:
1364              OK = TRUE;
1365              break;
1366    
1367        QS2:            default:
1368              OK = FALSE;
1369              break;
1370              }
1371    
1372        ADD_ACTIVE(state_offset + 2, 0);          if (OK == (d == OP_VSPACE))
       if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)  
         {  
         const uschar *nptr = ptr + clen;  
         int ncount = 0;  
         while (nptr < end_subject)  
1373            {            {
1374            int nd;            if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1375            int ndlen = 1;              {
1376            GETCHARLEN(nd, nptr, ndlen);              active_count--;           /* Remove non-match possibility */
1377            if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;              next_active_state--;
1378            ncount++;              }
1379            nptr += ndlen;            count++;
1380              ADD_NEW_DATA(-state_offset, count, 0);
1381            }            }
         ADD_NEW_DATA(-(state_offset + count), 0, ncount);  
1382          }          }
1383        break;        break;
1384    
1385        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1386        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1387        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1388        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1389        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        count = current_state->count;  /* Already matched */
1390          { ADD_ACTIVE(state_offset + 5, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
       count = current_state->count;  /* Number already matched */  
1391        if (clen > 0)        if (clen > 0)
1392          {          {
1393          int category = _pcre_ucp_findchar(c, &chartype, &othercase);          BOOL OK;
1394          int rqdtype = code[4];          switch (c)
         if ((d == OP_PROP) ==  
             (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))  
1395            {            {
1396            if (++count >= GET2(code, 1))            case 0x09:      /* HT */
1397              { ADD_NEW(state_offset + 5, 0); }            case 0x20:      /* SPACE */
1398            else            case 0xa0:      /* NBSP */
1399              { ADD_NEW(state_offset, count); }            case 0x1680:    /* OGHAM SPACE MARK */
1400              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1401              case 0x2000:    /* EN QUAD */
1402              case 0x2001:    /* EM QUAD */
1403              case 0x2002:    /* EN SPACE */
1404              case 0x2003:    /* EM SPACE */
1405              case 0x2004:    /* THREE-PER-EM SPACE */
1406              case 0x2005:    /* FOUR-PER-EM SPACE */
1407              case 0x2006:    /* SIX-PER-EM SPACE */
1408              case 0x2007:    /* FIGURE SPACE */
1409              case 0x2008:    /* PUNCTUATION SPACE */
1410              case 0x2009:    /* THIN SPACE */
1411              case 0x200A:    /* HAIR SPACE */
1412              case 0x202f:    /* NARROW NO-BREAK SPACE */
1413              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1414              case 0x3000:    /* IDEOGRAPHIC SPACE */
1415              OK = TRUE;
1416              break;
1417    
1418              default:
1419              OK = FALSE;
1420              break;
1421            }            }
1422          }  
1423        break;          if (OK == (d == OP_HSPACE))
1424              {
1425              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1426                {
1427                active_count--;           /* Remove non-match possibility */
1428                next_active_state--;
1429                }
1430              count++;
1431              ADD_NEW_DATA(-state_offset, count, 0);
1432              }
1433            }
1434          break;
1435    
1436          /*-----------------------------------------------------------------*/
1437    #ifdef SUPPORT_UCP
1438          case OP_PROP_EXTRA + OP_TYPEQUERY:
1439          case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1440          case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1441          count = 4;
1442          goto QS1;
1443    
1444          case OP_PROP_EXTRA + OP_TYPESTAR:
1445          case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1446          case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1447          count = 0;
1448    
1449          QS1:
1450    
1451          ADD_ACTIVE(state_offset + 4, 0);
1452          if (clen > 0)
1453            {
1454            BOOL OK;
1455            const ucd_record * prop = GET_UCD(c);
1456            switch(code[2])
1457              {
1458              case PT_ANY:
1459              OK = TRUE;
1460              break;
1461    
1462              case PT_LAMP:
1463              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1464                prop->chartype == ucp_Lt;
1465              break;
1466    
1467              case PT_GC:
1468              OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1469              break;
1470    
1471              case PT_PC:
1472              OK = prop->chartype == code[3];
1473              break;
1474    
1475              case PT_SC:
1476              OK = prop->script == code[3];
1477              break;
1478    
1479              /* These are specials for combination cases. */
1480    
1481              case PT_ALNUM:
1482              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1483                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1484              break;
1485    
1486              case PT_SPACE:    /* Perl space */
1487              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1488                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1489              break;
1490    
1491              case PT_PXSPACE:  /* POSIX space */
1492              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1493                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1494                   c == CHAR_FF || c == CHAR_CR;
1495              break;
1496    
1497              case PT_WORD:
1498              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1499                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1500                   c == CHAR_UNDERSCORE;
1501              break;
1502    
1503              /* Should never occur, but keep compilers from grumbling. */
1504    
1505              default:
1506              OK = codevalue != OP_PROP;
1507              break;
1508              }
1509    
1510            if (OK == (d == OP_PROP))
1511              {
1512              if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1513                  codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1514                {
1515                active_count--;           /* Remove non-match possibility */
1516                next_active_state--;
1517                }
1518              ADD_NEW(state_offset + count, 0);
1519              }
1520            }
1521          break;
1522    
1523          /*-----------------------------------------------------------------*/
1524          case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1525          case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1526          case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1527          count = 2;
1528          goto QS2;
1529    
1530          case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1531          case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1532          case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1533          count = 0;
1534    
1535          QS2:
1536    
1537          ADD_ACTIVE(state_offset + 2, 0);
1538          if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1539            {
1540            const uschar *nptr = ptr + clen;
1541            int ncount = 0;
1542            if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1543                codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1544              {
1545              active_count--;           /* Remove non-match possibility */
1546              next_active_state--;
1547              }
1548            while (nptr < end_subject)
1549              {
1550              int nd;
1551              int ndlen = 1;
1552              GETCHARLEN(nd, nptr, ndlen);
1553              if (UCD_CATEGORY(nd) != ucp_M) break;
1554              ncount++;
1555              nptr += ndlen;
1556              }
1557            ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1558            }
1559          break;
1560    #endif
1561    
1562          /*-----------------------------------------------------------------*/
1563          case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1564          case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1565          case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1566          count = 2;
1567          goto QS3;
1568    
1569          case OP_ANYNL_EXTRA + OP_TYPESTAR:
1570          case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1571          case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1572          count = 0;
1573    
1574          QS3:
1575          ADD_ACTIVE(state_offset + 2, 0);
1576          if (clen > 0)
1577            {
1578            int ncount = 0;
1579            switch (c)
1580              {
1581              case 0x000b:
1582              case 0x000c:
1583              case 0x0085:
1584              case 0x2028:
1585              case 0x2029:
1586              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1587              goto ANYNL02;
1588    
1589              case 0x000d:
1590              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1591              /* Fall through */
1592    
1593              ANYNL02:
1594              case 0x000a:
1595              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1596                  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1597                {
1598                active_count--;           /* Remove non-match possibility */
1599                next_active_state--;
1600                }
1601              ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1602              break;
1603    
1604              default:
1605              break;
1606              }
1607            }
1608          break;
1609    
1610          /*-----------------------------------------------------------------*/
1611          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1612          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1613          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1614          count = 2;
1615          goto QS4;
1616    
1617          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1618          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1619          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1620          count = 0;
1621    
1622          QS4:
1623          ADD_ACTIVE(state_offset + 2, 0);
1624          if (clen > 0)
1625            {
1626            BOOL OK;
1627            switch (c)
1628              {
1629              case 0x000a:
1630              case 0x000b:
1631              case 0x000c:
1632              case 0x000d:
1633              case 0x0085:
1634              case 0x2028:
1635              case 0x2029:
1636              OK = TRUE;
1637              break;
1638    
1639              default:
1640              OK = FALSE;
1641              break;
1642              }
1643            if (OK == (d == OP_VSPACE))
1644              {
1645              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1646                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1647                {
1648                active_count--;           /* Remove non-match possibility */
1649                next_active_state--;
1650                }
1651              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1652              }
1653            }
1654          break;
1655    
1656          /*-----------------------------------------------------------------*/
1657          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1658          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1659          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1660          count = 2;
1661          goto QS5;
1662    
1663          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1664          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1665          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1666          count = 0;
1667    
1668          QS5:
1669          ADD_ACTIVE(state_offset + 2, 0);
1670          if (clen > 0)
1671            {
1672            BOOL OK;
1673            switch (c)
1674              {
1675              case 0x09:      /* HT */
1676              case 0x20:      /* SPACE */
1677              case 0xa0:      /* NBSP */
1678              case 0x1680:    /* OGHAM SPACE MARK */
1679              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1680              case 0x2000:    /* EN QUAD */
1681              case 0x2001:    /* EM QUAD */
1682              case 0x2002:    /* EN SPACE */
1683              case 0x2003:    /* EM SPACE */
1684              case 0x2004:    /* THREE-PER-EM SPACE */
1685              case 0x2005:    /* FOUR-PER-EM SPACE */
1686              case 0x2006:    /* SIX-PER-EM SPACE */
1687              case 0x2007:    /* FIGURE SPACE */
1688              case 0x2008:    /* PUNCTUATION SPACE */
1689              case 0x2009:    /* THIN SPACE */
1690              case 0x200A:    /* HAIR SPACE */
1691              case 0x202f:    /* NARROW NO-BREAK SPACE */
1692              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1693              case 0x3000:    /* IDEOGRAPHIC SPACE */
1694              OK = TRUE;
1695              break;
1696    
1697              default:
1698              OK = FALSE;
1699              break;
1700              }
1701    
1702            if (OK == (d == OP_HSPACE))
1703              {
1704              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1705                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1706                {
1707                active_count--;           /* Remove non-match possibility */
1708                next_active_state--;
1709                }
1710              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1711              }
1712            }
1713          break;
1714    
1715          /*-----------------------------------------------------------------*/
1716    #ifdef SUPPORT_UCP
1717          case OP_PROP_EXTRA + OP_TYPEEXACT:
1718          case OP_PROP_EXTRA + OP_TYPEUPTO:
1719          case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1720          case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1721          if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1722            { ADD_ACTIVE(state_offset + 6, 0); }
1723          count = current_state->count;  /* Number already matched */
1724          if (clen > 0)
1725            {
1726            BOOL OK;
1727            const ucd_record * prop = GET_UCD(c);
1728            switch(code[4])
1729              {
1730              case PT_ANY:
1731              OK = TRUE;
1732              break;
1733    
1734              case PT_LAMP:
1735              OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1736                prop->chartype == ucp_Lt;
1737              break;
1738    
1739              case PT_GC:
1740              OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1741              break;
1742    
1743              case PT_PC:
1744              OK = prop->chartype == code[5];
1745              break;
1746    
1747              case PT_SC:
1748              OK = prop->script == code[5];
1749              break;
1750    
1751              /* These are specials for combination cases. */
1752    
1753              case PT_ALNUM:
1754              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1755                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1756              break;
1757    
1758              case PT_SPACE:    /* Perl space */
1759              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1760                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1761              break;
1762    
1763              case PT_PXSPACE:  /* POSIX space */
1764              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1765                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1766                   c == CHAR_FF || c == CHAR_CR;
1767              break;
1768    
1769              case PT_WORD:
1770              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1771                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1772                   c == CHAR_UNDERSCORE;
1773              break;
1774    
1775              /* Should never occur, but keep compilers from grumbling. */
1776    
1777              default:
1778              OK = codevalue != OP_PROP;
1779              break;
1780              }
1781    
1782            if (OK == (d == OP_PROP))
1783              {
1784              if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1785                {
1786                active_count--;           /* Remove non-match possibility */
1787                next_active_state--;
1788                }
1789              if (++count >= GET2(code, 1))
1790                { ADD_NEW(state_offset + 6, 0); }
1791              else
1792                { ADD_NEW(state_offset, count); }
1793              }
1794            }
1795          break;
1796    
1797        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1798        case OP_EXTUNI_EXTRA + OP_TYPEEXACT:        case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1799        case OP_EXTUNI_EXTRA + OP_TYPEUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1800        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1801          case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1802        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1803          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1804        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1805        if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1806          {          {
1807          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1808          int ncount = 0;          int ncount = 0;
1809            if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1810              {
1811              active_count--;           /* Remove non-match possibility */
1812              next_active_state--;
1813              }
1814          while (nptr < end_subject)          while (nptr < end_subject)
1815            {            {
1816            int nd;            int nd;
1817            int ndlen = 1;            int ndlen = 1;
1818            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1819            if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1820            ncount++;            ncount++;
1821            nptr += ndlen;            nptr += ndlen;
1822            }            }
# Line 997  for (;;) Line 1826  for (;;)
1826            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1827          }          }
1828        break;        break;
1829    #endif
1830    
1831          /*-----------------------------------------------------------------*/
1832          case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1833          case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1834          case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1835          case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1836          if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1837            { ADD_ACTIVE(state_offset + 4, 0); }
1838          count = current_state->count;  /* Number already matched */
1839          if (clen > 0)
1840            {
1841            int ncount = 0;
1842            switch (c)
1843              {
1844              case 0x000b:
1845              case 0x000c:
1846              case 0x0085:
1847              case 0x2028:
1848              case 0x2029:
1849              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1850              goto ANYNL03;
1851    
1852              case 0x000d:
1853              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1854              /* Fall through */
1855    
1856              ANYNL03:
1857              case 0x000a:
1858              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1859                {
1860                active_count--;           /* Remove non-match possibility */
1861                next_active_state--;
1862                }
1863              if (++count >= GET2(code, 1))
1864                { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1865              else
1866                { ADD_NEW_DATA(-state_offset, count, ncount); }
1867              break;
1868    
1869              default:
1870              break;
1871              }
1872            }
1873          break;
1874    
1875          /*-----------------------------------------------------------------*/
1876          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1877          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1878          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1879          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1880          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1881            { ADD_ACTIVE(state_offset + 4, 0); }
1882          count = current_state->count;  /* Number already matched */
1883          if (clen > 0)
1884            {
1885            BOOL OK;
1886            switch (c)
1887              {
1888              case 0x000a:
1889              case 0x000b:
1890              case 0x000c:
1891              case 0x000d:
1892              case 0x0085:
1893              case 0x2028:
1894              case 0x2029:
1895              OK = TRUE;
1896              break;
1897    
1898              default:
1899              OK = FALSE;
1900              }
1901    
1902            if (OK == (d == OP_VSPACE))
1903              {
1904              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1905                {
1906                active_count--;           /* Remove non-match possibility */
1907                next_active_state--;
1908                }
1909              if (++count >= GET2(code, 1))
1910                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1911              else
1912                { ADD_NEW_DATA(-state_offset, count, 0); }
1913              }
1914            }
1915          break;
1916    
1917          /*-----------------------------------------------------------------*/
1918          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1919          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1920          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1921          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1922          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1923            { ADD_ACTIVE(state_offset + 4, 0); }
1924          count = current_state->count;  /* Number already matched */
1925          if (clen > 0)
1926            {
1927            BOOL OK;
1928            switch (c)
1929              {
1930              case 0x09:      /* HT */
1931              case 0x20:      /* SPACE */
1932              case 0xa0:      /* NBSP */
1933              case 0x1680:    /* OGHAM SPACE MARK */
1934              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1935              case 0x2000:    /* EN QUAD */
1936              case 0x2001:    /* EM QUAD */
1937              case 0x2002:    /* EN SPACE */
1938              case 0x2003:    /* EM SPACE */
1939              case 0x2004:    /* THREE-PER-EM SPACE */
1940              case 0x2005:    /* FOUR-PER-EM SPACE */
1941              case 0x2006:    /* SIX-PER-EM SPACE */
1942              case 0x2007:    /* FIGURE SPACE */
1943              case 0x2008:    /* PUNCTUATION SPACE */
1944              case 0x2009:    /* THIN SPACE */
1945              case 0x200A:    /* HAIR SPACE */
1946              case 0x202f:    /* NARROW NO-BREAK SPACE */
1947              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1948              case 0x3000:    /* IDEOGRAPHIC SPACE */
1949              OK = TRUE;
1950              break;
1951    
1952              default:
1953              OK = FALSE;
1954              break;
1955              }
1956    
1957            if (OK == (d == OP_HSPACE))
1958              {
1959              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1960                {
1961                active_count--;           /* Remove non-match possibility */
1962                next_active_state--;
1963                }
1964              if (++count >= GET2(code, 1))
1965                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1966              else
1967                { ADD_NEW_DATA(-state_offset, count, 0); }
1968              }
1969            }
1970          break;
1971    
1972  /* ========================================================================== */  /* ========================================================================== */
1973        /* These opcodes are followed by a character that is usually compared        /* These opcodes are followed by a character that is usually compared
# Line 1010  for (;;) Line 1981  for (;;)
1981        break;        break;
1982    
1983        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1984        case OP_CHARNC:        case OP_CHARI:
1985        if (clen == 0) break;        if (clen == 0) break;
1986    
1987  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1018  for (;;) Line 1989  for (;;)
1989          {          {
1990          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1991            {            {
1992              unsigned int othercase;
1993            if (c < 128) othercase = fcc[c]; else            if (c < 128) othercase = fcc[c]; else
1994    
1995            /* If we have Unicode property support, we can use it to test the            /* If we have Unicode property support, we can use it to test the
1996            other case of the character, if there is one. The result of            other case of the character. */
           _pcre_ucp_findchar() is < 0 if the char isn't found, and othercase is  
           returned as zero if there isn't another case. */  
1997    
1998  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1999            if (_pcre_ucp_findchar(c, &chartype, &othercase) < 0)            othercase = UCD_OTHERCASE(c);
2000    #else
2001              othercase = NOTACHAR;
2002  #endif  #endif
             othercase = -1;  
2003    
2004            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2005            }            }
# Line 1050  for (;;) Line 2021  for (;;)
2021        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2022    
2023        case OP_EXTUNI:        case OP_EXTUNI:
2024        if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2025          {          {
2026          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
2027          int ncount = 0;          int ncount = 0;
# Line 1058  for (;;) Line 2029  for (;;)
2029            {            {
2030            int nclen = 1;            int nclen = 1;
2031            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
2032            if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
2033            ncount++;            ncount++;
2034            nptr += nclen;            nptr += nclen;
2035            }            }
# Line 1068  for (;;) Line 2039  for (;;)
2039  #endif  #endif
2040    
2041        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2042        /* Match a negated single character. This is only used for one-byte        /* This is tricky like EXTUNI because it too can match more than one
2043        characters, that is, we know that d < 256. The character we are        character (when CR is followed by LF). In this case, set up a negative
2044        checking (c) can be multibyte. */        state to wait for one character to pass before continuing. */
2045    
2046          case OP_ANYNL:
2047          if (clen > 0) switch(c)
2048            {
2049            case 0x000b:
2050            case 0x000c:
2051            case 0x0085:
2052            case 0x2028:
2053            case 0x2029:
2054            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2055    
2056            case 0x000a:
2057            ADD_NEW(state_offset + 1, 0);
2058            break;
2059    
2060        case OP_NOT:          case 0x000d:
2061        if (clen > 0)          if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2062              {
2063              ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2064              }
2065            else
2066              {
2067              ADD_NEW(state_offset + 1, 0);
2068              }
2069            break;
2070            }
2071          break;
2072    
2073          /*-----------------------------------------------------------------*/
2074          case OP_NOT_VSPACE:
2075          if (clen > 0) switch(c)
2076            {
2077            case 0x000a:
2078            case 0x000b:
2079            case 0x000c:
2080            case 0x000d:
2081            case 0x0085:
2082            case 0x2028:
2083            case 0x2029:
2084            break;
2085    
2086            default:
2087            ADD_NEW(state_offset + 1, 0);
2088            break;
2089            }
2090          break;
2091    
2092          /*-----------------------------------------------------------------*/
2093          case OP_VSPACE:
2094          if (clen > 0) switch(c)
2095            {
2096            case 0x000a:
2097            case 0x000b:
2098            case 0x000c:
2099            case 0x000d:
2100            case 0x0085:
2101            case 0x2028:
2102            case 0x2029:
2103            ADD_NEW(state_offset + 1, 0);
2104            break;
2105    
2106            default: break;
2107            }
2108          break;
2109    
2110          /*-----------------------------------------------------------------*/
2111          case OP_NOT_HSPACE:
2112          if (clen > 0) switch(c)
2113          {          {
2114          int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;          case 0x09:      /* HT */
2115          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }          case 0x20:      /* SPACE */
2116            case 0xa0:      /* NBSP */
2117            case 0x1680:    /* OGHAM SPACE MARK */
2118            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2119            case 0x2000:    /* EN QUAD */
2120            case 0x2001:    /* EM QUAD */
2121            case 0x2002:    /* EN SPACE */
2122            case 0x2003:    /* EM SPACE */
2123            case 0x2004:    /* THREE-PER-EM SPACE */
2124            case 0x2005:    /* FOUR-PER-EM SPACE */
2125            case 0x2006:    /* SIX-PER-EM SPACE */
2126            case 0x2007:    /* FIGURE SPACE */
2127            case 0x2008:    /* PUNCTUATION SPACE */
2128            case 0x2009:    /* THIN SPACE */
2129            case 0x200A:    /* HAIR SPACE */
2130            case 0x202f:    /* NARROW NO-BREAK SPACE */
2131            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2132            case 0x3000:    /* IDEOGRAPHIC SPACE */
2133            break;
2134    
2135            default:
2136            ADD_NEW(state_offset + 1, 0);
2137            break;
2138          }          }
2139        break;        break;
2140    
2141        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2142          case OP_HSPACE:
2143          if (clen > 0) switch(c)
2144            {
2145            case 0x09:      /* HT */
2146            case 0x20:      /* SPACE */
2147            case 0xa0:      /* NBSP */
2148            case 0x1680:    /* OGHAM SPACE MARK */
2149            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2150            case 0x2000:    /* EN QUAD */
2151            case 0x2001:    /* EM QUAD */
2152            case 0x2002:    /* EN SPACE */
2153            case 0x2003:    /* EM SPACE */
2154            case 0x2004:    /* THREE-PER-EM SPACE */
2155            case 0x2005:    /* FOUR-PER-EM SPACE */
2156            case 0x2006:    /* SIX-PER-EM SPACE */
2157            case 0x2007:    /* FIGURE SPACE */
2158            case 0x2008:    /* PUNCTUATION SPACE */
2159            case 0x2009:    /* THIN SPACE */
2160            case 0x200A:    /* HAIR SPACE */
2161            case 0x202f:    /* NARROW NO-BREAK SPACE */
2162            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2163            case 0x3000:    /* IDEOGRAPHIC SPACE */
2164            ADD_NEW(state_offset + 1, 0);
2165            break;
2166            }
2167          break;
2168    
2169          /*-----------------------------------------------------------------*/
2170          /* Match a negated single character casefully. This is only used for
2171          one-byte characters, that is, we know that d < 256. The character we are
2172          checking (c) can be multibyte. */
2173    
2174          case OP_NOT:
2175          if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2176          break;
2177    
2178          /*-----------------------------------------------------------------*/
2179          /* Match a negated single character caselessly. This is only used for
2180          one-byte characters, that is, we know that d < 256. The character we are
2181          checking (c) can be multibyte. */
2182    
2183          case OP_NOTI:
2184          if (clen > 0 && c != d && c != fcc[d])
2185            { ADD_NEW(state_offset + dlen + 1, 0); }
2186          break;
2187    
2188          /*-----------------------------------------------------------------*/
2189          case OP_PLUSI:
2190          case OP_MINPLUSI:
2191          case OP_POSPLUSI:
2192          case OP_NOTPLUSI:
2193          case OP_NOTMINPLUSI:
2194          case OP_NOTPOSPLUSI:
2195          caseless = TRUE;
2196          codevalue -= OP_STARI - OP_STAR;
2197    
2198          /* Fall through */
2199        case OP_PLUS:        case OP_PLUS:
2200        case OP_MINPLUS:        case OP_MINPLUS:
2201          case OP_POSPLUS:
2202        case OP_NOTPLUS:        case OP_NOTPLUS:
2203        case OP_NOTMINPLUS:        case OP_NOTMINPLUS:
2204          case OP_NOTPOSPLUS:
2205        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
2206        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2207        if (clen > 0)        if (clen > 0)
2208          {          {
2209          int otherd = -1;          unsigned int otherd = NOTACHAR;
2210          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2211            {            {
2212  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2213            if (utf8 && c >= 128)            if (utf8 && d >= 128)
2214              {              {
2215  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2216              if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;              otherd = UCD_OTHERCASE(d);
2217  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2218              }              }
2219            else            else
# Line 1104  for (;;) Line 2221  for (;;)
2221            otherd = fcc[d];            otherd = fcc[d];
2222            }            }
2223          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2224            { count++; ADD_NEW(state_offset, count); }            {
2225              if (count > 0 &&
2226                  (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2227                {
2228                active_count--;             /* Remove non-match possibility */
2229                next_active_state--;
2230                }
2231              count++;
2232              ADD_NEW(state_offset, count);
2233              }
2234          }          }
2235        break;        break;
2236    
2237        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2238          case OP_QUERYI:
2239          case OP_MINQUERYI:
2240          case OP_POSQUERYI:
2241          case OP_NOTQUERYI:
2242          case OP_NOTMINQUERYI:
2243          case OP_NOTPOSQUERYI:
2244          caseless = TRUE;
2245          codevalue -= OP_STARI - OP_STAR;
2246          /* Fall through */
2247        case OP_QUERY:        case OP_QUERY:
2248        case OP_MINQUERY:        case OP_MINQUERY:
2249          case OP_POSQUERY:
2250        case OP_NOTQUERY:        case OP_NOTQUERY:
2251        case OP_NOTMINQUERY:        case OP_NOTMINQUERY:
2252          case OP_NOTPOSQUERY:
2253        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2254        if (clen > 0)        if (clen > 0)
2255          {          {
2256          int otherd = -1;          unsigned int otherd = NOTACHAR;
2257          if ((ims && PCRE_CASELESS) != 0)          if (caseless)
2258            {            {
2259  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2260            if (utf8 && c >= 128)            if (utf8 && d >= 128)
2261              {              {
2262  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2263              if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;              otherd = UCD_OTHERCASE(d);
2264  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2265              }              }
2266            else            else
# Line 1131  for (;;) Line 2268  for (;;)
2268            otherd = fcc[d];            otherd = fcc[d];
2269            }            }
2270          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2271            { ADD_NEW(state_offset + dlen + 1, 0); }            {
2272              if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2273                {
2274                active_count--;            /* Remove non-match possibility */
2275                next_active_state--;
2276                }
2277              ADD_NEW(state_offset + dlen + 1, 0);
2278              }
2279          }          }
2280        break;        break;
2281    
2282        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2283          case OP_STARI:
2284          case OP_MINSTARI:
2285          case OP_POSSTARI:
2286          case OP_NOTSTARI:
2287          case OP_NOTMINSTARI:
2288          case OP_NOTPOSSTARI:
2289          caseless = TRUE;
2290          codevalue -= OP_STARI - OP_STAR;
2291          /* Fall through */
2292        case OP_STAR:        case OP_STAR:
2293        case OP_MINSTAR:        case OP_MINSTAR:
2294          case OP_POSSTAR:
2295        case OP_NOTSTAR:        case OP_NOTSTAR:
2296        case OP_NOTMINSTAR:        case OP_NOTMINSTAR:
2297          case OP_NOTPOSSTAR:
2298        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2299        if (clen > 0)        if (clen > 0)
2300          {          {
2301          int otherd = -1;          unsigned int otherd = NOTACHAR;
2302          if ((ims && PCRE_CASELESS) != 0)          if (caseless)
2303            {            {
2304  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2305            if (utf8 && c >= 128)            if (utf8 && d >= 128)
2306              {              {
2307  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2308              if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;              otherd = UCD_OTHERCASE(d);
2309  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2310              }              }
2311            else            else
# Line 1158  for (;;) Line 2313  for (;;)
2313            otherd = fcc[d];            otherd = fcc[d];
2314            }            }
2315          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2316            { ADD_NEW(state_offset, 0); }            {
2317              if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2318                {
2319                active_count--;            /* Remove non-match possibility */
2320                next_active_state--;
2321                }
2322              ADD_NEW(state_offset, 0);
2323              }
2324          }          }
2325        break;        break;
2326    
2327        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2328          case OP_EXACTI:
2329          case OP_NOTEXACTI:
2330          caseless = TRUE;
2331          codevalue -= OP_STARI - OP_STAR;
2332          /* Fall through */
2333        case OP_EXACT:        case OP_EXACT:
2334          case OP_NOTEXACT:
2335          count = current_state->count;  /* Number already matched */
2336          if (clen > 0)
2337            {
2338            unsigned int otherd = NOTACHAR;
2339            if (caseless)
2340              {
2341    #ifdef SUPPORT_UTF8
2342              if (utf8 && d >= 128)
2343                {
2344    #ifdef SUPPORT_UCP
2345                otherd = UCD_OTHERCASE(d);
2346    #endif  /* SUPPORT_UCP */
2347                }
2348              else
2349    #endif  /* SUPPORT_UTF8 */
2350              otherd = fcc[d];
2351              }
2352            if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2353              {
2354              if (++count >= GET2(code, 1))
2355                { ADD_NEW(state_offset + dlen + 3, 0); }
2356              else
2357                { ADD_NEW(state_offset, count); }
2358              }
2359            }
2360          break;
2361    
2362          /*-----------------------------------------------------------------*/
2363          case OP_UPTOI:
2364          case OP_MINUPTOI:
2365          case OP_POSUPTOI:
2366          case OP_NOTUPTOI:
2367          case OP_NOTMINUPTOI:
2368          case OP_NOTPOSUPTOI:
2369          caseless = TRUE;
2370          codevalue -= OP_STARI - OP_STAR;
2371          /* Fall through */
2372        case OP_UPTO:        case OP_UPTO:
2373        case OP_MINUPTO:        case OP_MINUPTO:
2374        case OP_NOTEXACT:        case OP_POSUPTO:
2375        case OP_NOTUPTO:        case OP_NOTUPTO:
2376        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2377        if (codevalue != OP_EXACT && codevalue != OP_NOTEXACT)        case OP_NOTPOSUPTO:
2378          { ADD_ACTIVE(state_offset + dlen + 3, 0); }        ADD_ACTIVE(state_offset + dlen + 3, 0);
2379        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2380        if (clen > 0)        if (clen > 0)
2381          {          {
2382          int otherd = -1;          unsigned int otherd = NOTACHAR;
2383          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2384            {            {
2385  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2386            if (utf8 && c >= 128)            if (utf8 && d >= 128)
2387              {              {
2388  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2389              if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;              otherd = UCD_OTHERCASE(d);
2390  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2391              }              }
2392            else            else
# Line 1190  for (;;) Line 2395  for (;;)
2395            }            }
2396          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2397            {            {
2398              if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2399                {
2400                active_count--;             /* Remove non-match possibility */
2401                next_active_state--;
2402                }
2403            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2404              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 3, 0); }
2405            else            else
# Line 1237  for (;;) Line 2447  for (;;)
2447          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2448          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2449    
2450          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2451    
2452          switch (*ecode)          switch (*ecode)
2453            {            {
# Line 1267  for (;;) Line 2477  for (;;)
2477              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 5, 0); }
2478            if (isinclass)            if (isinclass)
2479              {              {
2480              if (++count >= GET2(ecode, 3))              int max = GET2(ecode, 3);
2481                if (++count >= max && max != 0)   /* Max 0 => no limit */
2482                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 5, 0); }
2483              else              else
2484                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
# Line 1283  for (;;) Line 2494  for (;;)
2494    
2495  /* ========================================================================== */  /* ========================================================================== */
2496        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2497        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2498          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2499          though the other "backtracking verbs" are not supported. */
2500    
2501          case OP_FAIL:
2502          forced_fail++;    /* Count FAILs for multiple states */
2503          break;
2504    
2505        case OP_ASSERT:        case OP_ASSERT:
2506        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1301  for (;;) Line 2518  for (;;)
2518            md,                                   /* static match data */            md,                                   /* static match data */
2519            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2520            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2521            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2522            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2523            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2524            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2525            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2526            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2527    
2528            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2529          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2530              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2531          }          }
2532        break;        break;
2533    
2534        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2535        case OP_COND:        case OP_COND:
2536          case OP_SCOND:
2537          {          {
2538          int local_offsets[1000];          int local_offsets[1000];
2539          int local_workspace[1000];          int local_workspace[1000];
2540          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2541            int condcode;
2542    
2543            /* Because of the way auto-callout works during compile, a callout item
2544            is inserted between OP_COND and an assertion condition. This does not
2545            happen for the other conditions. */
2546    
2547            if (code[LINK_SIZE+1] == OP_CALLOUT)
2548              {
2549              rrc = 0;
2550              if (pcre_callout != NULL)
2551                {
2552                pcre_callout_block cb;
2553                cb.version          = 1;   /* Version 1 of the callout block */
2554                cb.callout_number   = code[LINK_SIZE+2];
2555                cb.offset_vector    = offsets;
2556                cb.subject          = (PCRE_SPTR)start_subject;
2557                cb.subject_length   = (int)(end_subject - start_subject);
2558                cb.start_match      = (int)(current_subject - start_subject);
2559                cb.current_position = (int)(ptr - start_subject);
2560                cb.pattern_position = GET(code, LINK_SIZE + 3);
2561                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2562                cb.capture_top      = 1;
2563                cb.capture_last     = -1;
2564                cb.callout_data     = md->callout_data;
2565                cb.mark             = NULL;   /* No (*MARK) support */
2566                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2567                }
2568              if (rrc > 0) break;                      /* Fail this thread */
2569              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2570              }
2571    
2572            condcode = code[LINK_SIZE+1];
2573    
2574            /* Back reference conditions are not supported */
2575    
2576            if (condcode == OP_CREF || condcode == OP_NCREF)
2577              return PCRE_ERROR_DFA_UCOND;
2578    
2579            /* The DEFINE condition is always false */
2580    
2581          /* The only supported version of OP_CREF is for the value 0xffff, which          if (condcode == OP_DEF)
2582          means "test if in a recursion". */            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2583    
2584          if (condcode == OP_CREF)          /* The only supported version of OP_RREF is for the value RREF_ANY,
2585            which means "test if in any recursion". We can't test for specifically
2586            recursed groups. */
2587    
2588            else if (condcode == OP_RREF || condcode == OP_NRREF)
2589            {            {
2590            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2591            if (value != 0xffff) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2592            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (md->recursive != NULL)
2593              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2594              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2595            }            }
2596    
2597          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1347  for (;;) Line 2608  for (;;)
2608              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2609              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2610              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2611              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2612              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2613              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2614              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2615              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2616              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2617    
2618              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2619            if ((rc >= 0) ==            if ((rc >= 0) ==
2620                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2621              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2622            else            else
2623              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2624            }            }
2625          }          }
2626        break;        break;
# Line 1368  for (;;) Line 2628  for (;;)
2628        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2629        case OP_RECURSE:        case OP_RECURSE:
2630          {          {
2631            dfa_recursion_info *ri;
2632          int local_offsets[1000];          int local_offsets[1000];
2633          int local_workspace[1000];          int local_workspace[1000];
2634            const uschar *callpat = start_code + GET(code, 1);
2635            int recno = (callpat == md->start_code)? 0 :
2636              GET2(callpat, 1 + LINK_SIZE);
2637          int rc;          int rc;
2638    
2639          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2640            recursing + 1));  
2641            /* Check for repeating a recursion without advancing the subject
2642            pointer. This should catch convoluted mutual recursions. (Some simple
2643            cases are caught at compile time.) */
2644    
2645            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2646              if (recno == ri->group_num && ptr == ri->subject_position)
2647                return PCRE_ERROR_RECURSELOOP;
2648    
2649            /* Remember this recursion and where we started it so as to
2650            catch infinite loops. */
2651    
2652            new_recursive.group_num = recno;
2653            new_recursive.subject_position = ptr;
2654            new_recursive.prevrec = md->recursive;
2655            md->recursive = &new_recursive;
2656    
2657          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2658            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2659            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2660            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2661            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2662            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2663            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2664            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2665            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2666            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing + 1);                       /* regex recurse level */  
2667    
2668          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2669            recursing + 1, rc));  
2670            DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2671              rc));
2672    
2673          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2674    
# Line 1422  for (;;) Line 2701  for (;;)
2701        break;        break;
2702    
2703        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2704          case OP_BRAPOS:
2705          case OP_SBRAPOS:
2706          case OP_CBRAPOS:
2707          case OP_SCBRAPOS:
2708          case OP_BRAPOSZERO:
2709            {
2710            int charcount, matched_count;
2711            const uschar *local_ptr = ptr;
2712            BOOL allow_zero;
2713    
2714            if (codevalue == OP_BRAPOSZERO)
2715              {
2716              allow_zero = TRUE;
2717              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2718              }
2719            else allow_zero = FALSE;
2720    
2721            /* Loop to match the subpattern as many times as possible as if it were
2722            a complete pattern. */
2723    
2724            for (matched_count = 0;; matched_count++)
2725              {
2726              int local_offsets[2];
2727              int local_workspace[1000];
2728    
2729              int rc = internal_dfa_exec(
2730                md,                                   /* fixed match data */
2731                code,                                 /* this subexpression's code */
2732                local_ptr,                            /* where we currently are */
2733                (int)(ptr - start_subject),           /* start offset */
2734                local_offsets,                        /* offset vector */
2735                sizeof(local_offsets)/sizeof(int),    /* size of same */
2736                local_workspace,                      /* workspace vector */
2737                sizeof(local_workspace)/sizeof(int),  /* size of same */
2738                rlevel);                              /* function recursion level */
2739    
2740              /* Failed to match */
2741    
2742              if (rc < 0)
2743                {
2744                if (rc != PCRE_ERROR_NOMATCH) return rc;
2745                break;
2746                }
2747    
2748              /* Matched: break the loop if zero characters matched. */
2749    
2750              charcount = local_offsets[1] - local_offsets[0];
2751              if (charcount == 0) break;
2752              local_ptr += charcount;    /* Advance temporary position ptr */
2753              }
2754    
2755            /* At this point we have matched the subpattern matched_count
2756            times, and local_ptr is pointing to the character after the end of the
2757            last match. */
2758    
2759            if (matched_count > 0 || allow_zero)
2760              {
2761              const uschar *end_subpattern = code;
2762              int next_state_offset;
2763    
2764              do { end_subpattern += GET(end_subpattern, 1); }
2765                while (*end_subpattern == OP_ALT);
2766              next_state_offset =
2767                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2768    
2769              /* Optimization: if there are no more active states, and there
2770              are no new states yet set up, then skip over the subject string
2771              right here, to save looping. Otherwise, set up the new state to swing
2772              into action when the end of the matched substring is reached. */
2773    
2774              if (i + 1 >= active_count && new_count == 0)
2775                {
2776                ptr = local_ptr;
2777                clen = 0;
2778                ADD_NEW(next_state_offset, 0);
2779                }
2780              else
2781                {
2782                const uschar *p = ptr;
2783                const uschar *pp = local_ptr;
2784                charcount = pp - p;
2785                while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2786                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2787                }
2788              }
2789            }
2790          break;
2791    
2792          /*-----------------------------------------------------------------*/
2793        case OP_ONCE:        case OP_ONCE:
2794          case OP_ONCE_NC:
2795          {          {
2796          int local_offsets[2];          int local_offsets[2];
2797          int local_workspace[1000];          int local_workspace[1000];
# Line 1431  for (;;) Line 2800  for (;;)
2800            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2801            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2802            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2803            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2804            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2805            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2806            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2807            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2808            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2809    
2810          if (rc >= 0)          if (rc >= 0)
2811            {            {
# Line 1448  for (;;) Line 2815  for (;;)
2815    
2816            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2817              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2818            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2819                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2820    
2821            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2822            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 1456  for (;;) Line 2824  for (;;)
2824    
2825            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2826                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2827              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2828    
2829            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2830            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 1471  for (;;) Line 2839  for (;;)
2839            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2840            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2841            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2842            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2843    
2844            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2845              {              {
# Line 1501  for (;;) Line 2869  for (;;)
2869              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2870                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2871              }              }
   
2872            }            }
2873          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2874          }          }
# Line 1512  for (;;) Line 2879  for (;;)
2879        /* Handle callouts */        /* Handle callouts */
2880    
2881        case OP_CALLOUT:        case OP_CALLOUT:
2882          rrc = 0;
2883        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2884          {          {
         int rrc;  
2885          pcre_callout_block cb;          pcre_callout_block cb;
2886          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2887          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2888          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2889          cb.subject          = (char *)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2890          cb.subject_length   = end_subject - start_subject;          cb.subject_length   = (int)(end_subject - start_subject);
2891          cb.start_match      = current_subject - start_subject;          cb.start_match      = (int)(current_subject - start_subject);
2892          cb.current_position = ptr - start_subject;          cb.current_position = (int)(ptr - start_subject);
2893          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
2894          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
2895          cb.capture_top      = 1;          cb.capture_top      = 1;
2896          cb.capture_last     = -1;          cb.capture_last     = -1;
2897          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2898            cb.mark             = NULL;   /* No (*MARK) support */
2899          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2900          }          }
2901          if (rrc == 0)
2902            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2903        break;        break;
2904    
2905    
# Line 1546  for (;;) Line 2915  for (;;)
2915    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2916    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2917    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2918    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
2919    
2920      The "forced_fail" variable counts the number of (*F) encountered for the
2921      character. If it is equal to the original active_count (saved in
2922      workspace[1]) it means that (*F) was found on every active state. In this
2923      case we don't want to give a partial match.
2924    
2925      The "could_continue" variable is true if a state could have continued but
2926      for the fact that the end of the subject was reached. */
2927    
2928    if (new_count <= 0)    if (new_count <= 0)
2929      {      {
2930      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2931          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on */
2932          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2933            (                                            /* either... */
2934            (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2935            ||                                           /* or... */
2936            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2937             match_count < 0)                            /* no matches */
2938            ) &&                                         /* And... */
2939          ptr >= end_subject &&                  /* Reached end of subject */          ptr >= end_subject &&                  /* Reached end of subject */
2940          ptr > current_subject)                 /* Matched non-empty string */          ptr > md->start_used_ptr)              /* Inspected non-empty string */
2941        {        {
2942        if (offsetcount >= 2)        if (offsetcount >= 2)
2943          {          {
2944          offsets[0] = current_subject - start_subject;          offsets[0] = (int)(md->start_used_ptr - start_subject);
2945          offsets[1] = end_subject - start_subject;          offsets[1] = (int)(end_subject - start_subject);
2946          }          }
2947        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
2948        }        }
# Line 1567  for (;;) Line 2950  for (;;)
2950      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2951        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2952        rlevel*2-2, SP));        rlevel*2-2, SP));
2953      return match_count;      break;        /* In effect, "return", but see the comment below */
2954      }      }
2955    
2956    /* One or more states are active for the next character. */    /* One or more states are active for the next character. */
# Line 1575  for (;;) Line 2958  for (;;)
2958    ptr += clen;    /* Advance to next subject character */    ptr += clen;    /* Advance to next subject character */
2959    }               /* Loop to move along the subject string */    }               /* Loop to move along the subject string */
2960    
2961  /* Control never gets here, but we must keep the compiler happy. */  /* Control gets here from "break" a few lines above. We do it this way because
2962    if we use "return" above, we have compiler trouble. Some compilers warn if
2963    there's nothing here because they think the function doesn't return a value. On
2964    the other hand, if we put a dummy statement here, some more clever compilers
2965    complain that it can't be reached. Sigh. */
2966    
2967  DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n"  return match_count;
   "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP));  
 return PCRE_ERROR_NOMATCH;  
2968  }  }
2969    
2970    
# Line 1595  is not anchored. Line 2980  is not anchored.
2980    
2981  Arguments:  Arguments:
2982    argument_re     points to the compiled expression    argument_re     points to the compiled expression
2983    extra_data      points to extra data or is NULL (not currently used)    extra_data      points to extra data or is NULL
2984    subject         points to the subject string    subject         points to the subject string
2985    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
2986    start_offset    where to start in the subject string    start_offset    where to start in the subject string
# Line 1611  Returns:          > 0 => number of match Line 2996  Returns:          > 0 => number of match
2996                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2997  */  */
2998    
2999  PCRE_EXPORT int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3000  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3001    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3002    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3003  {  {
3004  real_pcre *re = (real_pcre *)argument_re;  real_pcre *re = (real_pcre *)argument_re;
3005  dfa_match_data match_block;  dfa_match_data match_block;
3006    dfa_match_data *md = &match_block;
3007  BOOL utf8, anchored, startline, firstline;  BOOL utf8, anchored, startline, firstline;
3008  const uschar *current_subject, *end_subject, *lcc;  const uschar *current_subject, *end_subject, *lcc;
3009    
# Line 1632  BOOL req_byte_caseless = FALSE; Line 3018  BOOL req_byte_caseless = FALSE;
3018  int first_byte = -1;  int first_byte = -1;
3019  int req_byte = -1;  int req_byte = -1;
3020  int req_byte2 = -1;  int req_byte2 = -1;
3021    int newline;
3022    
3023  /* Plausibility checks */  /* Plausibility checks */
3024    
# Line 1640  if (re == NULL || subject == NULL || wor Line 3027  if (re == NULL || subject == NULL || wor
3027     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3028  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3029  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3030    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3031    
3032  /* We need to find the pointer to any study data before we test for byte  /* We need to find the pointer to any study data before we test for byte
3033  flipping, so we scan the extra_data block first. This may set two fields in the  flipping, so we scan the extra_data block first. This may set two fields in the
3034  match block, so we must initialize them beforehand. However, the other fields  match block, so we must initialize them beforehand. However, the other fields
3035  in the match block must not be set until after the byte flipping. */  in the match block must not be set until after the byte flipping. */
3036    
3037  match_block.tables = re->tables;  md->tables = re->tables;
3038  match_block.callout_data = NULL;  md->callout_data = NULL;
3039    
3040  if (extra_data != NULL)  if (extra_data != NULL)
3041    {    {
# Line 1655  if (extra_data != NULL) Line 3043  if (extra_data != NULL)
3043    if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)    if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3044      study = (const pcre_study_data *)extra_data->study_data;      study = (const pcre_study_data *)extra_data->study_data;
3045    if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;    if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3046      if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3047        return PCRE_ERROR_DFA_UMLIMIT;
3048    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3049      match_block.callout_data = extra_data->callout_data;      md->callout_data = extra_data->callout_data;
3050    if ((flags & PCRE_EXTRA_TABLES) != 0)    if ((flags & PCRE_EXTRA_TABLES) != 0)
3051      match_block.tables = extra_data->tables;      md->tables = extra_data->tables;
3052    }    }
3053    
3054  /* Check that the first field in the block is the magic number. If it is not,  /* Check that the first field in the block is the magic number. If it is not,
# Line 1679  current_subject = (const unsigned char * Line 3069  current_subject = (const unsigned char *
3069  end_subject = (const unsigned char *)subject + length;  end_subject = (const unsigned char *)subject + length;
3070  req_byte_ptr = current_subject - 1;  req_byte_ptr = current_subject - 1;
3071    
3072    #ifdef SUPPORT_UTF8
3073  utf8 = (re->options & PCRE_UTF8) != 0;  utf8 = (re->options & PCRE_UTF8) != 0;
3074  anchored = (options & PCRE_ANCHORED) != 0 || (re->options & PCRE_ANCHORED) != 0;  #else
3075    utf8 = FALSE;
3076    #endif
3077    
3078    anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3079      (re->options & PCRE_ANCHORED) != 0;
3080    
3081  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3082    
3083  match_block.start_code = (const uschar *)argument_re +  md->start_code = (const uschar *)argument_re +
3084      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3085  match_block.start_subject = (const unsigned char *)subject;  md->start_subject = (const unsigned char *)subject;
3086  match_block.end_subject = end_subject;  md->end_subject = end_subject;
3087  match_block.moptions = options;  md->start_offset = start_offset;
3088  match_block.poptions = re->options;  md->moptions = options;
3089    md->poptions = re->options;
3090    
3091    /* If the BSR option is not set at match time, copy what was set
3092    at compile time. */
3093    
3094    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3095      {
3096      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3097        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3098    #ifdef BSR_ANYCRLF
3099      else md->moptions |= PCRE_BSR_ANYCRLF;
3100    #endif
3101      }
3102    
3103    /* Handle different types of newline. The three bits give eight cases. If
3104    nothing is set at run time, whatever was used at compile time applies. */
3105    
3106    switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3107             PCRE_NEWLINE_BITS)
3108      {
3109      case 0: newline = NEWLINE; break;   /* Compile-time default */
3110      case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3111      case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3112      case PCRE_NEWLINE_CR+
3113           PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3114      case PCRE_NEWLINE_ANY: newline = -1; break;
3115      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3116      default: return PCRE_ERROR_BADNEWLINE;
3117      }
3118    
3119    if (newline == -2)
3120      {
3121      md->nltype = NLTYPE_ANYCRLF;
3122      }
3123    else if (newline < 0)
3124      {
3125      md->nltype = NLTYPE_ANY;
3126      }
3127    else
3128      {
3129      md->nltype = NLTYPE_FIXED;
3130      if (newline > 255)
3131        {
3132        md->nllen = 2;
3133        md->nl[0] = (newline >> 8) & 255;
3134        md->nl[1] = newline & 255;
3135        }
3136      else
3137        {
3138        md->nllen = 1;
3139        md->nl[0] = newline;
3140        }
3141      }
3142    
3143  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3144  back the character offset. */  back the character offset. */
# Line 1697  back the character offset. */ Line 3146  back the character offset. */
3146  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3147  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3148    {    {
3149    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3150      return PCRE_ERROR_BADUTF8;    int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
3151    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3152      {      {
3153      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3154        {        {
3155        tb &= 0xc0;        offsets[0] = erroroffset;
3156        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3157        }        }
3158        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3159          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3160      }      }
3161      if (start_offset > 0 && start_offset < length &&
3162            (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
3163        return PCRE_ERROR_BADUTF8_OFFSET;
3164    }    }
3165  #endif  #endif
3166    
# Line 1715  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3168  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3168  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3169  in other programs later. */  in other programs later. */
3170    
3171  if (match_block.tables == NULL) match_block.tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = _pcre_default_tables;
3172    
3173  /* The lower casing table and the "must be at the start of a line" flag are  /* The lower casing table and the "must be at the start of a line" flag are
3174  used in a loop when finding where to start. */  used in a loop when finding where to start. */
3175    
3176  lcc = match_block.tables + lcc_offset;  lcc = md->tables + lcc_offset;
3177  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
3178  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3179    
3180  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 1732  studied, there may be a bitmap of possib Line 3185  studied, there may be a bitmap of possib
3185    
3186  if (!anchored)  if (!anchored)
3187    {    {
3188    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3189      {      {
3190      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
3191      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 1740  if (!anchored) Line 3193  if (!anchored)
3193      }      }
3194    else    else
3195      {      {
3196      if (startline && study != NULL &&      if (!startline && study != NULL &&
3197           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3198        start_bits = study->start_bits;        start_bits = study->start_bits;
3199      }      }
3200    }    }
# Line 1749  if (!anchored) Line 3202  if (!anchored)
3202  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
3203  character" set. */  character" set. */
3204    
3205  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3206    {    {
3207    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
3208    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3209    req_byte2 = (match_block.tables + fcc_offset)[req_byte];  /* case flipped */    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
3210    }    }
3211    
3212  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
3213  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
3214  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
3215    
3216  for (;;)  for (;;)
3217    {    {
# Line 1769  for (;;) Line 3221  for (;;)
3221      {      {
3222      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
3223    
3224      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
3225      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
3226      Implement this by temporarily adjusting end_subject so that we stop scanning      end_subject so that we stop scanning at a newline. If the match fails at
3227      at a newline. If the match fails at the newline, later code breaks this loop.      the newline, later code breaks this loop. */
     */  
3228    
3229      if (firstline)      if (firstline)
3230        {        {
3231        const uschar *t = current_subject;        USPTR t = current_subject;
3232        while (t < save_end_subject && *t != '\n') t++;  #ifdef SUPPORT_UTF8
3233          if (utf8)
3234            {
3235            while (t < md->end_subject && !IS_NEWLINE(t))
3236              {
3237              t++;
3238              while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3239              }
3240            }
3241          else
3242    #endif
3243          while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3244        end_subject = t;        end_subject = t;
3245        }        }
3246    
3247      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
3248        starting point is not found. However, there is an option that disables
3249        these, for testing and for ensuring that all callouts do actually occur.
3250        The option can be set in the regex by (*NO_START_OPT) or passed in
3251        match-time options. */
3252    
3253        if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3254        {        {
3255        if (first_byte_caseless)        /* Advance to a known first byte. */
3256          while (current_subject < end_subject &&  
3257                 lcc[*current_subject] != first_byte)        if (first_byte >= 0)
3258            current_subject++;          {
3259        else          if (first_byte_caseless)
3260          while (current_subject < end_subject && *current_subject != first_byte)            while (current_subject < end_subject &&
3261            current_subject++;                   lcc[*current_subject] != first_byte)
3262        }              current_subject++;
3263            else
3264              while (current_subject < end_subject &&
3265                     *current_subject != first_byte)
3266                current_subject++;
3267            }
3268    
3269      /* Or to just after \n for a multiline match if possible */        /* Or to just after a linebreak for a multiline match if possible */
3270    
3271      else if (startline)        else if (startline)
       {  
       if (current_subject > match_block.start_subject + start_offset)  
3272          {          {
3273          while (current_subject < end_subject && current_subject[-1] != NEWLINE)          if (current_subject > md->start_subject + start_offset)
3274            current_subject++;            {
3275    #ifdef SUPPORT_UTF8
3276              if (utf8)
3277                {
3278                while (current_subject < end_subject &&
3279                       !WAS_NEWLINE(current_subject))
3280                  {
3281                  current_subject++;
3282                  while(current_subject < end_subject &&
3283                        (*current_subject & 0xc0) == 0x80)
3284                    current_subject++;
3285                  }
3286                }
3287              else
3288    #endif
3289              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3290                current_subject++;
3291    
3292              /* If we have just passed a CR and the newline option is ANY or
3293              ANYCRLF, and we are now at a LF, advance the match position by one
3294              more character. */
3295    
3296              if (current_subject[-1] == CHAR_CR &&
3297                   (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3298                   current_subject < end_subject &&
3299                   *current_subject == CHAR_NL)
3300                current_subject++;
3301              }
3302          }          }
       }  
3303    
3304      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
3305    
3306      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
3307          {          {
3308          register unsigned int c = *current_subject;          while (current_subject < end_subject)
3309          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
3310              register unsigned int c = *current_subject;
3311              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3312                {
3313                current_subject++;
3314    #ifdef SUPPORT_UTF8
3315                if (utf8)
3316                  while(current_subject < end_subject &&
3317                        (*current_subject & 0xc0) == 0x80) current_subject++;
3318    #endif
3319                }
3320            else break;            else break;
3321              }
3322          }          }
3323        }        }
3324    
3325      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3326    
3327      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
3328    
3329    /* If req_byte is set, we know that that character must appear in the subject      /* The following two optimizations are disabled for partial matching or if
3330    for the match to succeed. If the first character is set, req_byte must be      disabling is explicitly requested (and of course, by the test above, this
3331    later in the subject; otherwise the test starts at the match point. This      code is not obeyed when restarting after a partial match). */
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested.  
   */  
   
   if (req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
   
     /* We don't need to repeat the search if we haven't yet reached the  
     place we found it at last time. */  
3332    
3333      if (p > req_byte_ptr)      if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3334            (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3335        {        {
3336        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3337          {        is a lower bound; no actual string of that length may actually match the
3338          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3339            {        bytes to avoid spending too much time in this optimization. */
3340            register int pp = *p++;  
3341            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3342            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3343          }          return PCRE_ERROR_NOMATCH;
3344        else  
3345          /* If req_byte is set, we know that that character must appear in the
3346          subject for the match to succeed. If the first character is set, req_byte
3347          must be later in the subject; otherwise the test starts at the match
3348          point. This optimization can save a huge amount of work in patterns with
3349          nested unlimited repeats that aren't going to match. Writing separate
3350          code for cased/caseless versions makes it go faster, as does using an
3351          autoincrement and backing off on a match.
3352    
3353          HOWEVER: when the subject string is very, very long, searching to its end
3354          can take a long time, and give bad performance on quite ordinary
3355          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3356          string... so we don't do this when the string is sufficiently long. */
3357    
3358          if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3359          {          {
3360          while (p < end_subject)          register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3361    
3362            /* We don't need to repeat the search if we haven't yet reached the
3363            place we found it at last time. */
3364    
3365            if (p > req_byte_ptr)
3366            {            {
3367            if (*p++ == req_byte) { p--; break; }            if (req_byte_caseless)
3368            }              {
3369          }              while (p < end_subject)
3370                  {
3371                  register int pp = *p++;
3372                  if (pp == req_byte || pp == req_byte2) { p--; break; }
3373                  }
3374                }
3375              else
3376                {
3377                while (p < end_subject)
3378                  {
3379                  if (*p++ == req_byte) { p--; break; }
3380                  }
3381                }
3382    
3383        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3384        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3385    
3386        if (p >= end_subject) break;            if (p >= end_subject) break;
3387    
3388        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3389        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3390        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3391    
3392        req_byte_ptr = p;            req_byte_ptr = p;
3393              }
3394            }
3395        }        }
3396      }      }   /* End of optimizations that are done when not restarting */
3397    
3398    /* OK, now we can do the business */    /* OK, now we can do the business */
3399    
3400      md->start_used_ptr = current_subject;
3401      md->recursive = NULL;
3402    
3403    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3404      &match_block,                              /* fixed match data */      md,                                /* fixed match data */
3405      match_block.start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
3406      current_subject,                           /* where we currently are */      current_subject,                   /* where we currently are */
3407      start_offset,                              /* start offset in subject */      start_offset,                      /* start offset in subject */
3408      offsets,                                   /* offset vector */      offsets,                           /* offset vector */
3409      offsetcount,                               /* size of same */      offsetcount,                       /* size of same */
3410      workspace,                                 /* workspace vector */      workspace,                         /* workspace vector */
3411      wscount,                                   /* size of same */      wscount,                           /* size of same */
3412      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                         /* function recurse level */  
     0);                                        /* regex recurse level */  
3413    
3414    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3415    on only if not anchored. */    on only if not anchored. */
# Line 1900  for (;;) Line 3419  for (;;)
3419    /* Advance to the next subject character unless we are at the end of a line    /* Advance to the next subject character unless we are at the end of a line
3420    and firstline is set. */    and firstline is set. */
3421    
3422    if (firstline && *current_subject == NEWLINE) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3423    current_subject++;    current_subject++;
   
 #ifdef SUPPORT_UTF8  
3424    if (utf8)    if (utf8)
3425      {      {
3426      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3427        current_subject++;        current_subject++;
3428      }      }
 #endif  
   
3429    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3430    }  
3431      /* If we have just passed a CR and we are now at a LF, and the pattern does
3432      not contain any explicit matches for \r or \n, and the newline option is CRLF
3433      or ANY or ANYCRLF, advance the match position by one more character. */
3434    
3435      if (current_subject[-1] == CHAR_CR &&
3436          current_subject < end_subject &&
3437          *current_subject == CHAR_NL &&
3438          (re->flags & PCRE_HASCRORLF) == 0 &&
3439            (md->nltype == NLTYPE_ANY ||
3440             md->nltype == NLTYPE_ANYCRLF ||
3441             md->nllen == 2))
3442        current_subject++;
3443    
3444      }   /* "Bumpalong" loop */
3445    
3446  return PCRE_ERROR_NOMATCH;  return PCRE_ERROR_NOMATCH;
3447  }  }

Legend:
Removed from v.85  
changed lines
  Added in v.723

  ViewVC Help
Powered by ViewVC 1.1.5