/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 178 by ph10, Wed Jun 13 08:44:34 2007 UTC | revision 919 by ph10, Fri Feb 17 11:48:02 2012 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl- compatible, but Line 45  FSM). This is NOT Perl- compatible, but
45  applications. */  applications. */
46    
47    
48    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49    the performance of his patterns greatly. I could not use it as it stood, as it
50    was not thread safe, and made assumptions about pattern sizes. Also, it caused
51    test 7 to loop, and test 9 to crash with a segfault.
52    
53    The issue is the check for duplicate states, which is done by a simple linear
54    search up the state list. (Grep for "duplicate" below to find the code.) For
55    many patterns, there will never be many states active at one time, so a simple
56    linear search is fine. In patterns that have many active states, it might be a
57    bottleneck. The suggested code used an indexing scheme to remember which states
58    had previously been used for each character, and avoided the linear search when
59    it knew there was no chance of a duplicate. This was implemented when adding
60    states to the state lists.
61    
62    I wrote some thread-safe, not-limited code to try something similar at the time
63    of checking for duplicates (instead of when adding states), using index vectors
64    on the stack. It did give a 13% improvement with one specially constructed
65    pattern for certain subject strings, but on other strings and on many of the
66    simpler patterns in the test suite it did worse. The major problem, I think,
67    was the extra time to initialize the index. This had to be done for each call
68    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69    only once - I suspect this was the cause of the problems with the tests.)
70    
71    Overall, I concluded that the gains in some cases did not outweigh the losses
72    in others, so I abandoned this code. */
73    
74    
75    
76    #ifdef HAVE_CONFIG_H
77    #include "config.h"
78    #endif
79    
80  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
81  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
82  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 56  applications. */ Line 89  applications. */
89  #define SP "                   "  #define SP "                   "
90    
91    
   
92  /*************************************************  /*************************************************
93  *      Code parameters and static tables         *  *      Code parameters and static tables         *
94  *************************************************/  *************************************************/
95    
96  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
98  enough. The resulting opcodes don't have to be less than 256 because they are  enough. The resulting opcodes don't have to be less than 256 because they are
99  never stored, so we push them well clear of the normal opcodes. */  never stored, so we push them well clear of the normal opcodes. */
100    
101  #define OP_PROP_EXTRA       300  #define OP_PROP_EXTRA       300
# Line 74  never stored, so we push them well clear Line 106  never stored, so we push them well clear
106    
107    
108  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
109  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
110  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
111  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. Non-zero values in the table are the offsets from the opcode where
113  that follow must also be modified. */  the character is to be found. ***NOTE*** If the start of this table is
114    modified, the three tables that follow must also be modified. */
115    
116  static uschar coptable[] = {  static const pcre_uint8 coptable[] = {
117    0,                             /* End                                    */    0,                             /* End                                    */
118    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120    0, 0,                          /* Any, Anybyte                           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0,                          /* \P, \p                                 */
122    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0,                             /* \X                                     */
124      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
125    1,                             /* Char                                   */    1,                             /* Char                                   */
126    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
127    1,                             /* not                                    */    1,                             /* not                                    */
128      1,                             /* noti                                   */
129    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
130    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1+IMM2_SIZE,                   /* exact                                  */
133      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136      1+IMM2_SIZE,                   /* exact I                                */
137      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
139    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1+IMM2_SIZE,                   /* NOT exact                              */
142      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145      1+IMM2_SIZE,                   /* NOT exact I                            */
146      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
148    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150    1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */    1+IMM2_SIZE,                   /* Type exact                             */
151      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
153    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 110  static uschar coptable[] = { Line 156  static uschar coptable[] = {
156    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
157    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
158    0,                             /* REF                                    */    0,                             /* REF                                    */
159      0,                             /* REFI                                   */
160    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
161    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
162    0,                             /* Alt                                    */    0,                             /* Alt                                    */
163    0,                             /* Ket                                    */    0,                             /* Ket                                    */
164    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
165    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
166      0,                             /* KetRpos                                */
167      0,                             /* Reverse                                */
168    0,                             /* Assert                                 */    0,                             /* Assert                                 */
169    0,                             /* Assert not                             */    0,                             /* Assert not                             */
170    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
171    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
172      0, 0,                          /* ONCE, ONCE_NC                          */
173      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
174      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
175      0, 0,                          /* CREF, NCREF                            */
176      0, 0,                          /* RREF, NRREF                            */
177      0,                             /* DEF                                    */
178      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
179      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
180      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
181      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
182      0, 0                           /* CLOSE, SKIPZERO  */
183    };
184    
185    /* This table identifies those opcodes that inspect a character. It is used to
186    remember the fact that a character could have been inspected when the end of
187    the subject is reached. ***NOTE*** If the start of this table is modified, the
188    two tables that follow must also be modified. */
189    
190    static const pcre_uint8 poptable[] = {
191      0,                             /* End                                    */
192      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
193      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
194      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
195      1, 1,                          /* \P, \p                                 */
196      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
197      1,                             /* \X                                     */
198      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
199      1,                             /* Char                                   */
200      1,                             /* Chari                                  */
201      1,                             /* not                                    */
202      1,                             /* noti                                   */
203      /* Positive single-char repeats                                          */
204      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
205      1, 1, 1,                       /* upto, minupto, exact                   */
206      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
207      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
208      1, 1, 1,                       /* upto I, minupto I, exact I             */
209      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
210      /* Negative single-char repeats - only for chars < 256                   */
211      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
212      1, 1, 1,                       /* NOT upto, minupto, exact               */
213      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
214      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
215      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
216      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
217      /* Positive type repeats                                                 */
218      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
219      1, 1, 1,                       /* Type upto, minupto, exact              */
220      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
221      /* Character class & ref repeats                                         */
222      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
223      1, 1,                          /* CRRANGE, CRMINRANGE                    */
224      1,                             /* CLASS                                  */
225      1,                             /* NCLASS                                 */
226      1,                             /* XCLASS - variable length               */
227      0,                             /* REF                                    */
228      0,                             /* REFI                                   */
229      0,                             /* RECURSE                                */
230      0,                             /* CALLOUT                                */
231      0,                             /* Alt                                    */
232      0,                             /* Ket                                    */
233      0,                             /* KetRmax                                */
234      0,                             /* KetRmin                                */
235      0,                             /* KetRpos                                */
236    0,                             /* Reverse                                */    0,                             /* Reverse                                */
237    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0,                             /* Assert                                 */
238    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0,                             /* Assert not                             */
239    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
240    0,                             /* RREF                                   */    0,                             /* Assert behind not                      */
241      0, 0,                          /* ONCE, ONCE_NC                          */
242      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
243      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
244      0, 0,                          /* CREF, NCREF                            */
245      0, 0,                          /* RREF, NRREF                            */
246    0,                             /* DEF                                    */    0,                             /* DEF                                    */
247    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
248      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
249      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
250      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
251      0, 0                           /* CLOSE, SKIPZERO                        */
252  };  };
253    
254  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
255  and \w */  and \w */
256    
257  static uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
258    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
259    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
260    ctype_space, ctype_space,    ctype_space, ctype_space,
261    ctype_word,  ctype_word,    ctype_word,  ctype_word,
262    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
263  };  };
264    
265  static uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
266    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
267    ctype_digit, 0,    ctype_digit, 0,
268    ctype_space, 0,    ctype_space, 0,
269    ctype_word,  0,    ctype_word,  0,
270    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
271  };  };
272    
273    
# Line 157  these structures in, is a vector of ints Line 279  these structures in, is a vector of ints
279  typedef struct stateblock {  typedef struct stateblock {
280    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
281    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
282    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
283  } stateblock;  } stateblock;
284    
285  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
286    
287    
288  #ifdef DEBUG  #ifdef PCRE_DEBUG
289  /*************************************************  /*************************************************
290  *             Print character string             *  *             Print character string             *
291  *************************************************/  *************************************************/
# Line 180  Returns:       nothing Line 301  Returns:       nothing
301  */  */
302    
303  static void  static void
304  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
305  {  {
306  int c;  int c;
307  while (length-- > 0)  while (length-- > 0)
# Line 213  Arguments: Line 334  Arguments:
334    offsetcount       size of same    offsetcount       size of same
335    workspace         vector of workspace    workspace         vector of workspace
336    wscount           size of same    wscount           size of same
   ims               the current ims flags  
337    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
338    
339  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
340                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
341                       -1 => failed to match                       -1 => failed to match
342                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
343    
# Line 230  for the current character, one for the f Line 349  for the current character, one for the f
349      { \      { \
350      next_active_state->offset = (x); \      next_active_state->offset = (x); \
351      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
352      next_active_state++; \      next_active_state++; \
353      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
354      } \      } \
# Line 241  for the current character, one for the f Line 359  for the current character, one for the f
359      { \      { \
360      next_active_state->offset = (x); \      next_active_state->offset = (x); \
361      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
362      next_active_state->data   = (z); \      next_active_state->data   = (z); \
363      next_active_state++; \      next_active_state++; \
364      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 253  for the current character, one for the f Line 370  for the current character, one for the f
370      { \      { \
371      next_new_state->offset = (x); \      next_new_state->offset = (x); \
372      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
373      next_new_state++; \      next_new_state++; \
374      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
375      } \      } \
# Line 264  for the current character, one for the f Line 380  for the current character, one for the f
380      { \      { \
381      next_new_state->offset = (x); \      next_new_state->offset = (x); \
382      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
383      next_new_state->data   = (z); \      next_new_state->data   = (z); \
384      next_new_state++; \      next_new_state++; \
385      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 276  for the current character, one for the f Line 391  for the current character, one for the f
391  static int  static int
392  internal_dfa_exec(  internal_dfa_exec(
393    dfa_match_data *md,    dfa_match_data *md,
394    const uschar *this_start_code,    const pcre_uchar *this_start_code,
395    const uschar *current_subject,    const pcre_uchar *current_subject,
396    int start_offset,    int start_offset,
397    int *offsets,    int *offsets,
398    int offsetcount,    int offsetcount,
399    int *workspace,    int *workspace,
400    int wscount,    int wscount,
401    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
402  {  {
403  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
404  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
405    
406  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
407  const uschar *ptr;  const pcre_uchar *ptr;
408  const uschar *end_code, *first_op;  const pcre_uchar *end_code, *first_op;
409    
410    dfa_recursion_info new_recursive;
411    
412  int active_count, new_count, match_count;  int active_count, new_count, match_count;
413    
414  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
415  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
416    
417  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
418  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
419  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
420    
421  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
422  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423  #else  #else
424  BOOL utf8 = FALSE;  BOOL utf = FALSE;
425  #endif  #endif
426    
427    BOOL reset_could_continue = FALSE;
428    
429  rlevel++;  rlevel++;
430  offsetcount &= (-2);  offsetcount &= (-2);
431    
# Line 317  wscount = (wscount - (wscount % (INTS_PE Line 434  wscount = (wscount - (wscount % (INTS_PE
434            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
435    
436  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
437    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
438    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439    
440  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
441  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 331  next_new_state = new_states = active_sta Line 448  next_new_state = new_states = active_sta
448  new_count = 0;  new_count = 0;
449    
450  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
451    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453        ? IMM2_SIZE:0);
454    
455  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 359  if (*first_op == OP_REVERSE) Line 478  if (*first_op == OP_REVERSE)
478    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
479    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
480    
481  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
482    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
483    
484    if (utf8)    if (utf)
485      {      {
486      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
487        {        {
488        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
489        current_subject--;        current_subject--;
490        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
491        }        }
492      }      }
493    else    else
# Line 380  if (*first_op == OP_REVERSE) Line 497  if (*first_op == OP_REVERSE)
497    
498      {      {
499      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
500        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
501      current_subject -= gone_back;      current_subject -= gone_back;
502      }      }
503    
504      /* Save the earliest consulted character */
505    
506      if (current_subject < md->start_used_ptr)
507        md->start_used_ptr = current_subject;
508    
509    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
510    
511    end_code = this_start_code;    end_code = this_start_code;
# Line 392  if (*first_op == OP_REVERSE) Line 514  if (*first_op == OP_REVERSE)
514      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
515      if (back <= gone_back)      if (back <= gone_back)
516        {        {
517        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
519        }        }
520      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 425  else Line 547  else
547    else    else
548      {      {
549      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
550        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552            ? IMM2_SIZE:0);
553      do      do
554        {        {
555        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
556        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
557        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
558        }        }
# Line 438  else Line 562  else
562    
563  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
564    
565  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
566    
567  /* Loop for scanning the subject */  /* Loop for scanning the subject */
568    
# Line 448  for (;;) Line 572  for (;;)
572    int i, j;    int i, j;
573    int clen, dlen;    int clen, dlen;
574    unsigned int c, d;    unsigned int c, d;
575      int forced_fail = 0;
576      BOOL partial_newline = FALSE;
577      BOOL could_continue = reset_could_continue;
578      reset_could_continue = FALSE;
579    
580    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
581    new state list. */    new state list. */
582    
# Line 461  for (;;) Line 589  for (;;)
589    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
590    workspace[1] = active_count;    workspace[1] = active_count;
591    
592  #ifdef DEBUG  #ifdef PCRE_DEBUG
593    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
595    printf("\"\n");    printf("\"\n");
596    
597    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 484  for (;;) Line 612  for (;;)
612    if (ptr < end_subject)    if (ptr < end_subject)
613      {      {
614      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of bytes in the character */
615  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
616      if (utf8) { GETCHARLEN(c, ptr, clen); } else      if (utf) { GETCHARLEN(c, ptr, clen); } else
617  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
618      c = *ptr;      c = *ptr;
619      }      }
620    else    else
# Line 503  for (;;) Line 631  for (;;)
631    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
632      {      {
633      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
634      const uschar *code;      BOOL caseless = FALSE;
635        const pcre_uchar *code;
636      int state_offset = current_state->offset;      int state_offset = current_state->offset;
637      int count, codevalue;      int count, codevalue, rrc;
 #ifdef SUPPORT_UCP  
     int chartype, script;  
 #endif  
638    
639  #ifdef DEBUG  #ifdef PCRE_DEBUG
640      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
641      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
642        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
643          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
644  #endif  #endif
645    
     /* This variable is referred to implicitly in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
646      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
647      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
648      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
649        state, arrange for it to be passed on. */
650    
651      if (state_offset < 0)      if (state_offset < 0)
652        {        {
# Line 532  for (;;) Line 655  for (;;)
655          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
656          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
657            current_state->data - 1);            current_state->data - 1);
658            if (could_continue) reset_could_continue = TRUE;
659          continue;          continue;
660          }          }
661        else        else
# Line 540  for (;;) Line 664  for (;;)
664          }          }
665        }        }
666    
667      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
668        See the note at the head of this module about the possibility of improving
669        performance here. */
670    
671      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
672        {        {
# Line 557  for (;;) Line 683  for (;;)
683      code = start_code + state_offset;      code = start_code + state_offset;
684      codevalue = *code;      codevalue = *code;
685    
686        /* If this opcode inspects a character, but we are at the end of the
687        subject, remember the fact for use when testing for a partial match. */
688    
689        if (clen == 0 && poptable[codevalue] != 0)
690          could_continue = TRUE;
691    
692      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
693      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
694      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
# Line 571  for (;;) Line 703  for (;;)
703      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
704        {        {
705        dlen = 1;        dlen = 1;
706  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
707        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
708  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
709        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
710        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
711          {          {
# Line 585  for (;;) Line 717  for (;;)
717            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
718            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
719            case OP_NOT_HSPACE:            case OP_NOT_HSPACE:
720            case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;            case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
721            case OP_NOT_VSPACE:            case OP_NOT_VSPACE:
722            case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;            case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
723            default: break;            default: break;
724            }            }
725          }          }
# Line 603  for (;;) Line 735  for (;;)
735    
736      switch (codevalue)      switch (codevalue)
737        {        {
738    /* ========================================================================== */
739          /* These cases are never obeyed. This is a fudge that causes a compile-
740          time error if the vectors coptable or poptable, which are indexed by
741          opcode, are not the correct length. It seems to be the only way to do
742          such a check at compile time, as the sizeof() operator does not work
743          in the C preprocessor. */
744    
745          case OP_TABLE_LENGTH:
746          case OP_TABLE_LENGTH +
747            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
748             (sizeof(poptable) == OP_TABLE_LENGTH)):
749          break;
750    
751  /* ========================================================================== */  /* ========================================================================== */
752        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
753        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
754        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
755          subpattern, because the possessive subpattern repeats are always handled
756          using recursive calls. Thus, it never adds any new states.
757    
758          At the end of the (sub)pattern, unless we have an empty string and
759          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
760          start of the subject, save the match data, shifting up all previous
761        matches so we always have the longest first. */        matches so we always have the longest first. */
762    
763        case OP_KET:        case OP_KET:
764        case OP_KETRMIN:        case OP_KETRMIN:
765        case OP_KETRMAX:        case OP_KETRMAX:
766          case OP_KETRPOS:
767        if (code != end_code)        if (code != end_code)
768          {          {
769          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 621  for (;;) Line 772  for (;;)
772            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
773            }            }
774          }          }
775        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
776          {          {
777          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
778            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
779              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
780          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
781          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
782          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
783            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
784            offsets[0] = current_subject - start_subject;                match_count = 0;
785            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
786            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
787              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
788            }              {
789          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
790            {              offsets[1] = (int)(ptr - start_subject);
791            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
792              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], current_subject));
793              match_count, rlevel*2-2, SP));              }
794            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
795                {
796                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
797                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
798                  match_count, rlevel*2-2, SP));
799                return match_count;
800                }
801            }            }
802          }          }
803        break;        break;
# Line 652  for (;;) Line 809  for (;;)
809        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
810        case OP_ALT:        case OP_ALT:
811        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
812        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
813        break;        break;
814    
815        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 660  for (;;) Line 817  for (;;)
817        case OP_SBRA:        case OP_SBRA:
818        do        do
819          {          {
820          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
821          code += GET(code, 1);          code += GET(code, 1);
822          }          }
823        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 669  for (;;) Line 826  for (;;)
826        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
827        case OP_CBRA:        case OP_CBRA:
828        case OP_SCBRA:        case OP_SCBRA:
829        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
830        code += GET(code, 1);        code += GET(code, 1);
831        while (*code == OP_ALT)        while (*code == OP_ALT)
832          {          {
833          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
834          code += GET(code, 1);          code += GET(code, 1);
835          }          }
836        break;        break;
# Line 684  for (;;) Line 841  for (;;)
841        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
842        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
843        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
844        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
845          break;
846    
847          /*-----------------------------------------------------------------*/
848          case OP_SKIPZERO:
849          code += 1 + GET(code, 2);
850          while (*code == OP_ALT) code += GET(code, 1);
851          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
852        break;        break;
853    
854        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
855        case OP_CIRC:        case OP_CIRC:
856        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
857          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
858        break;        break;
859    
860        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
861        case OP_EOD:        case OP_CIRCM:
862        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
863              (ptr != end_subject && WAS_NEWLINE(ptr)))
864            { ADD_ACTIVE(state_offset + 1, 0); }
865        break;        break;
866    
867        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
868        case OP_OPT:        case OP_EOD:
869        ims = code[1];        if (ptr >= end_subject)
870        ADD_ACTIVE(state_offset + 2, 0);          {
871            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
872              could_continue = TRUE;
873            else { ADD_ACTIVE(state_offset + 1, 0); }
874            }
875        break;        break;
876    
877        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 726  for (;;) Line 893  for (;;)
893    
894        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
895        case OP_ANY:        case OP_ANY:
896        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
897            {
898            if (ptr + 1 >= md->end_subject &&
899                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
900                NLBLOCK->nltype == NLTYPE_FIXED &&
901                NLBLOCK->nllen == 2 &&
902                c == NLBLOCK->nl[0])
903              {
904              could_continue = partial_newline = TRUE;
905              }
906            else
907              {
908              ADD_NEW(state_offset + 1, 0);
909              }
910            }
911          break;
912    
913          /*-----------------------------------------------------------------*/
914          case OP_ALLANY:
915          if (clen > 0)
916          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
917        break;        break;
918    
919        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
920        case OP_EODN:        case OP_EODN:
921        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
922            could_continue = TRUE;
923          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
924          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
925        break;        break;
926    
# Line 740  for (;;) Line 928  for (;;)
928        case OP_DOLL:        case OP_DOLL:
929        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
930          {          {
931          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
932              (IS_NEWLINE(ptr) &&            could_continue = TRUE;
933                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)          else if (clen == 0 ||
934                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
935                   (ptr == end_subject - md->nllen)
936              ))              ))
937            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
938            else if (ptr + 1 >= md->end_subject &&
939                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
940                     NLBLOCK->nltype == NLTYPE_FIXED &&
941                     NLBLOCK->nllen == 2 &&
942                     c == NLBLOCK->nl[0])
943              {
944              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
945                {
946                reset_could_continue = TRUE;
947                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
948                }
949              else could_continue = partial_newline = TRUE;
950              }
951            }
952          break;
953    
954          /*-----------------------------------------------------------------*/
955          case OP_DOLLM:
956          if ((md->moptions & PCRE_NOTEOL) == 0)
957            {
958            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
959              could_continue = TRUE;
960            else if (clen == 0 ||
961                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
962              { ADD_ACTIVE(state_offset + 1, 0); }
963            else if (ptr + 1 >= md->end_subject &&
964                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
965                     NLBLOCK->nltype == NLTYPE_FIXED &&
966                     NLBLOCK->nllen == 2 &&
967                     c == NLBLOCK->nl[0])
968              {
969              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
970                {
971                reset_could_continue = TRUE;
972                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
973                }
974              else could_continue = partial_newline = TRUE;
975              }
976          }          }
977        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        else if (IS_NEWLINE(ptr))
978          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
979        break;        break;
980    
# Line 777  for (;;) Line 1005  for (;;)
1005    
1006          if (ptr > start_subject)          if (ptr > start_subject)
1007            {            {
1008            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
1009  #ifdef SUPPORT_UTF8            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1010            if (utf8) BACKCHAR(temp);  #ifdef SUPPORT_UTF
1011              if (utf) { BACKCHAR(temp); }
1012  #endif  #endif
1013            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
1014    #ifdef SUPPORT_UCP
1015              if ((md->poptions & PCRE_UCP) != 0)
1016                {
1017                if (d == '_') left_word = TRUE; else
1018                  {
1019                  int cat = UCD_CATEGORY(d);
1020                  left_word = (cat == ucp_L || cat == ucp_N);
1021                  }
1022                }
1023              else
1024    #endif
1025            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1026            }            }
1027          else left_word = 0;          else left_word = FALSE;
1028    
1029          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
1030            else right_word = 0;            {
1031    #ifdef SUPPORT_UCP
1032              if ((md->poptions & PCRE_UCP) != 0)
1033                {
1034                if (c == '_') right_word = TRUE; else
1035                  {
1036                  int cat = UCD_CATEGORY(c);
1037                  right_word = (cat == ucp_L || cat == ucp_N);
1038                  }
1039                }
1040              else
1041    #endif
1042              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1043              }
1044            else right_word = FALSE;
1045    
1046          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1047            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 806  for (;;) Line 1060  for (;;)
1060        if (clen > 0)        if (clen > 0)
1061          {          {
1062          BOOL OK;          BOOL OK;
1063          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1064          switch(code[1])          switch(code[1])
1065            {            {
1066            case PT_ANY:            case PT_ANY:
# Line 814  for (;;) Line 1068  for (;;)
1068            break;            break;
1069    
1070            case PT_LAMP:            case PT_LAMP:
1071            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1072                   prop->chartype == ucp_Lt;
1073            break;            break;
1074    
1075            case PT_GC:            case PT_GC:
1076            OK = category == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1077            break;            break;
1078    
1079            case PT_PC:            case PT_PC:
1080            OK = chartype == code[2];            OK = prop->chartype == code[2];
1081            break;            break;
1082    
1083            case PT_SC:            case PT_SC:
1084            OK = script == code[2];            OK = prop->script == code[2];
1085              break;
1086    
1087              /* These are specials for combination cases. */
1088    
1089              case PT_ALNUM:
1090              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1091                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1092              break;
1093    
1094              case PT_SPACE:    /* Perl space */
1095              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1096                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1097              break;
1098    
1099              case PT_PXSPACE:  /* POSIX space */
1100              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1101                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1102                   c == CHAR_FF || c == CHAR_CR;
1103              break;
1104    
1105              case PT_WORD:
1106              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1107                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1108                   c == CHAR_UNDERSCORE;
1109            break;            break;
1110    
1111            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 846  for (;;) Line 1125  for (;;)
1125  /* ========================================================================== */  /* ========================================================================== */
1126        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
1127        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
1128        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1129        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1130    
1131        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1132        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 856  for (;;) Line 1135  for (;;)
1135        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1136        if (clen > 0)        if (clen > 0)
1137          {          {
1138          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1139                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1140                NLBLOCK->nltype == NLTYPE_FIXED &&
1141                NLBLOCK->nllen == 2 &&
1142                c == NLBLOCK->nl[0])
1143              {
1144              could_continue = partial_newline = TRUE;
1145              }
1146            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1147              (c < 256 &&              (c < 256 &&
1148                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1149                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1150            {            {
1151            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 882  for (;;) Line 1166  for (;;)
1166        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1167        if (clen > 0)        if (clen > 0)
1168          {          {
1169          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1170                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1171                NLBLOCK->nltype == NLTYPE_FIXED &&
1172                NLBLOCK->nllen == 2 &&
1173                c == NLBLOCK->nl[0])
1174              {
1175              could_continue = partial_newline = TRUE;
1176              }
1177            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1178              (c < 256 &&              (c < 256 &&
1179                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1180                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1181            {            {
1182            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 907  for (;;) Line 1196  for (;;)
1196        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1197        if (clen > 0)        if (clen > 0)
1198          {          {
1199          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1200                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1201                NLBLOCK->nltype == NLTYPE_FIXED &&
1202                NLBLOCK->nllen == 2 &&
1203                c == NLBLOCK->nl[0])
1204              {
1205              could_continue = partial_newline = TRUE;
1206              }
1207            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208              (c < 256 &&              (c < 256 &&
1209                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1210                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211            {            {
1212            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 930  for (;;) Line 1224  for (;;)
1224        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1225        if (clen > 0)        if (clen > 0)
1226          {          {
1227          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1228                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1229                NLBLOCK->nltype == NLTYPE_FIXED &&
1230                NLBLOCK->nllen == 2 &&
1231                c == NLBLOCK->nl[0])
1232              {
1233              could_continue = partial_newline = TRUE;
1234              }
1235            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1236              (c < 256 &&              (c < 256 &&
1237                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1238                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1239            {            {
1240            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1241              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1242            else            else
1243              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1244            }            }
# Line 950  for (;;) Line 1249  for (;;)
1249        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1250        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1251        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1252        ADD_ACTIVE(state_offset + 4, 0);        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1253        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1254        if (clen > 0)        if (clen > 0)
1255          {          {
1256          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1257                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1258                NLBLOCK->nltype == NLTYPE_FIXED &&
1259                NLBLOCK->nllen == 2 &&
1260                c == NLBLOCK->nl[0])
1261              {
1262              could_continue = partial_newline = TRUE;
1263              }
1264            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265              (c < 256 &&              (c < 256 &&
1266                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1267                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268            {            {
1269            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 968  for (;;) Line 1272  for (;;)
1272              next_active_state--;              next_active_state--;
1273              }              }
1274            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1275              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1276            else            else
1277              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1278            }            }
# Line 990  for (;;) Line 1294  for (;;)
1294        if (clen > 0)        if (clen > 0)
1295          {          {
1296          BOOL OK;          BOOL OK;
1297          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1298          switch(code[2])          switch(code[2])
1299            {            {
1300            case PT_ANY:            case PT_ANY:
# Line 998  for (;;) Line 1302  for (;;)
1302            break;            break;
1303    
1304            case PT_LAMP:            case PT_LAMP:
1305            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1306                prop->chartype == ucp_Lt;
1307            break;            break;
1308    
1309            case PT_GC:            case PT_GC:
1310            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1311            break;            break;
1312    
1313            case PT_PC:            case PT_PC:
1314            OK = chartype == code[3];            OK = prop->chartype == code[3];
1315            break;            break;
1316    
1317            case PT_SC:            case PT_SC:
1318            OK = script == code[3];            OK = prop->script == code[3];
1319              break;
1320    
1321              /* These are specials for combination cases. */
1322    
1323              case PT_ALNUM:
1324              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1325                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1326              break;
1327    
1328              case PT_SPACE:    /* Perl space */
1329              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1330                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1331              break;
1332    
1333              case PT_PXSPACE:  /* POSIX space */
1334              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1335                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1336                   c == CHAR_FF || c == CHAR_CR;
1337              break;
1338    
1339              case PT_WORD:
1340              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1341                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1342                   c == CHAR_UNDERSCORE;
1343            break;            break;
1344    
1345            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1039  for (;;) Line 1368  for (;;)
1368        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1369        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1370        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1371        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1372          {          {
1373          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1374          int ncount = 0;          int ncount = 0;
1375          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1376            {            {
# Line 1053  for (;;) Line 1382  for (;;)
1382            int nd;            int nd;
1383            int ndlen = 1;            int ndlen = 1;
1384            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1385            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1386            ncount++;            ncount++;
1387            nptr += ndlen;            nptr += ndlen;
1388            }            }
# Line 1074  for (;;) Line 1403  for (;;)
1403          int ncount = 0;          int ncount = 0;
1404          switch (c)          switch (c)
1405            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1406            case 0x000b:            case 0x000b:
1407            case 0x000c:            case 0x000c:
1408            case 0x0085:            case 0x0085:
1409            case 0x2028:            case 0x2028:
1410            case 0x2029:            case 0x2029:
1411              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1412              goto ANYNL01;
1413    
1414              case 0x000d:
1415              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1416              /* Fall through */
1417    
1418              ANYNL01:
1419              case 0x000a:
1420            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1421              {              {
1422              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1091  for (;;) Line 1425  for (;;)
1425            count++;            count++;
1426            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, ncount);
1427            break;            break;
1428    
1429            default:            default:
1430            break;            break;
1431            }            }
# Line 1105  for (;;) Line 1440  for (;;)
1440        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1441        if (clen > 0)        if (clen > 0)
1442          {          {
1443          BOOL OK;          BOOL OK;
1444          switch (c)          switch (c)
1445            {            {
1446            case 0x000a:            case 0x000a:
# Line 1116  for (;;) Line 1451  for (;;)
1451            case 0x2028:            case 0x2028:
1452            case 0x2029:            case 0x2029:
1453            OK = TRUE;            OK = TRUE;
1454            break;            break;
1455    
1456            default:            default:
1457            OK = FALSE;            OK = FALSE;
1458            break;            break;
1459            }            }
1460    
1461          if (OK == (d == OP_VSPACE))          if (OK == (d == OP_VSPACE))
1462            {            {
1463            if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1464              {              {
1465              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1144  for (;;) Line 1479  for (;;)
1479        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480        if (clen > 0)        if (clen > 0)
1481          {          {
1482          BOOL OK;          BOOL OK;
1483          switch (c)          switch (c)
1484            {            {
1485            case 0x09:      /* HT */            case 0x09:      /* HT */
# Line 1168  for (;;) Line 1503  for (;;)
1503            case 0x3000:    /* IDEOGRAPHIC SPACE */            case 0x3000:    /* IDEOGRAPHIC SPACE */
1504            OK = TRUE;            OK = TRUE;
1505            break;            break;
1506    
1507            default:            default:
1508            OK = FALSE;            OK = FALSE;
1509            break;            break;
1510            }            }
1511    
1512          if (OK == (d == OP_HSPACE))          if (OK == (d == OP_HSPACE))
1513            {            {
1514            if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1515              {              {
1516              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1206  for (;;) Line 1541  for (;;)
1541        if (clen > 0)        if (clen > 0)
1542          {          {
1543          BOOL OK;          BOOL OK;
1544          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1545          switch(code[2])          switch(code[2])
1546            {            {
1547            case PT_ANY:            case PT_ANY:
# Line 1214  for (;;) Line 1549  for (;;)
1549            break;            break;
1550    
1551            case PT_LAMP:            case PT_LAMP:
1552            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1553                prop->chartype == ucp_Lt;
1554            break;            break;
1555    
1556            case PT_GC:            case PT_GC:
1557            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1558            break;            break;
1559    
1560            case PT_PC:            case PT_PC:
1561            OK = chartype == code[3];            OK = prop->chartype == code[3];
1562            break;            break;
1563    
1564            case PT_SC:            case PT_SC:
1565            OK = script == code[3];            OK = prop->script == code[3];
1566              break;
1567    
1568              /* These are specials for combination cases. */
1569    
1570              case PT_ALNUM:
1571              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1572                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1573              break;
1574    
1575              case PT_SPACE:    /* Perl space */
1576              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1577                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1578              break;
1579    
1580              case PT_PXSPACE:  /* POSIX space */
1581              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1582                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1583                   c == CHAR_FF || c == CHAR_CR;
1584              break;
1585    
1586              case PT_WORD:
1587              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1588                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1589                   c == CHAR_UNDERSCORE;
1590            break;            break;
1591    
1592            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1264  for (;;) Line 1624  for (;;)
1624        QS2:        QS2:
1625    
1626        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1627        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1628          {          {
1629          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1630          int ncount = 0;          int ncount = 0;
1631          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1632              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
# Line 1279  for (;;) Line 1639  for (;;)
1639            int nd;            int nd;
1640            int ndlen = 1;            int ndlen = 1;
1641            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1642            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1643            ncount++;            ncount++;
1644            nptr += ndlen;            nptr += ndlen;
1645            }            }
# Line 1307  for (;;) Line 1667  for (;;)
1667          int ncount = 0;          int ncount = 0;
1668          switch (c)          switch (c)
1669            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1670            case 0x000b:            case 0x000b:
1671            case 0x000c:            case 0x000c:
1672            case 0x0085:            case 0x0085:
1673            case 0x2028:            case 0x2028:
1674            case 0x2029:            case 0x2029:
1675              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1676              goto ANYNL02;
1677    
1678              case 0x000d:
1679              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1680              /* Fall through */
1681    
1682              ANYNL02:
1683              case 0x000a:
1684            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1685                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1686              {              {
# Line 1324  for (;;) Line 1689  for (;;)
1689              }              }
1690            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1691            break;            break;
1692    
1693            default:            default:
1694            break;            break;
1695            }            }
# Line 1346  for (;;) Line 1712  for (;;)
1712        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1713        if (clen > 0)        if (clen > 0)
1714          {          {
1715          BOOL OK;          BOOL OK;
1716          switch (c)          switch (c)
1717            {            {
1718            case 0x000a:            case 0x000a:
# Line 1358  for (;;) Line 1724  for (;;)
1724            case 0x2029:            case 0x2029:
1725            OK = TRUE;            OK = TRUE;
1726            break;            break;
1727    
1728            default:            default:
1729            OK = FALSE;            OK = FALSE;
1730            break;            break;
1731            }            }
1732          if (OK == (d == OP_VSPACE))          if (OK == (d == OP_VSPACE))
1733            {            {
1734            if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1735                codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1736              {              {
# Line 1392  for (;;) Line 1758  for (;;)
1758        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1759        if (clen > 0)        if (clen > 0)
1760          {          {
1761          BOOL OK;          BOOL OK;
1762          switch (c)          switch (c)
1763            {            {
1764            case 0x09:      /* HT */            case 0x09:      /* HT */
# Line 1416  for (;;) Line 1782  for (;;)
1782            case 0x3000:    /* IDEOGRAPHIC SPACE */            case 0x3000:    /* IDEOGRAPHIC SPACE */
1783            OK = TRUE;            OK = TRUE;
1784            break;            break;
1785    
1786            default:            default:
1787            OK = FALSE;            OK = FALSE;
1788            break;            break;
1789            }            }
1790    
1791          if (OK == (d == OP_HSPACE))          if (OK == (d == OP_HSPACE))
1792            {            {
1793            if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1794                codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1795              {              {
# Line 1442  for (;;) Line 1808  for (;;)
1808        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1809        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1810        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1811          { ADD_ACTIVE(state_offset + 6, 0); }          { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1812        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1813        if (clen > 0)        if (clen > 0)
1814          {          {
1815          BOOL OK;          BOOL OK;
1816          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1817          switch(code[4])          switch(code[1 + IMM2_SIZE + 1])
1818            {            {
1819            case PT_ANY:            case PT_ANY:
1820            OK = TRUE;            OK = TRUE;
1821            break;            break;
1822    
1823            case PT_LAMP:            case PT_LAMP:
1824            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1825                prop->chartype == ucp_Lt;
1826            break;            break;
1827    
1828            case PT_GC:            case PT_GC:
1829            OK = category == code[5];            OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1830            break;            break;
1831    
1832            case PT_PC:            case PT_PC:
1833            OK = chartype == code[5];            OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1834            break;            break;
1835    
1836            case PT_SC:            case PT_SC:
1837            OK = script == code[5];            OK = prop->script == code[1 + IMM2_SIZE + 2];
1838              break;
1839    
1840              /* These are specials for combination cases. */
1841    
1842              case PT_ALNUM:
1843              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1844                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1845              break;
1846    
1847              case PT_SPACE:    /* Perl space */
1848              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1849                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1850              break;
1851    
1852              case PT_PXSPACE:  /* POSIX space */
1853              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1854                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1855                   c == CHAR_FF || c == CHAR_CR;
1856              break;
1857    
1858              case PT_WORD:
1859              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1860                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1861                   c == CHAR_UNDERSCORE;
1862            break;            break;
1863    
1864            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1485  for (;;) Line 1876  for (;;)
1876              next_active_state--;              next_active_state--;
1877              }              }
1878            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1879              { ADD_NEW(state_offset + 6, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1880            else            else
1881              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1882            }            }
# Line 1498  for (;;) Line 1889  for (;;)
1889        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1890        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1891        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1892          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1893        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1894        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1895          {          {
1896          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1897          int ncount = 0;          int ncount = 0;
1898          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1899            {            {
# Line 1514  for (;;) Line 1905  for (;;)
1905            int nd;            int nd;
1906            int ndlen = 1;            int ndlen = 1;
1907            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1908            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1909            ncount++;            ncount++;
1910            nptr += ndlen;            nptr += ndlen;
1911            }            }
1912            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1913                reset_could_continue = TRUE;
1914          if (++count >= GET2(code, 1))          if (++count >= GET2(code, 1))
1915            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1916          else          else
1917            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1918          }          }
# Line 1532  for (;;) Line 1925  for (;;)
1925        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1926        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1927        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1928          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1929        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1930        if (clen > 0)        if (clen > 0)
1931          {          {
1932          int ncount = 0;          int ncount = 0;
1933          switch (c)          switch (c)
1934            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1935            case 0x000b:            case 0x000b:
1936            case 0x000c:            case 0x000c:
1937            case 0x0085:            case 0x0085:
1938            case 0x2028:            case 0x2028:
1939            case 0x2029:            case 0x2029:
1940              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1941              goto ANYNL03;
1942    
1943              case 0x000d:
1944              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1945              /* Fall through */
1946    
1947              ANYNL03:
1948              case 0x000a:
1949            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1950              {              {
1951              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1952              next_active_state--;              next_active_state--;
1953              }              }
1954            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1955              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1956            else            else
1957              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, ncount); }
1958            break;            break;
1959    
1960            default:            default:
1961            break;            break;
1962            }            }
# Line 1570  for (;;) Line 1969  for (;;)
1969        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1970        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1971        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1972          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1973        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1974        if (clen > 0)        if (clen > 0)
1975          {          {
1976          BOOL OK;          BOOL OK;
1977          switch (c)          switch (c)
1978            {            {
1979            case 0x000a:            case 0x000a:
# Line 1586  for (;;) Line 1985  for (;;)
1985            case 0x2029:            case 0x2029:
1986            OK = TRUE;            OK = TRUE;
1987            break;            break;
1988    
1989            default:            default:
1990            OK = FALSE;            OK = FALSE;
1991            }            }
1992    
1993          if (OK == (d == OP_VSPACE))          if (OK == (d == OP_VSPACE))
1994            {            {
1995            if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1996              {              {
1997              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1998              next_active_state--;              next_active_state--;
1999              }              }
2000            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2001              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2002            else            else
2003              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
2004            }            }
# Line 1612  for (;;) Line 2011  for (;;)
2011        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2012        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2013        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2014          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2015        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2016        if (clen > 0)        if (clen > 0)
2017          {          {
2018          BOOL OK;          BOOL OK;
2019          switch (c)          switch (c)
2020            {            {
2021            case 0x09:      /* HT */            case 0x09:      /* HT */
# Line 1640  for (;;) Line 2039  for (;;)
2039            case 0x3000:    /* IDEOGRAPHIC SPACE */            case 0x3000:    /* IDEOGRAPHIC SPACE */
2040            OK = TRUE;            OK = TRUE;
2041            break;            break;
2042    
2043            default:            default:
2044            OK = FALSE;            OK = FALSE;
2045            break;            break;
2046            }            }
2047    
2048          if (OK == (d == OP_HSPACE))          if (OK == (d == OP_HSPACE))
2049            {            {
2050            if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2051              {              {
2052              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
2053              next_active_state--;              next_active_state--;
2054              }              }
2055            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2056              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2057            else            else
2058              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
2059            }            }
# Line 1673  for (;;) Line 2072  for (;;)
2072        break;        break;
2073    
2074        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2075        case OP_CHARNC:        case OP_CHARI:
2076        if (clen == 0) break;        if (clen == 0) break;
2077    
2078  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2079        if (utf8)        if (utf)
2080          {          {
2081          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2082            {            {
2083            unsigned int othercase;            unsigned int othercase;
2084            if (c < 128) othercase = fcc[c]; else            if (c < 128)
2085                othercase = fcc[c];
2086            /* If we have Unicode property support, we can use it to test the            else
2087            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2088                other case of the character. */
2089  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2090            othercase = _pcre_ucp_othercase(c);              othercase = UCD_OTHERCASE(c);
2091  #else  #else
2092            othercase = NOTACHAR;              othercase = NOTACHAR;
2093  #endif  #endif
2094    
2095            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2096            }            }
2097          }          }
2098        else        else
2099  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2100          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2101          {          {
2102          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2103              { ADD_NEW(state_offset + 2, 0); }
2104          }          }
2105        break;        break;
2106    
# Line 1713  for (;;) Line 2112  for (;;)
2112        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2113    
2114        case OP_EXTUNI:        case OP_EXTUNI:
2115        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2116          {          {
2117          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
2118          int ncount = 0;          int ncount = 0;
2119          while (nptr < end_subject)          while (nptr < end_subject)
2120            {            {
2121            int nclen = 1;            int nclen = 1;
2122            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
2123            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
2124            ncount++;            ncount++;
2125            nptr += nclen;            nptr += nclen;
2126            }            }
2127            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2128                reset_could_continue = TRUE;
2129          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2130          }          }
2131        break;        break;
# Line 1738  for (;;) Line 2139  for (;;)
2139        case OP_ANYNL:        case OP_ANYNL:
2140        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2141          {          {
         case 0x000a:  
2142          case 0x000b:          case 0x000b:
2143          case 0x000c:          case 0x000c:
2144          case 0x0085:          case 0x0085:
2145          case 0x2028:          case 0x2028:
2146          case 0x2029:          case 0x2029:
2147            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2148    
2149            case 0x000a:
2150          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2151          break;          break;
2152    
2153          case 0x000d:          case 0x000d:
2154          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 >= end_subject)
2155              {
2156              ADD_NEW(state_offset + 1, 0);
2157              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2158                reset_could_continue = TRUE;
2159              }
2160            else if (ptr[1] == 0x0a)
2161            {            {
2162            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2163            }            }
2164          else          else
2165            {            {
2166            ADD_NEW(state_offset + 1, 0);            ADD_NEW(state_offset + 1, 0);
2167            }            }
2168          break;          break;
2169          }          }
2170        break;        break;
# Line 1771  for (;;) Line 2181  for (;;)
2181          case 0x2028:          case 0x2028:
2182          case 0x2029:          case 0x2029:
2183          break;          break;
2184    
2185          default:          default:
2186          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2187          break;          break;
2188          }          }
# Line 1791  for (;;) Line 2201  for (;;)
2201          case 0x2029:          case 0x2029:
2202          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2203          break;          break;
2204    
2205          default: break;          default: break;
2206          }          }
2207        break;        break;
# Line 1820  for (;;) Line 2230  for (;;)
2230          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2231          case 0x3000:    /* IDEOGRAPHIC SPACE */          case 0x3000:    /* IDEOGRAPHIC SPACE */
2232          break;          break;
2233    
2234          default:          default:
2235          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2236          break;          break;
2237          }          }
# Line 1856  for (;;) Line 2266  for (;;)
2266        break;        break;
2267    
2268        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2269        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character casefully. This is only used for
2270        characters, that is, we know that d < 256. The character we are        one-byte characters, that is, we know that d < 256. The character we are
2271        checking (c) can be multibyte. */        checking (c) can be multibyte. */
2272    
2273        case OP_NOT:        case OP_NOT:
2274        if (clen > 0)        if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2275          {        break;
2276          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;  
2277          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }        /*-----------------------------------------------------------------*/
2278          }        /* Match a negated single character caselessly. This is only used for
2279          one-byte characters, that is, we know that d < 256. The character we are
2280          checking (c) can be multibyte. */
2281    
2282          case OP_NOTI:
2283          if (clen > 0 && c != d && c != fcc[d])
2284            { ADD_NEW(state_offset + dlen + 1, 0); }
2285        break;        break;
2286    
2287        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2288          case OP_PLUSI:
2289          case OP_MINPLUSI:
2290          case OP_POSPLUSI:
2291          case OP_NOTPLUSI:
2292          case OP_NOTMINPLUSI:
2293          case OP_NOTPOSPLUSI:
2294          caseless = TRUE;
2295          codevalue -= OP_STARI - OP_STAR;
2296    
2297          /* Fall through */
2298        case OP_PLUS:        case OP_PLUS:
2299        case OP_MINPLUS:        case OP_MINPLUS:
2300        case OP_POSPLUS:        case OP_POSPLUS:
# Line 1880  for (;;) Line 2306  for (;;)
2306        if (clen > 0)        if (clen > 0)
2307          {          {
2308          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2309          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2310            {            {
2311  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2312            if (utf8 && d >= 128)            if (utf && d >= 128)
2313              {              {
2314  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2315              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2316  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2317              }              }
2318            else            else
2319  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2320            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2321            }            }
2322          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2323            {            {
# Line 1908  for (;;) Line 2334  for (;;)
2334        break;        break;
2335    
2336        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2337          case OP_QUERYI:
2338          case OP_MINQUERYI:
2339          case OP_POSQUERYI:
2340          case OP_NOTQUERYI:
2341          case OP_NOTMINQUERYI:
2342          case OP_NOTPOSQUERYI:
2343          caseless = TRUE;
2344          codevalue -= OP_STARI - OP_STAR;
2345          /* Fall through */
2346        case OP_QUERY:        case OP_QUERY:
2347        case OP_MINQUERY:        case OP_MINQUERY:
2348        case OP_POSQUERY:        case OP_POSQUERY:
# Line 1918  for (;;) Line 2353  for (;;)
2353        if (clen > 0)        if (clen > 0)
2354          {          {
2355          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2356          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2357            {            {
2358  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2359            if (utf8 && d >= 128)            if (utf && d >= 128)
2360              {              {
2361  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2362              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2363  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2364              }              }
2365            else            else
2366  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2367            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2368            }            }
2369          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2370            {            {
# Line 1944  for (;;) Line 2379  for (;;)
2379        break;        break;
2380    
2381        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2382          case OP_STARI:
2383          case OP_MINSTARI:
2384          case OP_POSSTARI:
2385          case OP_NOTSTARI:
2386          case OP_NOTMINSTARI:
2387          case OP_NOTPOSSTARI:
2388          caseless = TRUE;
2389          codevalue -= OP_STARI - OP_STAR;
2390          /* Fall through */
2391        case OP_STAR:        case OP_STAR:
2392        case OP_MINSTAR:        case OP_MINSTAR:
2393        case OP_POSSTAR:        case OP_POSSTAR:
# Line 1954  for (;;) Line 2398  for (;;)
2398        if (clen > 0)        if (clen > 0)
2399          {          {
2400          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2401          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2402            {            {
2403  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2404            if (utf8 && d >= 128)            if (utf && d >= 128)
2405              {              {
2406  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2407              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2408  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2409              }              }
2410            else            else
2411  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2412            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2413            }            }
2414          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2415            {            {
# Line 1980  for (;;) Line 2424  for (;;)
2424        break;        break;
2425    
2426        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2427          case OP_EXACTI:
2428          case OP_NOTEXACTI:
2429          caseless = TRUE;
2430          codevalue -= OP_STARI - OP_STAR;
2431          /* Fall through */
2432        case OP_EXACT:        case OP_EXACT:
2433        case OP_NOTEXACT:        case OP_NOTEXACT:
2434        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2435        if (clen > 0)        if (clen > 0)
2436          {          {
2437          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2438          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2439            {            {
2440  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2441            if (utf8 && d >= 128)            if (utf && d >= 128)
2442              {              {
2443  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2444              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2445  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2446              }              }
2447            else            else
2448  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2449            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2450            }            }
2451          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2452            {            {
2453            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2454              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2455            else            else
2456              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2457            }            }
# Line 2010  for (;;) Line 2459  for (;;)
2459        break;        break;
2460    
2461        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2462          case OP_UPTOI:
2463          case OP_MINUPTOI:
2464          case OP_POSUPTOI:
2465          case OP_NOTUPTOI:
2466          case OP_NOTMINUPTOI:
2467          case OP_NOTPOSUPTOI:
2468          caseless = TRUE;
2469          codevalue -= OP_STARI - OP_STAR;
2470          /* Fall through */
2471        case OP_UPTO:        case OP_UPTO:
2472        case OP_MINUPTO:        case OP_MINUPTO:
2473        case OP_POSUPTO:        case OP_POSUPTO:
2474        case OP_NOTUPTO:        case OP_NOTUPTO:
2475        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2476        case OP_NOTPOSUPTO:        case OP_NOTPOSUPTO:
2477        ADD_ACTIVE(state_offset + dlen + 3, 0);        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2478        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2479        if (clen > 0)        if (clen > 0)
2480          {          {
2481          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2482          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2483            {            {
2484  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2485            if (utf8 && d >= 128)            if (utf && d >= 128)
2486              {              {
2487  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2488              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2489  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2490              }              }
2491            else            else
2492  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2493            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2494            }            }
2495          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2496            {            {
# Line 2042  for (;;) Line 2500  for (;;)
2500              next_active_state--;              next_active_state--;
2501              }              }
2502            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2503              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2504            else            else
2505              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2506            }            }
# Line 2059  for (;;) Line 2517  for (;;)
2517          {          {
2518          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2519          int next_state_offset;          int next_state_offset;
2520          const uschar *ecode;          const pcre_uchar *ecode;
2521    
2522          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2523          can set isinclass from it. */          can set isinclass from it. */
2524    
2525          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2526            {            {
2527            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2528            if (clen > 0)            if (clen > 0)
2529              {              {
2530              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2531                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2532              }              }
2533            }            }
2534    
# Line 2081  for (;;) Line 2539  for (;;)
2539          else          else
2540           {           {
2541           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2542           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2543           }           }
2544    
2545          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2546          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2547          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2548    
2549          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2550    
2551          switch (*ecode)          switch (*ecode)
2552            {            {
# Line 2115  for (;;) Line 2573  for (;;)
2573            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2574            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2575            if (count >= GET2(ecode, 1))            if (count >= GET2(ecode, 1))
2576              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2577            if (isinclass)            if (isinclass)
2578              {              {
2579              int max = GET2(ecode, 3);              int max = GET2(ecode, 1 + IMM2_SIZE);
2580              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2581                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2582              else              else
2583                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2584              }              }
# Line 2135  for (;;) Line 2593  for (;;)
2593    
2594  /* ========================================================================== */  /* ========================================================================== */
2595        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2596        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2597          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2598          though the other "backtracking verbs" are not supported. */
2599    
2600          case OP_FAIL:
2601          forced_fail++;    /* Count FAILs for multiple states */
2602          break;
2603    
2604        case OP_ASSERT:        case OP_ASSERT:
2605        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 2145  for (;;) Line 2609  for (;;)
2609          int rc;          int rc;
2610          int local_offsets[2];          int local_offsets[2];
2611          int local_workspace[1000];          int local_workspace[1000];
2612          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2613    
2614          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2615    
# Line 2153  for (;;) Line 2617  for (;;)
2617            md,                                   /* static match data */            md,                                   /* static match data */
2618            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2619            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2620            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2621            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2622            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2623            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2624            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2625            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2626    
2627            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2628          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2629              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2630          }          }
2631        break;        break;
2632    
# Line 2173  for (;;) Line 2636  for (;;)
2636          {          {
2637          int local_offsets[1000];          int local_offsets[1000];
2638          int local_workspace[1000];          int local_workspace[1000];
2639          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2640            int condcode;
2641    
2642            /* Because of the way auto-callout works during compile, a callout item
2643            is inserted between OP_COND and an assertion condition. This does not
2644            happen for the other conditions. */
2645    
2646            if (code[LINK_SIZE+1] == OP_CALLOUT)
2647              {
2648              rrc = 0;
2649              if (PUBL(callout) != NULL)
2650                {
2651                PUBL(callout_block) cb;
2652                cb.version          = 1;   /* Version 1 of the callout block */
2653                cb.callout_number   = code[LINK_SIZE+2];
2654                cb.offset_vector    = offsets;
2655    #ifdef COMPILE_PCRE8
2656                cb.subject          = (PCRE_SPTR)start_subject;
2657    #else
2658                cb.subject          = (PCRE_SPTR16)start_subject;
2659    #endif
2660                cb.subject_length   = (int)(end_subject - start_subject);
2661                cb.start_match      = (int)(current_subject - start_subject);
2662                cb.current_position = (int)(ptr - start_subject);
2663                cb.pattern_position = GET(code, LINK_SIZE + 3);
2664                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2665                cb.capture_top      = 1;
2666                cb.capture_last     = -1;
2667                cb.callout_data     = md->callout_data;
2668                cb.mark             = NULL;   /* No (*MARK) support */
2669                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2670                }
2671              if (rrc > 0) break;                      /* Fail this thread */
2672              code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2673              }
2674    
2675            condcode = code[LINK_SIZE+1];
2676    
2677          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2678    
2679          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2680              return PCRE_ERROR_DFA_UCOND;
2681    
2682          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2683    
2684          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2685            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2686    
2687          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2688          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2689          recursed groups. */          recursed groups. */
2690    
2691          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2692            {            {
2693            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE + 2);
2694            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2695            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (md->recursive != NULL)
2696              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2697              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2698            }            }
2699    
2700          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2203  for (;;) Line 2702  for (;;)
2702          else          else
2703            {            {
2704            int rc;            int rc;
2705            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2706            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2707    
2708            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2709    
# Line 2212  for (;;) Line 2711  for (;;)
2711              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2712              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2713              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2714              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2715              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2716              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2717              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2718              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2719              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2720    
2721              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2722            if ((rc >= 0) ==            if ((rc >= 0) ==
2723                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2724              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2725            else            else
2726              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2727            }            }
2728          }          }
2729        break;        break;
# Line 2233  for (;;) Line 2731  for (;;)
2731        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2732        case OP_RECURSE:        case OP_RECURSE:
2733          {          {
2734            dfa_recursion_info *ri;
2735          int local_offsets[1000];          int local_offsets[1000];
2736          int local_workspace[1000];          int local_workspace[1000];
2737            const pcre_uchar *callpat = start_code + GET(code, 1);
2738            int recno = (callpat == md->start_code)? 0 :
2739              GET2(callpat, 1 + LINK_SIZE);
2740          int rc;          int rc;
2741    
2742          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2743            recursing + 1));  
2744            /* Check for repeating a recursion without advancing the subject
2745            pointer. This should catch convoluted mutual recursions. (Some simple
2746            cases are caught at compile time.) */
2747    
2748            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2749              if (recno == ri->group_num && ptr == ri->subject_position)
2750                return PCRE_ERROR_RECURSELOOP;
2751    
2752            /* Remember this recursion and where we started it so as to
2753            catch infinite loops. */
2754    
2755            new_recursive.group_num = recno;
2756            new_recursive.subject_position = ptr;
2757            new_recursive.prevrec = md->recursive;
2758            md->recursive = &new_recursive;
2759    
2760          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2761            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2762            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2763            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2764            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2765            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2766            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2767            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2768            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2769            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
2770            rlevel,                               /* function recursion level */  
2771            recursing + 1);                       /* regex recurse level */          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2772    
2773          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2774            recursing + 1, rc));            rc));
2775    
2776          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2777    
# Line 2268  for (;;) Line 2785  for (;;)
2785            {            {
2786            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2787              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2788              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2789              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #ifdef SUPPORT_UTF
2790                const pcre_uchar *p = start_subject + local_offsets[rc];
2791                const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2792                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2793    #endif
2794              if (charcount > 0)              if (charcount > 0)
2795                {                {
2796                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 2287  for (;;) Line 2806  for (;;)
2806        break;        break;
2807    
2808        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2809          case OP_BRAPOS:
2810          case OP_SBRAPOS:
2811          case OP_CBRAPOS:
2812          case OP_SCBRAPOS:
2813          case OP_BRAPOSZERO:
2814            {
2815            int charcount, matched_count;
2816            const pcre_uchar *local_ptr = ptr;
2817            BOOL allow_zero;
2818    
2819            if (codevalue == OP_BRAPOSZERO)
2820              {
2821              allow_zero = TRUE;
2822              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2823              }
2824            else allow_zero = FALSE;
2825    
2826            /* Loop to match the subpattern as many times as possible as if it were
2827            a complete pattern. */
2828    
2829            for (matched_count = 0;; matched_count++)
2830              {
2831              int local_offsets[2];
2832              int local_workspace[1000];
2833    
2834              int rc = internal_dfa_exec(
2835                md,                                   /* fixed match data */
2836                code,                                 /* this subexpression's code */
2837                local_ptr,                            /* where we currently are */
2838                (int)(ptr - start_subject),           /* start offset */
2839                local_offsets,                        /* offset vector */
2840                sizeof(local_offsets)/sizeof(int),    /* size of same */
2841                local_workspace,                      /* workspace vector */
2842                sizeof(local_workspace)/sizeof(int),  /* size of same */
2843                rlevel);                              /* function recursion level */
2844    
2845              /* Failed to match */
2846    
2847              if (rc < 0)
2848                {
2849                if (rc != PCRE_ERROR_NOMATCH) return rc;
2850                break;
2851                }
2852    
2853              /* Matched: break the loop if zero characters matched. */
2854    
2855              charcount = local_offsets[1] - local_offsets[0];
2856              if (charcount == 0) break;
2857              local_ptr += charcount;    /* Advance temporary position ptr */
2858              }
2859    
2860            /* At this point we have matched the subpattern matched_count
2861            times, and local_ptr is pointing to the character after the end of the
2862            last match. */
2863    
2864            if (matched_count > 0 || allow_zero)
2865              {
2866              const pcre_uchar *end_subpattern = code;
2867              int next_state_offset;
2868    
2869              do { end_subpattern += GET(end_subpattern, 1); }
2870                while (*end_subpattern == OP_ALT);
2871              next_state_offset =
2872                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2873    
2874              /* Optimization: if there are no more active states, and there
2875              are no new states yet set up, then skip over the subject string
2876              right here, to save looping. Otherwise, set up the new state to swing
2877              into action when the end of the matched substring is reached. */
2878    
2879              if (i + 1 >= active_count && new_count == 0)
2880                {
2881                ptr = local_ptr;
2882                clen = 0;
2883                ADD_NEW(next_state_offset, 0);
2884                }
2885              else
2886                {
2887                const pcre_uchar *p = ptr;
2888                const pcre_uchar *pp = local_ptr;
2889                charcount = (int)(pp - p);
2890    #ifdef SUPPORT_UTF
2891                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2892    #endif
2893                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2894                }
2895              }
2896            }
2897          break;
2898    
2899          /*-----------------------------------------------------------------*/
2900        case OP_ONCE:        case OP_ONCE:
2901          case OP_ONCE_NC:
2902          {          {
2903          int local_offsets[2];          int local_offsets[2];
2904          int local_workspace[1000];          int local_workspace[1000];
# Line 2296  for (;;) Line 2907  for (;;)
2907            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2908            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2909            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2910            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2911            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2912            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2913            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2914            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2915            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2916    
2917          if (rc >= 0)          if (rc >= 0)
2918            {            {
2919            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2920            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2921            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2922    
2923            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2924              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2925            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2926                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2927    
2928            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2929            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 2321  for (;;) Line 2931  for (;;)
2931    
2932            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2933                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2934              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2935    
2936            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2937            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 2336  for (;;) Line 2946  for (;;)
2946            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2947            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2948            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2949            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2950    
2951            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2952              {              {
# Line 2359  for (;;) Line 2969  for (;;)
2969              }              }
2970            else            else
2971              {              {
2972              const uschar *p = start_subject + local_offsets[0];  #ifdef SUPPORT_UTF
2973              const uschar *pp = start_subject + local_offsets[1];              const pcre_uchar *p = start_subject + local_offsets[0];
2974              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;              const pcre_uchar *pp = start_subject + local_offsets[1];
2975                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2976    #endif
2977              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2978              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2979                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2980              }              }
   
2981            }            }
2982          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2983          }          }
# Line 2377  for (;;) Line 2988  for (;;)
2988        /* Handle callouts */        /* Handle callouts */
2989    
2990        case OP_CALLOUT:        case OP_CALLOUT:
2991        if (pcre_callout != NULL)        rrc = 0;
2992          if (PUBL(callout) != NULL)
2993          {          {
2994          int rrc;          PUBL(callout_block) cb;
         pcre_callout_block cb;  
2995          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2996          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2997          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2998    #ifdef COMPILE_PCRE8
2999          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
3000          cb.subject_length   = end_subject - start_subject;  #else
3001          cb.start_match      = current_subject - start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
3002          cb.current_position = ptr - start_subject;  #endif
3003            cb.subject_length   = (int)(end_subject - start_subject);
3004            cb.start_match      = (int)(current_subject - start_subject);
3005            cb.current_position = (int)(ptr - start_subject);
3006          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
3007          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
3008          cb.capture_top      = 1;          cb.capture_top      = 1;
3009          cb.capture_last     = -1;          cb.capture_last     = -1;
3010          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
3011          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
3012          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3013          }          }
3014          if (rrc == 0)
3015            { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3016        break;        break;
3017    
3018    
# Line 2411  for (;;) Line 3028  for (;;)
3028    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
3029    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
3030    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
3031    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
3032    
3033      The "forced_fail" variable counts the number of (*F) encountered for the
3034      character. If it is equal to the original active_count (saved in
3035      workspace[1]) it means that (*F) was found on every active state. In this
3036      case we don't want to give a partial match.
3037    
3038      The "could_continue" variable is true if a state could have continued but
3039      for the fact that the end of the subject was reached. */
3040    
3041    if (new_count <= 0)    if (new_count <= 0)
3042      {      {
3043      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
3044          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on, and */
3045          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
3046          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
3047          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3048            ||                                           /* or... */
3049            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3050             match_count < 0)                            /* no matches */
3051            ) &&                                         /* And... */
3052            (
3053            partial_newline ||                           /* Either partial NL */
3054              (                                          /* or ... */
3055              ptr >= end_subject &&                /* End of subject and */
3056              ptr > md->start_used_ptr)            /* Inspected non-empty string */
3057              )
3058            )
3059        {        {
3060        if (offsetcount >= 2)        if (offsetcount >= 2)
3061          {          {
3062          offsets[0] = current_subject - start_subject;          offsets[0] = (int)(md->start_used_ptr - start_subject);
3063          offsets[1] = end_subject - start_subject;          offsets[1] = (int)(end_subject - start_subject);
3064          }          }
3065        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
3066        }        }
# Line 2478  Returns:          > 0 => number of match Line 3114  Returns:          > 0 => number of match
3114                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3115  */  */
3116    
3117  PCRE_EXP_DEFN int  #ifdef COMPILE_PCRE8
3118    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3119  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3120    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3121    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3122    #else
3123    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3124    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3125      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3126      int offsetcount, int *workspace, int wscount)
3127    #endif
3128  {  {
3129  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3130  dfa_match_data match_block;  dfa_match_data match_block;
3131  dfa_match_data *md = &match_block;  dfa_match_data *md = &match_block;
3132  BOOL utf8, anchored, startline, firstline;  BOOL utf, anchored, startline, firstline;
3133  const uschar *current_subject, *end_subject, *lcc;  const pcre_uchar *current_subject, *end_subject;
   
 pcre_study_data internal_study;  
3134  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3135    
3136  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3137  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3138  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3139  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3140  int first_byte = -1;  pcre_uchar first_char = 0;
3141  int req_byte = -1;  pcre_uchar first_char2 = 0;
3142  int req_byte2 = -1;  pcre_uchar req_char = 0;
3143    pcre_uchar req_char2 = 0;
3144  int newline;  int newline;
3145    
3146  /* Plausibility checks */  /* Plausibility checks */
# Line 2509  if (re == NULL || subject == NULL || wor Line 3150  if (re == NULL || subject == NULL || wor
3150     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3151  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3152  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3153    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3154    
3155  /* We need to find the pointer to any study data before we test for byte  /* We need to find the pointer to any study data before we test for byte
3156  flipping, so we scan the extra_data block first. This may set two fields in the  flipping, so we scan the extra_data block first. This may set two fields in the
# Line 2533  if (extra_data != NULL) Line 3175  if (extra_data != NULL)
3175    }    }
3176    
3177  /* Check that the first field in the block is the magic number. If it is not,  /* Check that the first field in the block is the magic number. If it is not,
3178  test for a regex that was compiled on a host of opposite endianness. If this is  return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3179  the case, flipped values are put in internal_re and internal_study if there was  REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3180  study data too. */  means that the pattern is likely compiled with different endianness. */
3181    
3182  if (re->magic_number != MAGIC_NUMBER)  if (re->magic_number != MAGIC_NUMBER)
3183    {    return re->magic_number == REVERSED_MAGIC_NUMBER?
3184    re = _pcre_try_flipped(re, &internal_re, study, &internal_study);      PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3185    if (re == NULL) return PCRE_ERROR_BADMAGIC;  if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
   if (study != NULL) study = &internal_study;  
   }  
3186    
3187  /* Set some local values */  /* Set some local values */
3188    
3189  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3190  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3191  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3192    
3193  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3194  utf8 = (re->options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3195    utf = (re->options & PCRE_UTF8) != 0;
3196  #else  #else
3197  utf8 = FALSE;  utf = FALSE;
3198  #endif  #endif
3199    
3200  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
# Line 2561  anchored = (options & (PCRE_ANCHORED|PCR Line 3202  anchored = (options & (PCRE_ANCHORED|PCR
3202    
3203  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3204    
3205  md->start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3206      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3207  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3208  md->end_subject = end_subject;  md->end_subject = end_subject;
3209    md->start_offset = start_offset;
3210  md->moptions = options;  md->moptions = options;
3211  md->poptions = re->options;  md->poptions = re->options;
3212    
3213    /* If the BSR option is not set at match time, copy what was set
3214    at compile time. */
3215    
3216    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3217      {
3218      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3219        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3220    #ifdef BSR_ANYCRLF
3221      else md->moptions |= PCRE_BSR_ANYCRLF;
3222    #endif
3223      }
3224    
3225  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
3226  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
3227    
# Line 2575  switch ((((options & PCRE_NEWLINE_BITS) Line 3229  switch ((((options & PCRE_NEWLINE_BITS)
3229           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
3230    {    {
3231    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
3232    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3233    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3234    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
3235         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3236    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
3237    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3238    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2611  else Line 3265  else
3265  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF string if required. On failure, if offsetcount >= 2, the error
3266  back the character offset. */  offset is passed back in offsets[0] and the reason code in offsets[1]. */
3267    
3268  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3269  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3270    {    {
3271    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3272      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3273    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3274      {      {
3275      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3276        {        {
3277        tb &= 0xc0;        offsets[0] = erroroffset;
3278        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3279        }        }
3280        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3281          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3282      }      }
3283      if (start_offset > 0 && start_offset < length &&
3284            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3285        return PCRE_ERROR_BADUTF8_OFFSET;
3286    }    }
3287  #endif  #endif
3288    
# Line 2632  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3290  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3290  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3291  in other programs later. */  in other programs later. */
3292    
3293  if (md->tables == NULL) md->tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3294    
3295  /* The lower casing table and the "must be at the start of a line" flag are  /* The "must be at the start of a line" flags are used in a loop when finding
3296  used in a loop when finding where to start. */  where to start. */
3297    
3298  lcc = md->tables + lcc_offset;  startline = (re->flags & PCRE_STARTLINE) != 0;
 startline = (re->options & PCRE_STARTLINE) != 0;  
3299  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3300    
3301  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_char value is
# Line 2649  studied, there may be a bitmap of possib Line 3306  studied, there may be a bitmap of possib
3306    
3307  if (!anchored)  if (!anchored)
3308    {    {
3309    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3310      {      {
3311      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3312      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = (pcre_uchar)(re->first_char);
3313        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3314          {
3315          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3316    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3317          if (utf && first_char > 127)
3318            first_char2 = UCD_OTHERCASE(first_char);
3319    #endif
3320          }
3321      }      }
3322    else    else
3323      {      {
3324      if (startline && study != NULL &&      if (!startline && study != NULL &&
3325           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3326        start_bits = study->start_bits;        start_bits = study->start_bits;
3327      }      }
3328    }    }
# Line 2666  if (!anchored) Line 3330  if (!anchored)
3330  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
3331  character" set. */  character" set. */
3332    
3333  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3334    {    {
3335    req_byte = re->req_byte & 255;    has_req_char = TRUE;
3336    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_char = req_char2 = (pcre_uchar)(re->req_char);
3337    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3338        {
3339        req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3340    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3341        if (utf && req_char > 127)
3342          req_char2 = UCD_OTHERCASE(req_char);
3343    #endif
3344        }
3345    }    }
3346    
3347  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
3348  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
3349  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
3350    
3351  for (;;)  for (;;)
3352    {    {
# Line 2684  for (;;) Line 3354  for (;;)
3354    
3355    if ((options & PCRE_DFA_RESTART) == 0)    if ((options & PCRE_DFA_RESTART) == 0)
3356      {      {
3357      const uschar *save_end_subject = end_subject;      const pcre_uchar *save_end_subject = end_subject;
3358    
3359      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
3360      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
3361      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
3362      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
3363    
3364      if (firstline)      if (firstline)
3365        {        {
3366        const uschar *t = current_subject;        PCRE_PUCHAR t = current_subject;
3367    #ifdef SUPPORT_UTF
3368          if (utf)
3369            {
3370            while (t < md->end_subject && !IS_NEWLINE(t))
3371              {
3372              t++;
3373              ACROSSCHAR(t < end_subject, *t, t++);
3374              }
3375            }
3376          else
3377    #endif
3378        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3379        end_subject = t;        end_subject = t;
3380        }        }
3381    
3382      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
3383        starting point is not found. However, there is an option that disables
3384        these, for testing and for ensuring that all callouts do actually occur.
3385        The option can be set in the regex by (*NO_START_OPT) or passed in
3386        match-time options. */
3387    
3388        if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3389        {        {
3390        if (first_byte_caseless)        /* Advance to a known first char. */
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
3391    
3392      /* Or to just after a linebreak for a multiline match if possible */        if (has_first_char)
3393            {
3394            if (first_char != first_char2)
3395              while (current_subject < end_subject &&
3396                  *current_subject != first_char && *current_subject != first_char2)
3397                current_subject++;
3398            else
3399              while (current_subject < end_subject &&
3400                     *current_subject != first_char)
3401                current_subject++;
3402            }
3403    
3404      else if (startline)        /* Or to just after a linebreak for a multiline match if possible */
3405        {  
3406        if (current_subject > md->start_subject + start_offset)        else if (startline)
3407          {          {
3408          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (current_subject > md->start_subject + start_offset)
3409            current_subject++;            {
3410    #ifdef SUPPORT_UTF
3411              if (utf)
3412                {
3413                while (current_subject < end_subject &&
3414                       !WAS_NEWLINE(current_subject))
3415                  {
3416                  current_subject++;
3417                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3418                    current_subject++);
3419                  }
3420                }
3421              else
3422    #endif
3423              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3424                current_subject++;
3425    
3426              /* If we have just passed a CR and the newline option is ANY or
3427              ANYCRLF, and we are now at a LF, advance the match position by one
3428              more character. */
3429    
3430          /* If we have just passed a CR and the newline option is ANY or            if (current_subject[-1] == CHAR_CR &&
3431          ANYCRLF, and we are now at a LF, advance the match position by one more                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3432          character. */                 current_subject < end_subject &&
3433                   *current_subject == CHAR_NL)
3434          if (current_subject[-1] == '\r' &&              current_subject++;
3435               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&            }
              current_subject < end_subject &&  
              *current_subject == '\n')  
           current_subject++;  
3436          }          }
       }  
3437    
3438      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
3439    
3440      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
3441          {          {
3442          register unsigned int c = *current_subject;          while (current_subject < end_subject)
3443          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
3444              register unsigned int c = *current_subject;
3445    #ifndef COMPILE_PCRE8
3446              if (c > 255) c = 255;
3447    #endif
3448              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3449                {
3450                current_subject++;
3451    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3452                /* In non 8-bit mode, the iteration will stop for
3453                characters > 255 at the beginning or not stop at all. */
3454                if (utf)
3455                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3456                    current_subject++);
3457    #endif
3458                }
3459            else break;            else break;
3460              }
3461          }          }
3462        }        }
3463    
3464      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3465    
3466      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
   
   /* If req_byte is set, we know that that character must appear in the subject  
   for the match to succeed. If the first character is set, req_byte must be  
   later in the subject; otherwise the test starts at the match point. This  
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested.  
   */  
   
   if (req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
3467    
3468      /* We don't need to repeat the search if we haven't yet reached the      /* The following two optimizations are disabled for partial matching or if
3469      place we found it at last time. */      disabling is explicitly requested (and of course, by the test above, this
3470        code is not obeyed when restarting after a partial match). */
3471    
3472      if (p > req_byte_ptr)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3473            (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3474        {        {
3475        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3476          {        is a lower bound; no actual string of that length may actually match the
3477          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3478            {        bytes to avoid spending too much time in this optimization. */
3479            register int pp = *p++;  
3480            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3481            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3482          }          return PCRE_ERROR_NOMATCH;
3483        else  
3484          /* If req_char is set, we know that that character must appear in the
3485          subject for the match to succeed. If the first character is set, req_char
3486          must be later in the subject; otherwise the test starts at the match
3487          point. This optimization can save a huge amount of work in patterns with
3488          nested unlimited repeats that aren't going to match. Writing separate
3489          code for cased/caseless versions makes it go faster, as does using an
3490          autoincrement and backing off on a match.
3491    
3492          HOWEVER: when the subject string is very, very long, searching to its end
3493          can take a long time, and give bad performance on quite ordinary
3494          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3495          string... so we don't do this when the string is sufficiently long. */
3496    
3497          if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3498          {          {
3499          while (p < end_subject)          register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3500    
3501            /* We don't need to repeat the search if we haven't yet reached the
3502            place we found it at last time. */
3503    
3504            if (p > req_char_ptr)
3505            {            {
3506            if (*p++ == req_byte) { p--; break; }            if (req_char != req_char2)
3507            }              {
3508          }              while (p < end_subject)
3509                  {
3510                  register int pp = *p++;
3511                  if (pp == req_char || pp == req_char2) { p--; break; }
3512                  }
3513                }
3514              else
3515                {
3516                while (p < end_subject)
3517                  {
3518                  if (*p++ == req_char) { p--; break; }
3519                  }
3520                }
3521    
3522        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3523        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3524    
3525        if (p >= end_subject) break;            if (p >= end_subject) break;
3526    
3527        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3528        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3529        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3530    
3531        req_byte_ptr = p;            req_char_ptr = p;
3532              }
3533            }
3534        }        }
3535      }      }   /* End of optimizations that are done when not restarting */
3536    
3537    /* OK, now we can do the business */    /* OK, now we can do the business */
3538    
3539      md->start_used_ptr = current_subject;
3540      md->recursive = NULL;
3541    
3542    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3543      md,                                /* fixed match data */      md,                                /* fixed match data */
3544      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2815  for (;;) Line 3548  for (;;)
3548      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3549      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3550      wscount,                           /* size of same */      wscount,                           /* size of same */
3551      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3552    
3553    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3554    on only if not anchored. */    on only if not anchored. */
# Line 2829  for (;;) Line 3560  for (;;)
3560    
3561    if (firstline && IS_NEWLINE(current_subject)) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3562    current_subject++;    current_subject++;
3563    if (utf8)  #ifdef SUPPORT_UTF
3564      if (utf)
3565      {      {
3566      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      ACROSSCHAR(current_subject < end_subject, *current_subject,
3567        current_subject++;        current_subject++);
3568      }      }
3569    #endif
3570    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3571    
3572    /* If we have just passed a CR and the newline option is CRLF or ANY or    /* If we have just passed a CR and we are now at a LF, and the pattern does
3573    ANYCRLF, and we are now at a LF, advance the match position by one more    not contain any explicit matches for \r or \n, and the newline option is CRLF
3574    character. */    or ANY or ANYCRLF, advance the match position by one more character. */
3575    
3576    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
3577         (md->nltype == NLTYPE_ANY ||        current_subject < end_subject &&
3578          md->nltype == NLTYPE_ANYCRLF ||        *current_subject == CHAR_NL &&
3579          md->nllen == 2) &&        (re->flags & PCRE_HASCRORLF) == 0 &&
3580         current_subject < end_subject &&          (md->nltype == NLTYPE_ANY ||
3581         *current_subject == '\n')           md->nltype == NLTYPE_ANYCRLF ||
3582             md->nllen == 2))
3583      current_subject++;      current_subject++;
3584    
3585    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.178  
changed lines
  Added in v.919

  ViewVC Help
Powered by ViewVC 1.1.5