/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 530 by ph10, Tue Jun 1 13:42:06 2010 UTC | revision 1364 by ph10, Sat Oct 5 15:45:11 2013 UTC
# Line 7  and semantics are as close as possible t Line 7  and semantics are as close as possible t
7  below for why this module is different).  below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2013 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 38  POSSIBILITY OF SUCH DAMAGE. Line 38  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
39  */  */
40    
   
41  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
42  alternative matching function that uses a sort of DFA algorithm (not a true  alternative matching function that uses a sort of DFA algorithm (not a true
43  FSM). This is NOT Perl- compatible, but it has advantages in certain  FSM). This is NOT Perl-compatible, but it has advantages in certain
44  applications. */  applications. */
45    
46    
# Line 113  small value. Non-zero values in the tabl Line 112  small value. Non-zero values in the tabl
112  the character is to be found. ***NOTE*** If the start of this table is  the character is to be found. ***NOTE*** If the start of this table is
113  modified, the three tables that follow must also be modified. */  modified, the three tables that follow must also be modified. */
114    
115  static const uschar coptable[] = {  static const pcre_uint8 coptable[] = {
116    0,                             /* End                                    */    0,                             /* End                                    */
117    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
# Line 121  static const uschar coptable[] = { Line 120  static const uschar coptable[] = {
120    0, 0,                          /* \P, \p                                 */    0, 0,                          /* \P, \p                                 */
121    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122    0,                             /* \X                                     */    0,                             /* \X                                     */
123    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
124    1,                             /* Char                                   */    1,                             /* Char                                   */
125    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
126    1,                             /* not                                    */    1,                             /* not                                    */
127      1,                             /* noti                                   */
128    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
129    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1+IMM2_SIZE,                   /* exact                                  */
132      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135      1+IMM2_SIZE,                   /* exact I                                */
136      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
138    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1+IMM2_SIZE,                   /* NOT exact                              */
141      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144      1+IMM2_SIZE,                   /* NOT exact I                            */
145      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
147    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149    1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */    1+IMM2_SIZE,                   /* Type exact                             */
150      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
152    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 144  static const uschar coptable[] = { Line 155  static const uschar coptable[] = {
155    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
156    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
157    0,                             /* REF                                    */    0,                             /* REF                                    */
158      0,                             /* REFI                                   */
159      0,                             /* DNREF                                  */
160      0,                             /* DNREFI                                 */
161    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
162    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
163    0,                             /* Alt                                    */    0,                             /* Alt                                    */
164    0,                             /* Ket                                    */    0,                             /* Ket                                    */
165    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
166    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
167      0,                             /* KetRpos                                */
168      0,                             /* Reverse                                */
169    0,                             /* Assert                                 */    0,                             /* Assert                                 */
170    0,                             /* Assert not                             */    0,                             /* Assert not                             */
171    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
172    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
173    0,                             /* Reverse                                */    0, 0,                          /* ONCE, ONCE_NC                          */
174    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
175    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
176    0, 0,                          /* CREF, NCREF                            */    0, 0,                          /* CREF, NCREF                            */
177    0, 0,                          /* RREF, NRREF                            */    0, 0,                          /* RREF, NRREF                            */
178    0,                             /* DEF                                    */    0,                             /* DEF                                    */
179    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
180    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG,                */    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
181    0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG,        */    0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
182    0, 0, 0, 0, 0                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */    0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
183      0, 0                           /* CLOSE, SKIPZERO  */
184  };  };
185    
186  /* This table identifies those opcodes that inspect a character. It is used to  /* This table identifies those opcodes that inspect a character. It is used to
# Line 171  remember the fact that a character could Line 188  remember the fact that a character could
188  the subject is reached. ***NOTE*** If the start of this table is modified, the  the subject is reached. ***NOTE*** If the start of this table is modified, the
189  two tables that follow must also be modified. */  two tables that follow must also be modified. */
190    
191  static const uschar poptable[] = {  static const pcre_uint8 poptable[] = {
192    0,                             /* End                                    */    0,                             /* End                                    */
193    0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
194    1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */    1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
# Line 179  static const uschar poptable[] = { Line 196  static const uschar poptable[] = {
196    1, 1,                          /* \P, \p                                 */    1, 1,                          /* \P, \p                                 */
197    1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */    1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
198    1,                             /* \X                                     */    1,                             /* \X                                     */
199    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
200    1,                             /* Char                                   */    1,                             /* Char                                   */
201    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
202    1,                             /* not                                    */    1,                             /* not                                    */
203      1,                             /* noti                                   */
204    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
205    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
206    1, 1, 1,                       /* upto, minupto, exact                   */    1, 1, 1,                       /* upto, minupto, exact                   */
207    1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */    1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
208      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
209      1, 1, 1,                       /* upto I, minupto I, exact I             */
210      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
211    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
212    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
213    1, 1, 1,                       /* NOT upto, minupto, exact               */    1, 1, 1,                       /* NOT upto, minupto, exact               */
214    1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */    1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
215      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
216      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
217      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
218    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
219    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
220    1, 1, 1,                       /* Type upto, minupto, exact              */    1, 1, 1,                       /* Type upto, minupto, exact              */
# Line 202  static const uschar poptable[] = { Line 226  static const uschar poptable[] = {
226    1,                             /* NCLASS                                 */    1,                             /* NCLASS                                 */
227    1,                             /* XCLASS - variable length               */    1,                             /* XCLASS - variable length               */
228    0,                             /* REF                                    */    0,                             /* REF                                    */
229      0,                             /* REFI                                   */
230      0,                             /* DNREF                                  */
231      0,                             /* DNREFI                                 */
232    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
233    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
234    0,                             /* Alt                                    */    0,                             /* Alt                                    */
235    0,                             /* Ket                                    */    0,                             /* Ket                                    */
236    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
237    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
238      0,                             /* KetRpos                                */
239      0,                             /* Reverse                                */
240    0,                             /* Assert                                 */    0,                             /* Assert                                 */
241    0,                             /* Assert not                             */    0,                             /* Assert not                             */
242    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
243    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
244    0,                             /* Reverse                                */    0, 0,                          /* ONCE, ONCE_NC                          */
245    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
246    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
247    0, 0,                          /* CREF, NCREF                            */    0, 0,                          /* CREF, NCREF                            */
248    0, 0,                          /* RREF, NRREF                            */    0, 0,                          /* RREF, NRREF                            */
249    0,                             /* DEF                                    */    0,                             /* DEF                                    */
250    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
251    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG,                */    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
252    0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG,        */    0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
253    0, 0, 0, 0, 0                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */    0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
254      0, 0                           /* CLOSE, SKIPZERO                        */
255  };  };
256    
257  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
258  and \w */  and \w */
259    
260  static const uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
261    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
262    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
263    ctype_space, ctype_space,    ctype_space, ctype_space,
# Line 235  static const uschar toptable1[] = { Line 265  static const uschar toptable1[] = {
265    0, 0                            /* OP_ANY, OP_ALLANY */    0, 0                            /* OP_ANY, OP_ALLANY */
266  };  };
267    
268  static const uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
269    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
270    ctype_digit, 0,    ctype_digit, 0,
271    ctype_space, 0,    ctype_space, 0,
# Line 252  these structures in, is a vector of ints Line 282  these structures in, is a vector of ints
282  typedef struct stateblock {  typedef struct stateblock {
283    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
284    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
285    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
286  } stateblock;  } stateblock;
287    
288  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
289    
290    
291  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 275  Returns:       nothing Line 304  Returns:       nothing
304  */  */
305    
306  static void  static void
307  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
308  {  {
309  int c;  pcre_uint32 c;
310  while (length-- > 0)  while (length-- > 0)
311    {    {
312    if (isprint(c = *(p++)))    if (isprint(c = *(p++)))
313      fprintf(f, "%c", c);      fprintf(f, "%c", c);
314    else    else
315      fprintf(f, "\\x%02x", c);      fprintf(f, "\\x{%02x}", c);
316    }    }
317  }  }
318  #endif  #endif
# Line 308  Arguments: Line 337  Arguments:
337    offsetcount       size of same    offsetcount       size of same
338    workspace         vector of workspace    workspace         vector of workspace
339    wscount           size of same    wscount           size of same
   ims               the current ims flags  
340    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
341    
342  Returns:            > 0 => number of match offset pairs placed in offsets  Returns:            > 0 => number of match offset pairs placed in offsets
343                      = 0 => offsets overflowed; longest matches are present                      = 0 => offsets overflowed; longest matches are present
# Line 325  for the current character, one for the f Line 352  for the current character, one for the f
352      { \      { \
353      next_active_state->offset = (x); \      next_active_state->offset = (x); \
354      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
355      next_active_state++; \      next_active_state++; \
356      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
357      } \      } \
# Line 336  for the current character, one for the f Line 362  for the current character, one for the f
362      { \      { \
363      next_active_state->offset = (x); \      next_active_state->offset = (x); \
364      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
365      next_active_state->data   = (z); \      next_active_state->data   = (z); \
366      next_active_state++; \      next_active_state++; \
367      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 348  for the current character, one for the f Line 373  for the current character, one for the f
373      { \      { \
374      next_new_state->offset = (x); \      next_new_state->offset = (x); \
375      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
376      next_new_state++; \      next_new_state++; \
377      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
378      } \      } \
# Line 359  for the current character, one for the f Line 383  for the current character, one for the f
383      { \      { \
384      next_new_state->offset = (x); \      next_new_state->offset = (x); \
385      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
386      next_new_state->data   = (z); \      next_new_state->data   = (z); \
387      next_new_state++; \      next_new_state++; \
388      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
389          (x), (y), (z), __LINE__)); \
390      } \      } \
391    else return PCRE_ERROR_DFA_WSSIZE    else return PCRE_ERROR_DFA_WSSIZE
392    
# Line 371  for the current character, one for the f Line 395  for the current character, one for the f
395  static int  static int
396  internal_dfa_exec(  internal_dfa_exec(
397    dfa_match_data *md,    dfa_match_data *md,
398    const uschar *this_start_code,    const pcre_uchar *this_start_code,
399    const uschar *current_subject,    const pcre_uchar *current_subject,
400    int start_offset,    int start_offset,
401    int *offsets,    int *offsets,
402    int offsetcount,    int offsetcount,
403    int *workspace,    int *workspace,
404    int wscount,    int wscount,
405    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
406  {  {
407  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
408  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
409    
410  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
411  const uschar *ptr;  const pcre_uchar *ptr;
412  const uschar *end_code, *first_op;  const pcre_uchar *end_code, *first_op;
413    
414    dfa_recursion_info new_recursive;
415    
416  int active_count, new_count, match_count;  int active_count, new_count, match_count;
417    
418  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
419  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
420    
421  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
422  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
423  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
424    
425  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
426  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
427  #else  #else
428  BOOL utf8 = FALSE;  BOOL utf = FALSE;
429  #endif  #endif
430    
431    BOOL reset_could_continue = FALSE;
432    
433  rlevel++;  rlevel++;
434  offsetcount &= (-2);  offsetcount &= (-2);
435    
# Line 412  wscount = (wscount - (wscount % (INTS_PE Line 438  wscount = (wscount - (wscount % (INTS_PE
438            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
439    
440  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
441    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
442    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
443    
444  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
445  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 426  next_new_state = new_states = active_sta Line 452  next_new_state = new_states = active_sta
452  new_count = 0;  new_count = 0;
453    
454  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
455    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
456        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
457        ? IMM2_SIZE:0);
458    
459  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
460  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 454  if (*first_op == OP_REVERSE) Line 482  if (*first_op == OP_REVERSE)
482    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
483    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
484    
485  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
486    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
487    
488    if (utf8)    if (utf)
489      {      {
490      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
491        {        {
492        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
493        current_subject--;        current_subject--;
494        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
495        }        }
496      }      }
497    else    else
# Line 525  else Line 551  else
551    else    else
552      {      {
553      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
554        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
555            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
556            ? IMM2_SIZE:0);
557      do      do
558        {        {
559        ADD_NEW((int)(end_code - start_code + length), 0);        ADD_NEW((int)(end_code - start_code + length), 0);
# Line 538  else Line 566  else
566    
567  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
568    
569  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
570    
571  /* Loop for scanning the subject */  /* Loop for scanning the subject */
572    
# Line 547  for (;;) Line 575  for (;;)
575    {    {
576    int i, j;    int i, j;
577    int clen, dlen;    int clen, dlen;
578    unsigned int c, d;    pcre_uint32 c, d;
579    int forced_fail = 0;    int forced_fail = 0;
580    BOOL could_continue = FALSE;    BOOL partial_newline = FALSE;
581      BOOL could_continue = reset_could_continue;
582      reset_could_continue = FALSE;
583    
584    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
585    new state list. */    new state list. */
# Line 565  for (;;) Line 595  for (;;)
595    
596  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
597    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
598    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
599    printf("\"\n");    printf("\"\n");
600    
601    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 585  for (;;) Line 615  for (;;)
615    
616    if (ptr < end_subject)    if (ptr < end_subject)
617      {      {
618      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of data items in the character */
619  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
620      if (utf8) { GETCHARLEN(c, ptr, clen); } else      GETCHARLENTEST(c, ptr, clen);
621  #endif  /* SUPPORT_UTF8 */  #else
622      c = *ptr;      c = *ptr;
623    #endif  /* SUPPORT_UTF */
624      }      }
625    else    else
626      {      {
# Line 605  for (;;) Line 636  for (;;)
636    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
637      {      {
638      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
639      const uschar *code;      BOOL caseless = FALSE;
640        const pcre_uchar *code;
641      int state_offset = current_state->offset;      int state_offset = current_state->offset;
642      int count, codevalue, rrc;      int codevalue, rrc;
643        int count;
644    
645  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
646      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 616  for (;;) Line 649  for (;;)
649          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
650  #endif  #endif
651    
     /* This variable is referred to implicitly in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
652      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
653      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
654      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
655        state, arrange for it to be passed on. */
656    
657      if (state_offset < 0)      if (state_offset < 0)
658        {        {
# Line 631  for (;;) Line 661  for (;;)
661          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
662          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
663            current_state->data - 1);            current_state->data - 1);
664            if (could_continue) reset_could_continue = TRUE;
665          continue;          continue;
666          }          }
667        else        else
# Line 670  for (;;) Line 701  for (;;)
701      permitted.      permitted.
702    
703      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
704      argument that is not a data character - but is always one byte long. We      argument that is not a data character - but is always one byte long because
705      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in      the values are small. We have to take special action to deal with  \P, \p,
706      this case. To keep the other cases fast, convert these ones to new opcodes.      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
707      */      these ones to new opcodes. */
708    
709      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
710        {        {
711        dlen = 1;        dlen = 1;
712  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
713        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
714  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
715        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
716        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
717          {          {
# Line 725  for (;;) Line 756  for (;;)
756    
757  /* ========================================================================== */  /* ========================================================================== */
758        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
759        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
760          state. Note that KETRPOS will always be encountered at the end of the
761          subpattern, because the possessive subpattern repeats are always handled
762          using recursive calls. Thus, it never adds any new states.
763    
764          At the end of the (sub)pattern, unless we have an empty string and
765        PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the        PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
766        start of the subject, save the match data, shifting up all previous        start of the subject, save the match data, shifting up all previous
767        matches so we always have the longest first. */        matches so we always have the longest first. */
# Line 733  for (;;) Line 769  for (;;)
769        case OP_KET:        case OP_KET:
770        case OP_KETRMIN:        case OP_KETRMIN:
771        case OP_KETRMAX:        case OP_KETRMAX:
772          case OP_KETRPOS:
773        if (code != end_code)        if (code != end_code)
774          {          {
775          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 749  for (;;) Line 786  for (;;)
786                  current_subject > start_subject + md->start_offset)))                  current_subject > start_subject + md->start_offset)))
787            {            {
788            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
789              else if (match_count > 0 && ++match_count * 2 >= offsetcount)              else if (match_count > 0 && ++match_count * 2 > offsetcount)
790                match_count = 0;                match_count = 0;
791            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
792            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
# Line 758  for (;;) Line 795  for (;;)
795              offsets[0] = (int)(current_subject - start_subject);              offsets[0] = (int)(current_subject - start_subject);
796              offsets[1] = (int)(ptr - start_subject);              offsets[1] = (int)(ptr - start_subject);
797              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
798                offsets[1] - offsets[0], current_subject));                offsets[1] - offsets[0], (char *)current_subject));
799              }              }
800            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
801              {              {
# Line 795  for (;;) Line 832  for (;;)
832        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
833        case OP_CBRA:        case OP_CBRA:
834        case OP_SCBRA:        case OP_SCBRA:
835        ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE),  0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
836        code += GET(code, 1);        code += GET(code, 1);
837        while (*code == OP_ALT)        while (*code == OP_ALT)
838          {          {
# Line 822  for (;;) Line 859  for (;;)
859    
860        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
861        case OP_CIRC:        case OP_CIRC:
862        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
863          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
864        break;        break;
865    
866        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
867        case OP_EOD:        case OP_CIRCM:
868        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
869              (ptr != end_subject && WAS_NEWLINE(ptr)))
870            { ADD_ACTIVE(state_offset + 1, 0); }
871        break;        break;
872    
873        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
874        case OP_OPT:        case OP_EOD:
875        ims = code[1];        if (ptr >= end_subject)
876        ADD_ACTIVE(state_offset + 2, 0);          {
877            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
878              could_continue = TRUE;
879            else { ADD_ACTIVE(state_offset + 1, 0); }
880            }
881        break;        break;
882    
883        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 860  for (;;) Line 900  for (;;)
900        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
901        case OP_ANY:        case OP_ANY:
902        if (clen > 0 && !IS_NEWLINE(ptr))        if (clen > 0 && !IS_NEWLINE(ptr))
903          { ADD_NEW(state_offset + 1, 0); }          {
904            if (ptr + 1 >= md->end_subject &&
905                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
906                NLBLOCK->nltype == NLTYPE_FIXED &&
907                NLBLOCK->nllen == 2 &&
908                c == NLBLOCK->nl[0])
909              {
910              could_continue = partial_newline = TRUE;
911              }
912            else
913              {
914              ADD_NEW(state_offset + 1, 0);
915              }
916            }
917        break;        break;
918    
919        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 871  for (;;) Line 924  for (;;)
924    
925        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
926        case OP_EODN:        case OP_EODN:
927        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
928            could_continue = TRUE;
929          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
930          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
931        break;        break;
932    
# Line 879  for (;;) Line 934  for (;;)
934        case OP_DOLL:        case OP_DOLL:
935        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
936          {          {
937          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
938              could_continue = TRUE;
939            else if (clen == 0 ||
940              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
941                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 (ptr == end_subject - md->nllen)
942              ))              ))
943            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
944            else if (ptr + 1 >= md->end_subject &&
945                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
946                     NLBLOCK->nltype == NLTYPE_FIXED &&
947                     NLBLOCK->nllen == 2 &&
948                     c == NLBLOCK->nl[0])
949              {
950              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
951                {
952                reset_could_continue = TRUE;
953                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
954                }
955              else could_continue = partial_newline = TRUE;
956              }
957          }          }
958        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        break;
959    
960          /*-----------------------------------------------------------------*/
961          case OP_DOLLM:
962          if ((md->moptions & PCRE_NOTEOL) == 0)
963            {
964            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
965              could_continue = TRUE;
966            else if (clen == 0 ||
967                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
968              { ADD_ACTIVE(state_offset + 1, 0); }
969            else if (ptr + 1 >= md->end_subject &&
970                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
971                     NLBLOCK->nltype == NLTYPE_FIXED &&
972                     NLBLOCK->nllen == 2 &&
973                     c == NLBLOCK->nl[0])
974              {
975              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
976                {
977                reset_could_continue = TRUE;
978                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
979                }
980              else could_continue = partial_newline = TRUE;
981              }
982            }
983          else if (IS_NEWLINE(ptr))
984          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
985        break;        break;
986    
# Line 916  for (;;) Line 1011  for (;;)
1011    
1012          if (ptr > start_subject)          if (ptr > start_subject)
1013            {            {
1014            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
1015            if (temp < md->start_used_ptr) md->start_used_ptr = temp;            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1016  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1017            if (utf8) BACKCHAR(temp);            if (utf) { BACKCHAR(temp); }
1018  #endif  #endif
1019            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
1020  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1021            if ((md->poptions & PCRE_UCP) != 0)            if ((md->poptions & PCRE_UCP) != 0)
1022              {              {
1023              if (d == '_') left_word = TRUE; else              if (d == '_') left_word = TRUE; else
1024                {                {
1025                int cat = UCD_CATEGORY(d);                int cat = UCD_CATEGORY(d);
1026                left_word = (cat == ucp_L || cat == ucp_N);                left_word = (cat == ucp_L || cat == ucp_N);
1027                }                }
1028              }              }
1029            else            else
1030  #endif  #endif
1031            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1032            }            }
1033          else left_word = FALSE;          else left_word = FALSE;
1034    
1035          if (clen > 0)          if (clen > 0)
1036            {            {
1037  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1038            if ((md->poptions & PCRE_UCP) != 0)            if ((md->poptions & PCRE_UCP) != 0)
1039              {              {
1040              if (c == '_') right_word = TRUE; else              if (c == '_') right_word = TRUE; else
1041                {                {
1042                int cat = UCD_CATEGORY(c);                int cat = UCD_CATEGORY(c);
1043                right_word = (cat == ucp_L || cat == ucp_N);                right_word = (cat == ucp_L || cat == ucp_N);
1044                }                }
1045              }              }
1046            else            else
1047  #endif  #endif
1048            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1049            }            }
1050          else right_word = FALSE;          else right_word = FALSE;
1051    
1052          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
# Line 971  for (;;) Line 1066  for (;;)
1066        if (clen > 0)        if (clen > 0)
1067          {          {
1068          BOOL OK;          BOOL OK;
1069            const pcre_uint32 *cp;
1070          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1071          switch(code[1])          switch(code[1])
1072            {            {
# Line 979  for (;;) Line 1075  for (;;)
1075            break;            break;
1076    
1077            case PT_LAMP:            case PT_LAMP:
1078            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1079                 prop->chartype == ucp_Lt;                 prop->chartype == ucp_Lt;
1080            break;            break;
1081    
1082            case PT_GC:            case PT_GC:
1083            OK = _pcre_ucp_gentype[prop->chartype] == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1084            break;            break;
1085    
1086            case PT_PC:            case PT_PC:
# Line 994  for (;;) Line 1090  for (;;)
1090            case PT_SC:            case PT_SC:
1091            OK = prop->script == code[2];            OK = prop->script == code[2];
1092            break;            break;
1093    
1094            /* These are specials for combination cases. */            /* These are specials for combination cases. */
1095    
1096            case PT_ALNUM:            case PT_ALNUM:
1097            OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1098                 _pcre_ucp_gentype[prop->chartype] == ucp_N;                 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1099            break;            break;
1100    
1101              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1102              which means that Perl space and POSIX space are now identical. PCRE
1103              was changed at release 8.34. */
1104    
1105            case PT_SPACE:    /* Perl space */            case PT_SPACE:    /* Perl space */
           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||  
                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;  
           break;  
   
1106            case PT_PXSPACE:  /* POSIX space */            case PT_PXSPACE:  /* POSIX space */
1107            OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1108                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1109                 c == CHAR_FF || c == CHAR_CR;                 c == CHAR_FF || c == CHAR_CR;
1110            break;            break;
1111    
1112            case PT_WORD:            case PT_WORD:
1113            OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1114                 _pcre_ucp_gentype[prop->chartype] == ucp_N ||                 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1115                 c == CHAR_UNDERSCORE;                 c == CHAR_UNDERSCORE;
1116            break;            break;
1117    
1118              case PT_CLIST:
1119              cp = PRIV(ucd_caseless_sets) + code[2];
1120              for (;;)
1121                {
1122                if (c < *cp) { OK = FALSE; break; }
1123                if (c == *cp++) { OK = TRUE; break; }
1124                }
1125              break;
1126    
1127              case PT_UCNC:
1128              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1129                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1130                   c >= 0xe000;
1131              break;
1132    
1133            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1134    
# Line 1046  for (;;) Line 1157  for (;;)
1157        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1158        if (clen > 0)        if (clen > 0)
1159          {          {
1160          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1161                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1162                NLBLOCK->nltype == NLTYPE_FIXED &&
1163                NLBLOCK->nllen == 2 &&
1164                c == NLBLOCK->nl[0])
1165              {
1166              could_continue = partial_newline = TRUE;
1167              }
1168            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1169              (c < 256 &&              (c < 256 &&
1170                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1171                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1069  for (;;) Line 1188  for (;;)
1188        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1189        if (clen > 0)        if (clen > 0)
1190          {          {
1191          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1192                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1193                NLBLOCK->nltype == NLTYPE_FIXED &&
1194                NLBLOCK->nllen == 2 &&
1195                c == NLBLOCK->nl[0])
1196              {
1197              could_continue = partial_newline = TRUE;
1198              }
1199            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1200              (c < 256 &&              (c < 256 &&
1201                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1202                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1091  for (;;) Line 1218  for (;;)
1218        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1219        if (clen > 0)        if (clen > 0)
1220          {          {
1221          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1222                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1223                NLBLOCK->nltype == NLTYPE_FIXED &&
1224                NLBLOCK->nllen == 2 &&
1225                c == NLBLOCK->nl[0])
1226              {
1227              could_continue = partial_newline = TRUE;
1228              }
1229            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1230              (c < 256 &&              (c < 256 &&
1231                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1232                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1111  for (;;) Line 1246  for (;;)
1246        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1247        if (clen > 0)        if (clen > 0)
1248          {          {
1249          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1250                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1251                NLBLOCK->nltype == NLTYPE_FIXED &&
1252                NLBLOCK->nllen == 2 &&
1253                c == NLBLOCK->nl[0])
1254              {
1255              could_continue = partial_newline = TRUE;
1256              }
1257            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1258              (c < 256 &&              (c < 256 &&
1259                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1260                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1261            {            {
1262            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1263              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1264            else            else
1265              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1266            }            }
# Line 1128  for (;;) Line 1271  for (;;)
1271        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1272        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1273        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1274        ADD_ACTIVE(state_offset + 4, 0);        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1275        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1276        if (clen > 0)        if (clen > 0)
1277          {          {
1278          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1279                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1280                NLBLOCK->nltype == NLTYPE_FIXED &&
1281                NLBLOCK->nllen == 2 &&
1282                c == NLBLOCK->nl[0])
1283              {
1284              could_continue = partial_newline = TRUE;
1285              }
1286            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1287              (c < 256 &&              (c < 256 &&
1288                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1289                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1142  for (;;) Line 1293  for (;;)
1293              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1294              next_active_state--;              next_active_state--;
1295              }              }
1296            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1297              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1298            else            else
1299              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1300            }            }
# Line 1165  for (;;) Line 1316  for (;;)
1316        if (clen > 0)        if (clen > 0)
1317          {          {
1318          BOOL OK;          BOOL OK;
1319            const pcre_uint32 *cp;
1320          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1321          switch(code[2])          switch(code[2])
1322            {            {
# Line 1173  for (;;) Line 1325  for (;;)
1325            break;            break;
1326    
1327            case PT_LAMP:            case PT_LAMP:
1328            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1329              prop->chartype == ucp_Lt;              prop->chartype == ucp_Lt;
1330            break;            break;
1331    
1332            case PT_GC:            case PT_GC:
1333            OK = _pcre_ucp_gentype[prop->chartype] == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1334            break;            break;
1335    
1336            case PT_PC:            case PT_PC:
# Line 1190  for (;;) Line 1342  for (;;)
1342            break;            break;
1343    
1344            /* These are specials for combination cases. */            /* These are specials for combination cases. */
1345    
1346            case PT_ALNUM:            case PT_ALNUM:
1347            OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1348                 _pcre_ucp_gentype[prop->chartype] == ucp_N;                 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1349            break;            break;
1350    
1351              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1352              which means that Perl space and POSIX space are now identical. PCRE
1353              was changed at release 8.34. */
1354    
1355            case PT_SPACE:    /* Perl space */            case PT_SPACE:    /* Perl space */
           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||  
                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;  
           break;  
   
1356            case PT_PXSPACE:  /* POSIX space */            case PT_PXSPACE:  /* POSIX space */
1357            OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1358                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1359                 c == CHAR_FF || c == CHAR_CR;                 c == CHAR_FF || c == CHAR_CR;
1360            break;            break;
1361    
1362            case PT_WORD:            case PT_WORD:
1363            OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1364                 _pcre_ucp_gentype[prop->chartype] == ucp_N ||                 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1365                 c == CHAR_UNDERSCORE;                 c == CHAR_UNDERSCORE;
1366            break;            break;
1367    
1368              case PT_CLIST:
1369              cp = PRIV(ucd_caseless_sets) + code[3];
1370              for (;;)
1371                {
1372                if (c < *cp) { OK = FALSE; break; }
1373                if (c == *cp++) { OK = TRUE; break; }
1374                }
1375              break;
1376    
1377              case PT_UCNC:
1378              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1379                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1380                   c >= 0xe000;
1381              break;
1382    
1383            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1384    
# Line 1239  for (;;) Line 1406  for (;;)
1406        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1407        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1408        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1409        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1410          {          {
1411          const uschar *nptr = ptr + clen;          int lgb, rgb;
1412            const pcre_uchar *nptr = ptr + clen;
1413          int ncount = 0;          int ncount = 0;
1414          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1415            {            {
1416            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1417            next_active_state--;            next_active_state--;
1418            }            }
1419            lgb = UCD_GRAPHBREAK(c);
1420          while (nptr < end_subject)          while (nptr < end_subject)
1421            {            {
1422            int nd;            dlen = 1;
1423            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1424            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1425            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1426            ncount++;            ncount++;
1427            nptr += ndlen;            lgb = rgb;
1428              nptr += dlen;
1429            }            }
1430          count++;          count++;
1431          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
# Line 1274  for (;;) Line 1444  for (;;)
1444          int ncount = 0;          int ncount = 0;
1445          switch (c)          switch (c)
1446            {            {
1447            case 0x000b:            case CHAR_VT:
1448            case 0x000c:            case CHAR_FF:
1449            case 0x0085:            case CHAR_NEL:
1450    #ifndef EBCDIC
1451            case 0x2028:            case 0x2028:
1452            case 0x2029:            case 0x2029:
1453    #endif  /* Not EBCDIC */
1454            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1455            goto ANYNL01;            goto ANYNL01;
1456    
1457            case 0x000d:            case CHAR_CR:
1458            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1459            /* Fall through */            /* Fall through */
1460    
1461            ANYNL01:            ANYNL01:
1462            case 0x000a:            case CHAR_LF:
1463            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1464              {              {
1465              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1314  for (;;) Line 1486  for (;;)
1486          BOOL OK;          BOOL OK;
1487          switch (c)          switch (c)
1488            {            {
1489            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
1490            OK = TRUE;            OK = TRUE;
1491            break;            break;
1492    
# Line 1353  for (;;) Line 1519  for (;;)
1519          BOOL OK;          BOOL OK;
1520          switch (c)          switch (c)
1521            {            {
1522            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
1523            OK = TRUE;            OK = TRUE;
1524            break;            break;
1525    
# Line 1412  for (;;) Line 1560  for (;;)
1560        if (clen > 0)        if (clen > 0)
1561          {          {
1562          BOOL OK;          BOOL OK;
1563            const pcre_uint32 *cp;
1564          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1565          switch(code[2])          switch(code[2])
1566            {            {
# Line 1420  for (;;) Line 1569  for (;;)
1569            break;            break;
1570    
1571            case PT_LAMP:            case PT_LAMP:
1572            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1573              prop->chartype == ucp_Lt;              prop->chartype == ucp_Lt;
1574            break;            break;
1575    
1576            case PT_GC:            case PT_GC:
1577            OK = _pcre_ucp_gentype[prop->chartype] == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1578            break;            break;
1579    
1580            case PT_PC:            case PT_PC:
# Line 1435  for (;;) Line 1584  for (;;)
1584            case PT_SC:            case PT_SC:
1585            OK = prop->script == code[3];            OK = prop->script == code[3];
1586            break;            break;
1587    
1588            /* These are specials for combination cases. */            /* These are specials for combination cases. */
1589    
1590            case PT_ALNUM:            case PT_ALNUM:
1591            OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1592                 _pcre_ucp_gentype[prop->chartype] == ucp_N;                 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1593            break;            break;
1594    
1595              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1596              which means that Perl space and POSIX space are now identical. PCRE
1597              was changed at release 8.34. */
1598    
1599            case PT_SPACE:    /* Perl space */            case PT_SPACE:    /* Perl space */
           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||  
                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;  
           break;  
   
1600            case PT_PXSPACE:  /* POSIX space */            case PT_PXSPACE:  /* POSIX space */
1601            OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1602                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1603                 c == CHAR_FF || c == CHAR_CR;                 c == CHAR_FF || c == CHAR_CR;
1604            break;            break;
1605    
1606            case PT_WORD:            case PT_WORD:
1607            OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1608                 _pcre_ucp_gentype[prop->chartype] == ucp_N ||                 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1609                 c == CHAR_UNDERSCORE;                 c == CHAR_UNDERSCORE;
1610            break;            break;
1611    
1612              case PT_CLIST:
1613              cp = PRIV(ucd_caseless_sets) + code[3];
1614              for (;;)
1615                {
1616                if (c < *cp) { OK = FALSE; break; }
1617                if (c == *cp++) { OK = TRUE; break; }
1618                }
1619              break;
1620    
1621              case PT_UCNC:
1622              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1623                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1624                   c >= 0xe000;
1625              break;
1626    
1627            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1628    
# Line 1495  for (;;) Line 1659  for (;;)
1659        QS2:        QS2:
1660    
1661        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1662        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1663          {          {
1664          const uschar *nptr = ptr + clen;          int lgb, rgb;
1665            const pcre_uchar *nptr = ptr + clen;
1666          int ncount = 0;          int ncount = 0;
1667          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1668              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
# Line 1505  for (;;) Line 1670  for (;;)
1670            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1671            next_active_state--;            next_active_state--;
1672            }            }
1673            lgb = UCD_GRAPHBREAK(c);
1674          while (nptr < end_subject)          while (nptr < end_subject)
1675            {            {
1676            int nd;            dlen = 1;
1677            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1678            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1679            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1680            ncount++;            ncount++;
1681            nptr += ndlen;            lgb = rgb;
1682              nptr += dlen;
1683            }            }
1684          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1685          }          }
# Line 1538  for (;;) Line 1705  for (;;)
1705          int ncount = 0;          int ncount = 0;
1706          switch (c)          switch (c)
1707            {            {
1708            case 0x000b:            case CHAR_VT:
1709            case 0x000c:            case CHAR_FF:
1710            case 0x0085:            case CHAR_NEL:
1711    #ifndef EBCDIC
1712            case 0x2028:            case 0x2028:
1713            case 0x2029:            case 0x2029:
1714    #endif  /* Not EBCDIC */
1715            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1716            goto ANYNL02;            goto ANYNL02;
1717    
1718            case 0x000d:            case CHAR_CR:
1719            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1720            /* Fall through */            /* Fall through */
1721    
1722            ANYNL02:            ANYNL02:
1723            case 0x000a:            case CHAR_LF:
1724            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1725                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1726              {              {
1727              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1728              next_active_state--;              next_active_state--;
1729              }              }
1730            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1731            break;            break;
1732    
1733            default:            default:
# Line 1586  for (;;) Line 1755  for (;;)
1755          BOOL OK;          BOOL OK;
1756          switch (c)          switch (c)
1757            {            {
1758            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
1759            OK = TRUE;            OK = TRUE;
1760            break;            break;
1761    
# Line 1608  for (;;) Line 1771  for (;;)
1771              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1772              next_active_state--;              next_active_state--;
1773              }              }
1774            ADD_NEW_DATA(-(state_offset + count), 0, 0);            ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1775            }            }
1776          }          }
1777        break;        break;
# Line 1632  for (;;) Line 1795  for (;;)
1795          BOOL OK;          BOOL OK;
1796          switch (c)          switch (c)
1797            {            {
1798            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
1799            OK = TRUE;            OK = TRUE;
1800            break;            break;
1801    
# Line 1667  for (;;) Line 1812  for (;;)
1812              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1813              next_active_state--;              next_active_state--;
1814              }              }
1815            ADD_NEW_DATA(-(state_offset + count), 0, 0);            ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1816            }            }
1817          }          }
1818        break;        break;
# Line 1679  for (;;) Line 1824  for (;;)
1824        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1825        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1826        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1827          { ADD_ACTIVE(state_offset + 6, 0); }          { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1828        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1829        if (clen > 0)        if (clen > 0)
1830          {          {
1831          BOOL OK;          BOOL OK;
1832            const pcre_uint32 *cp;
1833          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1834          switch(code[4])          switch(code[1 + IMM2_SIZE + 1])
1835            {            {
1836            case PT_ANY:            case PT_ANY:
1837            OK = TRUE;            OK = TRUE;
1838            break;            break;
1839    
1840            case PT_LAMP:            case PT_LAMP:
1841            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1842              prop->chartype == ucp_Lt;              prop->chartype == ucp_Lt;
1843            break;            break;
1844    
1845            case PT_GC:            case PT_GC:
1846            OK = _pcre_ucp_gentype[prop->chartype] == code[5];            OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1847            break;            break;
1848    
1849            case PT_PC:            case PT_PC:
1850            OK = prop->chartype == code[5];            OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1851            break;            break;
1852    
1853            case PT_SC:            case PT_SC:
1854            OK = prop->script == code[5];            OK = prop->script == code[1 + IMM2_SIZE + 2];
1855            break;            break;
1856    
1857            /* These are specials for combination cases. */            /* These are specials for combination cases. */
1858    
1859            case PT_ALNUM:            case PT_ALNUM:
1860            OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1861                 _pcre_ucp_gentype[prop->chartype] == ucp_N;                 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1862            break;            break;
1863    
1864              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1865              which means that Perl space and POSIX space are now identical. PCRE
1866              was changed at release 8.34. */
1867    
1868            case PT_SPACE:    /* Perl space */            case PT_SPACE:    /* Perl space */
           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||  
                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;  
           break;  
   
1869            case PT_PXSPACE:  /* POSIX space */            case PT_PXSPACE:  /* POSIX space */
1870            OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1871                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1872                 c == CHAR_FF || c == CHAR_CR;                 c == CHAR_FF || c == CHAR_CR;
1873            break;            break;
1874    
1875            case PT_WORD:            case PT_WORD:
1876            OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1877                 _pcre_ucp_gentype[prop->chartype] == ucp_N ||                 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1878                 c == CHAR_UNDERSCORE;                 c == CHAR_UNDERSCORE;
1879            break;            break;
1880    
1881              case PT_CLIST:
1882              cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1883              for (;;)
1884                {
1885                if (c < *cp) { OK = FALSE; break; }
1886                if (c == *cp++) { OK = TRUE; break; }
1887                }
1888              break;
1889    
1890              case PT_UCNC:
1891              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1892                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1893                   c >= 0xe000;
1894              break;
1895    
1896            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1897    
# Line 1746  for (;;) Line 1907  for (;;)
1907              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1908              next_active_state--;              next_active_state--;
1909              }              }
1910            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1911              { ADD_NEW(state_offset + 6, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1912            else            else
1913              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1914            }            }
# Line 1760  for (;;) Line 1921  for (;;)
1921        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1922        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1923        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1924          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1925        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1926        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1927          {          {
1928          const uschar *nptr = ptr + clen;          int lgb, rgb;
1929            const pcre_uchar *nptr = ptr + clen;
1930          int ncount = 0;          int ncount = 0;
1931          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1932            {            {
1933            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1934            next_active_state--;            next_active_state--;
1935            }            }
1936            lgb = UCD_GRAPHBREAK(c);
1937          while (nptr < end_subject)          while (nptr < end_subject)
1938            {            {
1939            int nd;            dlen = 1;
1940            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1941            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1942            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1943            ncount++;            ncount++;
1944            nptr += ndlen;            lgb = rgb;
1945              nptr += dlen;
1946            }            }
1947          if (++count >= GET2(code, 1))          if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1948            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              reset_could_continue = TRUE;
1949            if (++count >= (int)GET2(code, 1))
1950              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1951          else          else
1952            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1953          }          }
# Line 1794  for (;;) Line 1960  for (;;)
1960        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1961        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1962        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1963          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1964        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1965        if (clen > 0)        if (clen > 0)
1966          {          {
1967          int ncount = 0;          int ncount = 0;
1968          switch (c)          switch (c)
1969            {            {
1970            case 0x000b:            case CHAR_VT:
1971            case 0x000c:            case CHAR_FF:
1972            case 0x0085:            case CHAR_NEL:
1973    #ifndef EBCDIC
1974            case 0x2028:            case 0x2028:
1975            case 0x2029:            case 0x2029:
1976    #endif  /* Not EBCDIC */
1977            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1978            goto ANYNL03;            goto ANYNL03;
1979    
1980            case 0x000d:            case CHAR_CR:
1981            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1982            /* Fall through */            /* Fall through */
1983    
1984            ANYNL03:            ANYNL03:
1985            case 0x000a:            case CHAR_LF:
1986            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1987              {              {
1988              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1989              next_active_state--;              next_active_state--;
1990              }              }
1991            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1992              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1993            else            else
1994              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, ncount); }
1995            break;            break;
# Line 1838  for (;;) Line 2006  for (;;)
2006        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2007        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2008        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2009          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2010        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2011        if (clen > 0)        if (clen > 0)
2012          {          {
2013          BOOL OK;          BOOL OK;
2014          switch (c)          switch (c)
2015            {            {
2016            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
2017            OK = TRUE;            OK = TRUE;
2018            break;            break;
2019    
# Line 1866  for (;;) Line 2028  for (;;)
2028              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
2029              next_active_state--;              next_active_state--;
2030              }              }
2031            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
2032              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2033            else            else
2034              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
2035            }            }
# Line 1880  for (;;) Line 2042  for (;;)
2042        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2043        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2044        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2045          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2046        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2047        if (clen > 0)        if (clen > 0)
2048          {          {
2049          BOOL OK;          BOOL OK;
2050          switch (c)          switch (c)
2051            {            {
2052            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
2053            OK = TRUE;            OK = TRUE;
2054            break;            break;
2055    
# Line 1921  for (;;) Line 2065  for (;;)
2065              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
2066              next_active_state--;              next_active_state--;
2067              }              }
2068            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
2069              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2070            else            else
2071              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
2072            }            }
# Line 1941  for (;;) Line 2085  for (;;)
2085        break;        break;
2086    
2087        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2088        case OP_CHARNC:        case OP_CHARI:
2089        if (clen == 0) break;        if (clen == 0) break;
2090    
2091  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2092        if (utf8)        if (utf)
2093          {          {
2094          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2095            {            {
2096            unsigned int othercase;            unsigned int othercase;
2097            if (c < 128) othercase = fcc[c]; else            if (c < 128)
2098                othercase = fcc[c];
2099            /* If we have Unicode property support, we can use it to test the            else
2100            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2101                other case of the character. */
2102  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2103            othercase = UCD_OTHERCASE(c);              othercase = UCD_OTHERCASE(c);
2104  #else  #else
2105            othercase = NOTACHAR;              othercase = NOTACHAR;
2106  #endif  #endif
2107    
2108            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2109            }            }
2110          }          }
2111        else        else
2112  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2113          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2114          {          {
2115          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2116              { ADD_NEW(state_offset + 2, 0); }
2117          }          }
2118        break;        break;
2119    
# Line 1981  for (;;) Line 2125  for (;;)
2125        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2126    
2127        case OP_EXTUNI:        case OP_EXTUNI:
2128        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
2129          {          {
2130          const uschar *nptr = ptr + clen;          int lgb, rgb;
2131            const pcre_uchar *nptr = ptr + clen;
2132          int ncount = 0;          int ncount = 0;
2133            lgb = UCD_GRAPHBREAK(c);
2134          while (nptr < end_subject)          while (nptr < end_subject)
2135            {            {
2136            int nclen = 1;            dlen = 1;
2137            GETCHARLEN(c, nptr, nclen);            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2138            if (UCD_CATEGORY(c) != ucp_M) break;            rgb = UCD_GRAPHBREAK(d);
2139              if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2140            ncount++;            ncount++;
2141            nptr += nclen;            lgb = rgb;
2142              nptr += dlen;
2143            }            }
2144            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2145                reset_could_continue = TRUE;
2146          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2147          }          }
2148        break;        break;
# Line 2006  for (;;) Line 2156  for (;;)
2156        case OP_ANYNL:        case OP_ANYNL:
2157        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2158          {          {
2159          case 0x000b:          case CHAR_VT:
2160          case 0x000c:          case CHAR_FF:
2161          case 0x0085:          case CHAR_NEL:
2162    #ifndef EBCDIC
2163          case 0x2028:          case 0x2028:
2164          case 0x2029:          case 0x2029:
2165    #endif  /* Not EBCDIC */
2166          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2167    
2168          case 0x000a:          case CHAR_LF:
2169          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2170          break;          break;
2171    
2172          case 0x000d:          case CHAR_CR:
2173          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 >= end_subject)
2174              {
2175              ADD_NEW(state_offset + 1, 0);
2176              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2177                reset_could_continue = TRUE;
2178              }
2179            else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2180            {            {
2181            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2182            }            }
# Line 2034  for (;;) Line 2192  for (;;)
2192        case OP_NOT_VSPACE:        case OP_NOT_VSPACE:
2193        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2194          {          {
2195          case 0x000a:          VSPACE_CASES:
         case 0x000b:  
         case 0x000c:  
         case 0x000d:  
         case 0x0085:  
         case 0x2028:  
         case 0x2029:  
2196          break;          break;
2197    
2198          default:          default:
# Line 2053  for (;;) Line 2205  for (;;)
2205        case OP_VSPACE:        case OP_VSPACE:
2206        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2207          {          {
2208          case 0x000a:          VSPACE_CASES:
         case 0x000b:  
         case 0x000c:  
         case 0x000d:  
         case 0x0085:  
         case 0x2028:  
         case 0x2029:  
2209          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2210          break;          break;
2211    
2212          default: break;          default:
2213            break;
2214          }          }
2215        break;        break;
2216    
# Line 2071  for (;;) Line 2218  for (;;)
2218        case OP_NOT_HSPACE:        case OP_NOT_HSPACE:
2219        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2220          {          {
2221          case 0x09:      /* HT */          HSPACE_CASES:
         case 0x20:      /* SPACE */  
         case 0xa0:      /* NBSP */  
         case 0x1680:    /* OGHAM SPACE MARK */  
         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
         case 0x2000:    /* EN QUAD */  
         case 0x2001:    /* EM QUAD */  
         case 0x2002:    /* EN SPACE */  
         case 0x2003:    /* EM SPACE */  
         case 0x2004:    /* THREE-PER-EM SPACE */  
         case 0x2005:    /* FOUR-PER-EM SPACE */  
         case 0x2006:    /* SIX-PER-EM SPACE */  
         case 0x2007:    /* FIGURE SPACE */  
         case 0x2008:    /* PUNCTUATION SPACE */  
         case 0x2009:    /* THIN SPACE */  
         case 0x200A:    /* HAIR SPACE */  
         case 0x202f:    /* NARROW NO-BREAK SPACE */  
         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
         case 0x3000:    /* IDEOGRAPHIC SPACE */  
2222          break;          break;
2223    
2224          default:          default:
# Line 2102  for (;;) Line 2231  for (;;)
2231        case OP_HSPACE:        case OP_HSPACE:
2232        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2233          {          {
2234          case 0x09:      /* HT */          HSPACE_CASES:
         case 0x20:      /* SPACE */  
         case 0xa0:      /* NBSP */  
         case 0x1680:    /* OGHAM SPACE MARK */  
         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
         case 0x2000:    /* EN QUAD */  
         case 0x2001:    /* EM QUAD */  
         case 0x2002:    /* EN SPACE */  
         case 0x2003:    /* EM SPACE */  
         case 0x2004:    /* THREE-PER-EM SPACE */  
         case 0x2005:    /* FOUR-PER-EM SPACE */  
         case 0x2006:    /* SIX-PER-EM SPACE */  
         case 0x2007:    /* FIGURE SPACE */  
         case 0x2008:    /* PUNCTUATION SPACE */  
         case 0x2009:    /* THIN SPACE */  
         case 0x200A:    /* HAIR SPACE */  
         case 0x202f:    /* NARROW NO-BREAK SPACE */  
         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
         case 0x3000:    /* IDEOGRAPHIC SPACE */  
2235          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2236          break;          break;
2237    
2238            default:
2239            break;
2240          }          }
2241        break;        break;
2242    
2243        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2244        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character casefully. */
       characters, that is, we know that d < 256. The character we are  
       checking (c) can be multibyte. */  
2245    
2246        case OP_NOT:        case OP_NOT:
2247          if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2248          break;
2249    
2250          /*-----------------------------------------------------------------*/
2251          /* Match a negated single character caselessly. */
2252    
2253          case OP_NOTI:
2254        if (clen > 0)        if (clen > 0)
2255          {          {
2256          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;          unsigned int otherd;
2257          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }  #ifdef SUPPORT_UTF
2258            if (utf && d >= 128)
2259              {
2260    #ifdef SUPPORT_UCP
2261              otherd = UCD_OTHERCASE(d);
2262    #endif  /* SUPPORT_UCP */
2263              }
2264            else
2265    #endif  /* SUPPORT_UTF */
2266            otherd = TABLE_GET(d, fcc, d);
2267            if (c != d && c != otherd)
2268              { ADD_NEW(state_offset + dlen + 1, 0); }
2269          }          }
2270        break;        break;
2271    
2272        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2273          case OP_PLUSI:
2274          case OP_MINPLUSI:
2275          case OP_POSPLUSI:
2276          case OP_NOTPLUSI:
2277          case OP_NOTMINPLUSI:
2278          case OP_NOTPOSPLUSI:
2279          caseless = TRUE;
2280          codevalue -= OP_STARI - OP_STAR;
2281    
2282          /* Fall through */
2283        case OP_PLUS:        case OP_PLUS:
2284        case OP_MINPLUS:        case OP_MINPLUS:
2285        case OP_POSPLUS:        case OP_POSPLUS:
# Line 2150  for (;;) Line 2290  for (;;)
2290        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2291        if (clen > 0)        if (clen > 0)
2292          {          {
2293          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2294          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2295            {            {
2296  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2297            if (utf8 && d >= 128)            if (utf && d >= 128)
2298              {              {
2299  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2300              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2301  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2302              }              }
2303            else            else
2304  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2305            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2306            }            }
2307          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2308            {            {
# Line 2179  for (;;) Line 2319  for (;;)
2319        break;        break;
2320    
2321        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2322          case OP_QUERYI:
2323          case OP_MINQUERYI:
2324          case OP_POSQUERYI:
2325          case OP_NOTQUERYI:
2326          case OP_NOTMINQUERYI:
2327          case OP_NOTPOSQUERYI:
2328          caseless = TRUE;
2329          codevalue -= OP_STARI - OP_STAR;
2330          /* Fall through */
2331        case OP_QUERY:        case OP_QUERY:
2332        case OP_MINQUERY:        case OP_MINQUERY:
2333        case OP_POSQUERY:        case OP_POSQUERY:
# Line 2188  for (;;) Line 2337  for (;;)
2337        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2338        if (clen > 0)        if (clen > 0)
2339          {          {
2340          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2341          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2342            {            {
2343  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2344            if (utf8 && d >= 128)            if (utf && d >= 128)
2345              {              {
2346  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2347              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2348  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2349              }              }
2350            else            else
2351  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2352            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2353            }            }
2354          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2355            {            {
# Line 2215  for (;;) Line 2364  for (;;)
2364        break;        break;
2365    
2366        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2367          case OP_STARI:
2368          case OP_MINSTARI:
2369          case OP_POSSTARI:
2370          case OP_NOTSTARI:
2371          case OP_NOTMINSTARI:
2372          case OP_NOTPOSSTARI:
2373          caseless = TRUE;
2374          codevalue -= OP_STARI - OP_STAR;
2375          /* Fall through */
2376        case OP_STAR:        case OP_STAR:
2377        case OP_MINSTAR:        case OP_MINSTAR:
2378        case OP_POSSTAR:        case OP_POSSTAR:
# Line 2224  for (;;) Line 2382  for (;;)
2382        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2383        if (clen > 0)        if (clen > 0)
2384          {          {
2385          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2386          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2387            {            {
2388  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2389            if (utf8 && d >= 128)            if (utf && d >= 128)
2390              {              {
2391  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2392              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2393  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2394              }              }
2395            else            else
2396  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2397            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2398            }            }
2399          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2400            {            {
# Line 2251  for (;;) Line 2409  for (;;)
2409        break;        break;
2410    
2411        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2412          case OP_EXACTI:
2413          case OP_NOTEXACTI:
2414          caseless = TRUE;
2415          codevalue -= OP_STARI - OP_STAR;
2416          /* Fall through */
2417        case OP_EXACT:        case OP_EXACT:
2418        case OP_NOTEXACT:        case OP_NOTEXACT:
2419        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2420        if (clen > 0)        if (clen > 0)
2421          {          {
2422          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2423          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2424            {            {
2425  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2426            if (utf8 && d >= 128)            if (utf && d >= 128)
2427              {              {
2428  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2429              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2430  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2431              }              }
2432            else            else
2433  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2434            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2435            }            }
2436          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2437            {            {
2438            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
2439              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2440            else            else
2441              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2442            }            }
# Line 2281  for (;;) Line 2444  for (;;)
2444        break;        break;
2445    
2446        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2447          case OP_UPTOI:
2448          case OP_MINUPTOI:
2449          case OP_POSUPTOI:
2450          case OP_NOTUPTOI:
2451          case OP_NOTMINUPTOI:
2452          case OP_NOTPOSUPTOI:
2453          caseless = TRUE;
2454          codevalue -= OP_STARI - OP_STAR;
2455          /* Fall through */
2456        case OP_UPTO:        case OP_UPTO:
2457        case OP_MINUPTO:        case OP_MINUPTO:
2458        case OP_POSUPTO:        case OP_POSUPTO:
2459        case OP_NOTUPTO:        case OP_NOTUPTO:
2460        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2461        case OP_NOTPOSUPTO:        case OP_NOTPOSUPTO:
2462        ADD_ACTIVE(state_offset + dlen + 3, 0);        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2463        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2464        if (clen > 0)        if (clen > 0)
2465          {          {
2466          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2467          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2468            {            {
2469  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2470            if (utf8 && d >= 128)            if (utf && d >= 128)
2471              {              {
2472  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2473              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2474  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2475              }              }
2476            else            else
2477  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2478            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2479            }            }
2480          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2481            {            {
# Line 2312  for (;;) Line 2484  for (;;)
2484              active_count--;             /* Remove non-match possibility */              active_count--;             /* Remove non-match possibility */
2485              next_active_state--;              next_active_state--;
2486              }              }
2487            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
2488              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2489            else            else
2490              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2491            }            }
# Line 2330  for (;;) Line 2502  for (;;)
2502          {          {
2503          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2504          int next_state_offset;          int next_state_offset;
2505          const uschar *ecode;          const pcre_uchar *ecode;
2506    
2507          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2508          can set isinclass from it. */          can set isinclass from it. */
2509    
2510          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2511            {            {
2512            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2513            if (clen > 0)            if (clen > 0)
2514              {              {
2515              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2516                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2517              }              }
2518            }            }
2519    
# Line 2352  for (;;) Line 2524  for (;;)
2524          else          else
2525           {           {
2526           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2527           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2528           }           }
2529    
2530          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
# Line 2385  for (;;) Line 2557  for (;;)
2557            case OP_CRRANGE:            case OP_CRRANGE:
2558            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2559            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2560            if (count >= GET2(ecode, 1))            if (count >= (int)GET2(ecode, 1))
2561              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2562            if (isinclass)            if (isinclass)
2563              {              {
2564              int max = GET2(ecode, 3);              int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2565              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2566                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2567              else              else
2568                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2569              }              }
# Line 2422  for (;;) Line 2594  for (;;)
2594          int rc;          int rc;
2595          int local_offsets[2];          int local_offsets[2];
2596          int local_workspace[1000];          int local_workspace[1000];
2597          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2598    
2599          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2600    
# Line 2435  for (;;) Line 2607  for (;;)
2607            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2608            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2609            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2610            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2611    
2612          if (rc == PCRE_ERROR_DFA_UITEM) return rc;          if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2613          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
# Line 2461  for (;;) Line 2631  for (;;)
2631          if (code[LINK_SIZE+1] == OP_CALLOUT)          if (code[LINK_SIZE+1] == OP_CALLOUT)
2632            {            {
2633            rrc = 0;            rrc = 0;
2634            if (pcre_callout != NULL)            if (PUBL(callout) != NULL)
2635              {              {
2636              pcre_callout_block cb;              PUBL(callout_block) cb;
2637              cb.version          = 1;   /* Version 1 of the callout block */              cb.version          = 1;   /* Version 1 of the callout block */
2638              cb.callout_number   = code[LINK_SIZE+2];              cb.callout_number   = code[LINK_SIZE+2];
2639              cb.offset_vector    = offsets;              cb.offset_vector    = offsets;
2640    #if defined COMPILE_PCRE8
2641              cb.subject          = (PCRE_SPTR)start_subject;              cb.subject          = (PCRE_SPTR)start_subject;
2642    #elif defined COMPILE_PCRE16
2643                cb.subject          = (PCRE_SPTR16)start_subject;
2644    #elif defined COMPILE_PCRE32
2645                cb.subject          = (PCRE_SPTR32)start_subject;
2646    #endif
2647              cb.subject_length   = (int)(end_subject - start_subject);              cb.subject_length   = (int)(end_subject - start_subject);
2648              cb.start_match      = (int)(current_subject - start_subject);              cb.start_match      = (int)(current_subject - start_subject);
2649              cb.current_position = (int)(ptr - start_subject);              cb.current_position = (int)(ptr - start_subject);
# Line 2476  for (;;) Line 2652  for (;;)
2652              cb.capture_top      = 1;              cb.capture_top      = 1;
2653              cb.capture_last     = -1;              cb.capture_last     = -1;
2654              cb.callout_data     = md->callout_data;              cb.callout_data     = md->callout_data;
2655              if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */              cb.mark             = NULL;   /* No (*MARK) support */
2656                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2657              }              }
2658            if (rrc > 0) break;                      /* Fail this thread */            if (rrc > 0) break;                      /* Fail this thread */
2659            code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */            code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2660            }            }
2661    
2662          condcode = code[LINK_SIZE+1];          condcode = code[LINK_SIZE+1];
# Line 2500  for (;;) Line 2677  for (;;)
2677    
2678          else if (condcode == OP_RREF || condcode == OP_NRREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2679            {            {
2680            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE + 2);
2681            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2682            if (recursing > 0)            if (md->recursive != NULL)
2683              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2684            else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }            else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2685            }            }
2686    
# Line 2512  for (;;) Line 2689  for (;;)
2689          else          else
2690            {            {
2691            int rc;            int rc;
2692            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2693            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2694    
2695            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2696    
# Line 2526  for (;;) Line 2703  for (;;)
2703              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2704              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2705              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2706              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2707    
2708            if (rc == PCRE_ERROR_DFA_UITEM) return rc;            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2709            if ((rc >= 0) ==            if ((rc >= 0) ==
# Line 2543  for (;;) Line 2718  for (;;)
2718        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2719        case OP_RECURSE:        case OP_RECURSE:
2720          {          {
2721            dfa_recursion_info *ri;
2722          int local_offsets[1000];          int local_offsets[1000];
2723          int local_workspace[1000];          int local_workspace[1000];
2724            const pcre_uchar *callpat = start_code + GET(code, 1);
2725            int recno = (callpat == md->start_code)? 0 :
2726              GET2(callpat, 1 + LINK_SIZE);
2727          int rc;          int rc;
2728    
2729          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2730            recursing + 1));  
2731            /* Check for repeating a recursion without advancing the subject
2732            pointer. This should catch convoluted mutual recursions. (Some simple
2733            cases are caught at compile time.) */
2734    
2735            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2736              if (recno == ri->group_num && ptr == ri->subject_position)
2737                return PCRE_ERROR_RECURSELOOP;
2738    
2739            /* Remember this recursion and where we started it so as to
2740            catch infinite loops. */
2741    
2742            new_recursive.group_num = recno;
2743            new_recursive.subject_position = ptr;
2744            new_recursive.prevrec = md->recursive;
2745            md->recursive = &new_recursive;
2746    
2747          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2748            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2749            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2750            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2751            (int)(ptr - start_subject),           /* start offset */            (int)(ptr - start_subject),           /* start offset */
2752            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2753            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2754            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2755            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2756            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing + 1);                       /* regex recurse level */  
2757    
2758          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2759            recursing + 1, rc));  
2760            DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2761              rc));
2762    
2763          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2764    
# Line 2578  for (;;) Line 2772  for (;;)
2772            {            {
2773            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2774              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2775              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2776              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2777                if (utf)
2778                  {
2779                  const pcre_uchar *p = start_subject + local_offsets[rc];
2780                  const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2781                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2782                  }
2783    #endif
2784              if (charcount > 0)              if (charcount > 0)
2785                {                {
2786                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 2597  for (;;) Line 2796  for (;;)
2796        break;        break;
2797    
2798        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2799          case OP_BRAPOS:
2800          case OP_SBRAPOS:
2801          case OP_CBRAPOS:
2802          case OP_SCBRAPOS:
2803          case OP_BRAPOSZERO:
2804            {
2805            int charcount, matched_count;
2806            const pcre_uchar *local_ptr = ptr;
2807            BOOL allow_zero;
2808    
2809            if (codevalue == OP_BRAPOSZERO)
2810              {
2811              allow_zero = TRUE;
2812              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2813              }
2814            else allow_zero = FALSE;
2815    
2816            /* Loop to match the subpattern as many times as possible as if it were
2817            a complete pattern. */
2818    
2819            for (matched_count = 0;; matched_count++)
2820              {
2821              int local_offsets[2];
2822              int local_workspace[1000];
2823    
2824              int rc = internal_dfa_exec(
2825                md,                                   /* fixed match data */
2826                code,                                 /* this subexpression's code */
2827                local_ptr,                            /* where we currently are */
2828                (int)(ptr - start_subject),           /* start offset */
2829                local_offsets,                        /* offset vector */
2830                sizeof(local_offsets)/sizeof(int),    /* size of same */
2831                local_workspace,                      /* workspace vector */
2832                sizeof(local_workspace)/sizeof(int),  /* size of same */
2833                rlevel);                              /* function recursion level */
2834    
2835              /* Failed to match */
2836    
2837              if (rc < 0)
2838                {
2839                if (rc != PCRE_ERROR_NOMATCH) return rc;
2840                break;
2841                }
2842    
2843              /* Matched: break the loop if zero characters matched. */
2844    
2845              charcount = local_offsets[1] - local_offsets[0];
2846              if (charcount == 0) break;
2847              local_ptr += charcount;    /* Advance temporary position ptr */
2848              }
2849    
2850            /* At this point we have matched the subpattern matched_count
2851            times, and local_ptr is pointing to the character after the end of the
2852            last match. */
2853    
2854            if (matched_count > 0 || allow_zero)
2855              {
2856              const pcre_uchar *end_subpattern = code;
2857              int next_state_offset;
2858    
2859              do { end_subpattern += GET(end_subpattern, 1); }
2860                while (*end_subpattern == OP_ALT);
2861              next_state_offset =
2862                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2863    
2864              /* Optimization: if there are no more active states, and there
2865              are no new states yet set up, then skip over the subject string
2866              right here, to save looping. Otherwise, set up the new state to swing
2867              into action when the end of the matched substring is reached. */
2868    
2869              if (i + 1 >= active_count && new_count == 0)
2870                {
2871                ptr = local_ptr;
2872                clen = 0;
2873                ADD_NEW(next_state_offset, 0);
2874                }
2875              else
2876                {
2877                const pcre_uchar *p = ptr;
2878                const pcre_uchar *pp = local_ptr;
2879                charcount = (int)(pp - p);
2880    #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2881                if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2882    #endif
2883                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2884                }
2885              }
2886            }
2887          break;
2888    
2889          /*-----------------------------------------------------------------*/
2890        case OP_ONCE:        case OP_ONCE:
2891          case OP_ONCE_NC:
2892          {          {
2893          int local_offsets[2];          int local_offsets[2];
2894          int local_workspace[1000];          int local_workspace[1000];
# Line 2611  for (;;) Line 2902  for (;;)
2902            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2903            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2904            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2905            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2906    
2907          if (rc >= 0)          if (rc >= 0)
2908            {            {
2909            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2910            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2911            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2912    
2913            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2914              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2915            next_state_offset =            next_state_offset =
2916              (int)(end_subpattern - start_code + LINK_SIZE + 1);              (int)(end_subpattern - start_code + LINK_SIZE + 1);
2917    
2918            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
# Line 2647  for (;;) Line 2936  for (;;)
2936            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2937            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2938            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2939            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2940    
2941            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2942              {              {
# Line 2670  for (;;) Line 2959  for (;;)
2959              }              }
2960            else            else
2961              {              {
2962              const uschar *p = start_subject + local_offsets[0];  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2963              const uschar *pp = start_subject + local_offsets[1];              if (utf)
2964              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;                {
2965                  const pcre_uchar *p = start_subject + local_offsets[0];
2966                  const pcre_uchar *pp = start_subject + local_offsets[1];
2967                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2968                  }
2969    #endif
2970              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2971              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2972                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2973              }              }
   
2974            }            }
2975          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2976          }          }
# Line 2689  for (;;) Line 2982  for (;;)
2982    
2983        case OP_CALLOUT:        case OP_CALLOUT:
2984        rrc = 0;        rrc = 0;
2985        if (pcre_callout != NULL)        if (PUBL(callout) != NULL)
2986          {          {
2987          pcre_callout_block cb;          PUBL(callout_block) cb;
2988          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2989          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2990          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2991    #if defined COMPILE_PCRE8
2992          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2993    #elif defined COMPILE_PCRE16
2994            cb.subject          = (PCRE_SPTR16)start_subject;
2995    #elif defined COMPILE_PCRE32
2996            cb.subject          = (PCRE_SPTR32)start_subject;
2997    #endif
2998          cb.subject_length   = (int)(end_subject - start_subject);          cb.subject_length   = (int)(end_subject - start_subject);
2999          cb.start_match      = (int)(current_subject - start_subject);          cb.start_match      = (int)(current_subject - start_subject);
3000          cb.current_position = (int)(ptr - start_subject);          cb.current_position = (int)(ptr - start_subject);
# Line 2704  for (;;) Line 3003  for (;;)
3003          cb.capture_top      = 1;          cb.capture_top      = 1;
3004          cb.capture_last     = -1;          cb.capture_last     = -1;
3005          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
3006          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
3007            if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3008          }          }
3009        if (rrc == 0)        if (rrc == 0)
3010          { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }          { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3011        break;        break;
3012    
3013    
# Line 2736  for (;;) Line 3036  for (;;)
3036    if (new_count <= 0)    if (new_count <= 0)
3037      {      {
3038      if (rlevel == 1 &&                               /* Top level, and */      if (rlevel == 1 &&                               /* Top level, and */
3039          could_continue &&                            /* Some could go on */          could_continue &&                            /* Some could go on, and */
3040          forced_fail != workspace[1] &&               /* Not all forced fail & */          forced_fail != workspace[1] &&               /* Not all forced fail & */
3041          (                                            /* either... */          (                                            /* either... */
3042          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
# Line 2744  for (;;) Line 3044  for (;;)
3044          ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */          ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3045           match_count < 0)                            /* no matches */           match_count < 0)                            /* no matches */
3046          ) &&                                         /* And... */          ) &&                                         /* And... */
3047          ptr >= end_subject &&                     /* Reached end of subject */          (
3048          ptr > current_subject)                    /* Matched non-empty string */          partial_newline ||                           /* Either partial NL */
3049        {            (                                          /* or ... */
3050        if (offsetcount >= 2)            ptr >= end_subject &&                /* End of subject and */
3051          {            ptr > md->start_used_ptr)            /* Inspected non-empty string */
3052          offsets[0] = (int)(md->start_used_ptr - start_subject);            )
3053          offsets[1] = (int)(end_subject - start_subject);          )
         }  
3054        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
       }  
   
3055      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3056        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3057        rlevel*2-2, SP));        rlevel*2-2, SP));
# Line 2804  Returns:          > 0 => number of match Line 3101  Returns:          > 0 => number of match
3101                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3102  */  */
3103    
3104    #if defined COMPILE_PCRE8
3105  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3106  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3107    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3108    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3109    #elif defined COMPILE_PCRE16
3110    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3111    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3112      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3113      int offsetcount, int *workspace, int wscount)
3114    #elif defined COMPILE_PCRE32
3115    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3116    pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3117      PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3118      int offsetcount, int *workspace, int wscount)
3119    #endif
3120  {  {
3121  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3122  dfa_match_data match_block;  dfa_match_data match_block;
3123  dfa_match_data *md = &match_block;  dfa_match_data *md = &match_block;
3124  BOOL utf8, anchored, startline, firstline;  BOOL utf, anchored, startline, firstline;
3125  const uschar *current_subject, *end_subject, *lcc;  const pcre_uchar *current_subject, *end_subject;
   
 pcre_study_data internal_study;  
3126  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3127    
3128  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3129  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3130  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3131  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3132  int first_byte = -1;  pcre_uchar first_char = 0;
3133  int req_byte = -1;  pcre_uchar first_char2 = 0;
3134  int req_byte2 = -1;  pcre_uchar req_char = 0;
3135    pcre_uchar req_char2 = 0;
3136  int newline;  int newline;
3137    
3138  /* Plausibility checks */  /* Plausibility checks */
# Line 2835  if (re == NULL || subject == NULL || wor Line 3142  if (re == NULL || subject == NULL || wor
3142     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3143  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3144  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3145    if (length < 0) return PCRE_ERROR_BADLENGTH;
3146    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3147    
3148    /* Check that the first field in the block is the magic number. If it is not,
3149    return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3150    REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3151    means that the pattern is likely compiled with different endianness. */
3152    
3153    if (re->magic_number != MAGIC_NUMBER)
3154      return re->magic_number == REVERSED_MAGIC_NUMBER?
3155        PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3156    if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3157    
3158  /* We need to find the pointer to any study data before we test for byte  /* If restarting after a partial match, do some sanity checks on the contents
3159  flipping, so we scan the extra_data block first. This may set two fields in the  of the workspace. */
3160  match block, so we must initialize them beforehand. However, the other fields  
3161  in the match block must not be set until after the byte flipping. */  if ((options & PCRE_DFA_RESTART) != 0)
3162      {
3163      if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3164        workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3165          return PCRE_ERROR_DFA_BADRESTART;
3166      }
3167    
3168    /* Set up study, callout, and table data */
3169    
3170  md->tables = re->tables;  md->tables = re->tables;
3171  md->callout_data = NULL;  md->callout_data = NULL;
# Line 2858  if (extra_data != NULL) Line 3184  if (extra_data != NULL)
3184      md->tables = extra_data->tables;      md->tables = extra_data->tables;
3185    }    }
3186    
 /* Check that the first field in the block is the magic number. If it is not,  
 test for a regex that was compiled on a host of opposite endianness. If this is  
 the case, flipped values are put in internal_re and internal_study if there was  
 study data too. */  
   
 if (re->magic_number != MAGIC_NUMBER)  
   {  
   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);  
   if (re == NULL) return PCRE_ERROR_BADMAGIC;  
   if (study != NULL) study = &internal_study;  
   }  
   
3187  /* Set some local values */  /* Set some local values */
3188    
3189  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3190  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3191  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3192    
3193  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3194  utf8 = (re->options & PCRE_UTF8) != 0;  /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3195    utf = (re->options & PCRE_UTF8) != 0;
3196  #else  #else
3197  utf8 = FALSE;  utf = FALSE;
3198  #endif  #endif
3199    
3200  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
# Line 2887  anchored = (options & (PCRE_ANCHORED|PCR Line 3202  anchored = (options & (PCRE_ANCHORED|PCR
3202    
3203  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3204    
3205  md->start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3206      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3207  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3208  md->end_subject = end_subject;  md->end_subject = end_subject;
3209  md->start_offset = start_offset;  md->start_offset = start_offset;
3210  md->moptions = options;  md->moptions = options;
# Line 2950  else Line 3265  else
3265  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3266  back the character offset. */  back the character offset. */
3267    
3268  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3269  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3270    {    {
3271    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3272      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3273    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3274      {      {
3275      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3276        {        {
3277        tb &= 0xc0;        offsets[0] = erroroffset;
3278        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3279        }        }
3280    #if defined COMPILE_PCRE8
3281        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3282          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3283    #elif defined COMPILE_PCRE16
3284        return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3285          PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3286    #elif defined COMPILE_PCRE32
3287        return PCRE_ERROR_BADUTF32;
3288    #endif
3289      }      }
3290    #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3291      if (start_offset > 0 && start_offset < length &&
3292            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3293        return PCRE_ERROR_BADUTF8_OFFSET;
3294    #endif
3295    }    }
3296  #endif  #endif
3297    
# Line 2971  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3299  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3299  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3300  in other programs later. */  in other programs later. */
3301    
3302  if (md->tables == NULL) md->tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3303    
3304  /* The lower casing table and the "must be at the start of a line" flag are  /* The "must be at the start of a line" flags are used in a loop when finding
3305  used in a loop when finding where to start. */  where to start. */
3306    
 lcc = md->tables + lcc_offset;  
3307  startline = (re->flags & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
3308  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3309    
# Line 2990  if (!anchored) Line 3317  if (!anchored)
3317    {    {
3318    if ((re->flags & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3319      {      {
3320      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3321      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = (pcre_uchar)(re->first_char);
3322        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3323          {
3324          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3325    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3326          if (utf && first_char > 127)
3327            first_char2 = UCD_OTHERCASE(first_char);
3328    #endif
3329          }
3330      }      }
3331    else    else
3332      {      {
# Line 3007  character" set. */ Line 3341  character" set. */
3341    
3342  if ((re->flags & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3343    {    {
3344    req_byte = re->req_byte & 255;    has_req_char = TRUE;
3345    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_char = req_char2 = (pcre_uchar)(re->req_char);
3346    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3347        {
3348        req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3349    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3350        if (utf && req_char > 127)
3351          req_char2 = UCD_OTHERCASE(req_char);
3352    #endif
3353        }
3354    }    }
3355    
3356  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
# Line 3022  for (;;) Line 3363  for (;;)
3363    
3364    if ((options & PCRE_DFA_RESTART) == 0)    if ((options & PCRE_DFA_RESTART) == 0)
3365      {      {
3366      const uschar *save_end_subject = end_subject;      const pcre_uchar *save_end_subject = end_subject;
3367    
3368      /* If firstline is TRUE, the start of the match is constrained to the first      /* If firstline is TRUE, the start of the match is constrained to the first
3369      line of a multiline string. Implement this by temporarily adjusting      line of a multiline string. Implement this by temporarily adjusting
# Line 3031  for (;;) Line 3372  for (;;)
3372    
3373      if (firstline)      if (firstline)
3374        {        {
3375        USPTR t = current_subject;        PCRE_PUCHAR t = current_subject;
3376  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3377        if (utf8)        if (utf)
3378          {          {
3379          while (t < md->end_subject && !IS_NEWLINE(t))          while (t < md->end_subject && !IS_NEWLINE(t))
3380            {            {
3381            t++;            t++;
3382            while (t < end_subject && (*t & 0xc0) == 0x80) t++;            ACROSSCHAR(t < end_subject, *t, t++);
3383            }            }
3384          }          }
3385        else        else
# Line 3049  for (;;) Line 3390  for (;;)
3390    
3391      /* There are some optimizations that avoid running the match if a known      /* There are some optimizations that avoid running the match if a known
3392      starting point is not found. However, there is an option that disables      starting point is not found. However, there is an option that disables
3393      these, for testing and for ensuring that all callouts do actually occur. */      these, for testing and for ensuring that all callouts do actually occur.
3394        The option can be set in the regex by (*NO_START_OPT) or passed in
3395        match-time options. */
3396    
3397      if ((options & PCRE_NO_START_OPTIMIZE) == 0)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3398        {        {
3399        /* Advance to a known first byte. */        /* Advance to a known first char. */
3400    
3401        if (first_byte >= 0)        if (has_first_char)
3402          {          {
3403          if (first_byte_caseless)          if (first_char != first_char2)
3404              {
3405              pcre_uchar csc;
3406            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3407                   lcc[*current_subject] != first_byte)                   (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3408              current_subject++;              current_subject++;
3409              }
3410          else          else
3411            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3412                   *current_subject != first_byte)                   RAWUCHARTEST(current_subject) != first_char)
3413              current_subject++;              current_subject++;
3414          }          }
3415    
# Line 3073  for (;;) Line 3419  for (;;)
3419          {          {
3420          if (current_subject > md->start_subject + start_offset)          if (current_subject > md->start_subject + start_offset)
3421            {            {
3422  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3423            if (utf8)            if (utf)
3424              {              {
3425              while (current_subject < end_subject &&              while (current_subject < end_subject &&
3426                     !WAS_NEWLINE(current_subject))                     !WAS_NEWLINE(current_subject))
3427                {                {
3428                current_subject++;                current_subject++;
3429                while(current_subject < end_subject &&                ACROSSCHAR(current_subject < end_subject, *current_subject,
3430                      (*current_subject & 0xc0) == 0x80)                  current_subject++);
                 current_subject++;  
3431                }                }
3432              }              }
3433            else            else
# Line 3094  for (;;) Line 3439  for (;;)
3439            ANYCRLF, and we are now at a LF, advance the match position by one            ANYCRLF, and we are now at a LF, advance the match position by one
3440            more character. */            more character. */
3441    
3442            if (current_subject[-1] == CHAR_CR &&            if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3443                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3444                 current_subject < end_subject &&                 current_subject < end_subject &&
3445                 *current_subject == CHAR_NL)                 RAWUCHARTEST(current_subject) == CHAR_NL)
3446              current_subject++;              current_subject++;
3447            }            }
3448          }          }
# Line 3108  for (;;) Line 3453  for (;;)
3453          {          {
3454          while (current_subject < end_subject)          while (current_subject < end_subject)
3455            {            {
3456            register unsigned int c = *current_subject;            register pcre_uint32 c = RAWUCHARTEST(current_subject);
3457            if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;  #ifndef COMPILE_PCRE8
3458              else break;            if (c > 255) c = 255;
3459    #endif
3460              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3461                {
3462                current_subject++;
3463    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3464                /* In non 8-bit mode, the iteration will stop for
3465                characters > 255 at the beginning or not stop at all. */
3466                if (utf)
3467                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3468                    current_subject++);
3469    #endif
3470                }
3471              else break;
3472            }            }
3473          }          }
3474        }        }
# Line 3123  for (;;) Line 3481  for (;;)
3481      disabling is explicitly requested (and of course, by the test above, this      disabling is explicitly requested (and of course, by the test above, this
3482      code is not obeyed when restarting after a partial match). */      code is not obeyed when restarting after a partial match). */
3483    
3484      if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3485          (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)          (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3486        {        {
3487        /* If the pattern was studied, a minimum subject length may be set. This        /* If the pattern was studied, a minimum subject length may be set. This
# Line 3135  for (;;) Line 3493  for (;;)
3493            (pcre_uint32)(end_subject - current_subject) < study->minlength)            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3494          return PCRE_ERROR_NOMATCH;          return PCRE_ERROR_NOMATCH;
3495    
3496        /* If req_byte is set, we know that that character must appear in the        /* If req_char is set, we know that that character must appear in the
3497        subject for the match to succeed. If the first character is set, req_byte        subject for the match to succeed. If the first character is set, req_char
3498        must be later in the subject; otherwise the test starts at the match        must be later in the subject; otherwise the test starts at the match
3499        point. This optimization can save a huge amount of work in patterns with        point. This optimization can save a huge amount of work in patterns with
3500        nested unlimited repeats that aren't going to match. Writing separate        nested unlimited repeats that aren't going to match. Writing separate
# Line 3148  for (;;) Line 3506  for (;;)
3506        patterns. This showed up when somebody was matching /^C/ on a 32-megabyte        patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3507        string... so we don't do this when the string is sufficiently long. */        string... so we don't do this when the string is sufficiently long. */
3508    
3509        if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)        if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3510          {          {
3511          register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);          register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3512    
3513          /* We don't need to repeat the search if we haven't yet reached the          /* We don't need to repeat the search if we haven't yet reached the
3514          place we found it at last time. */          place we found it at last time. */
3515    
3516          if (p > req_byte_ptr)          if (p > req_char_ptr)
3517            {            {
3518            if (req_byte_caseless)            if (req_char != req_char2)
3519              {              {
3520              while (p < end_subject)              while (p < end_subject)
3521                {                {
3522                register int pp = *p++;                register pcre_uint32 pp = RAWUCHARINCTEST(p);
3523                if (pp == req_byte || pp == req_byte2) { p--; break; }                if (pp == req_char || pp == req_char2) { p--; break; }
3524                }                }
3525              }              }
3526            else            else
3527              {              {
3528              while (p < end_subject)              while (p < end_subject)
3529                {                {
3530                if (*p++ == req_byte) { p--; break; }                if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3531                }                }
3532              }              }
3533    
# Line 3182  for (;;) Line 3540  for (;;)
3540            found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3541            the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3542    
3543            req_byte_ptr = p;            req_char_ptr = p;
3544            }            }
3545          }          }
3546        }        }
# Line 3191  for (;;) Line 3549  for (;;)
3549    /* OK, now we can do the business */    /* OK, now we can do the business */
3550    
3551    md->start_used_ptr = current_subject;    md->start_used_ptr = current_subject;
3552      md->recursive = NULL;
3553    
3554    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3555      md,                                /* fixed match data */      md,                                /* fixed match data */
# Line 3201  for (;;) Line 3560  for (;;)
3560      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3561      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3562      wscount,                           /* size of same */      wscount,                           /* size of same */
3563      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3564    
3565    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3566    on only if not anchored. */    on only if not anchored. */
3567    
3568    if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;    if (rc != PCRE_ERROR_NOMATCH || anchored)
3569        {
3570        if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3571          {
3572          offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3573          offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3574          if (offsetcount > 2)
3575            offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3576          }
3577        return rc;
3578        }
3579    
3580    /* Advance to the next subject character unless we are at the end of a line    /* Advance to the next subject character unless we are at the end of a line
3581    and firstline is set. */    and firstline is set. */
3582    
3583    if (firstline && IS_NEWLINE(current_subject)) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3584    current_subject++;    current_subject++;
3585    if (utf8)  #ifdef SUPPORT_UTF
3586      if (utf)
3587      {      {
3588      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      ACROSSCHAR(current_subject < end_subject, *current_subject,
3589        current_subject++;        current_subject++);
3590      }      }
3591    #endif
3592    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3593    
3594    /* If we have just passed a CR and we are now at a LF, and the pattern does    /* If we have just passed a CR and we are now at a LF, and the pattern does
3595    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
3596    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
3597    
3598    if (current_subject[-1] == CHAR_CR &&    if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3599        current_subject < end_subject &&        current_subject < end_subject &&
3600        *current_subject == CHAR_NL &&        RAWUCHARTEST(current_subject) == CHAR_NL &&
3601        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
3602          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
3603           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.530  
changed lines
  Added in v.1364

  ViewVC Help
Powered by ViewVC 1.1.5