/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 473 by ph10, Sat Jan 2 12:40:07 2010 UTC | revision 1365 by ph10, Sun Oct 6 18:33:56 2013 UTC
# Line 7  and semantics are as close as possible t Line 7  and semantics are as close as possible t
7  below for why this module is different).  below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2013 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 38  POSSIBILITY OF SUCH DAMAGE. Line 38  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
39  */  */
40    
   
41  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
42  alternative matching function that uses a sort of DFA algorithm (not a true  alternative matching function that uses a sort of DFA algorithm (not a true
43  FSM). This is NOT Perl- compatible, but it has advantages in certain  FSM). This is NOT Perl-compatible, but it has advantages in certain
44  applications. */  applications. */
45    
46    
# Line 106  never stored, so we push them well clear Line 105  never stored, so we push them well clear
105    
106    
107  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
108  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
109  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
110  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111  small value. Non-zero values in the table are the offsets from the opcode where  small value. Non-zero values in the table are the offsets from the opcode where
112  the character is to be found. ***NOTE*** If the start of this table is  the character is to be found. ***NOTE*** If the start of this table is
113  modified, the three tables that follow must also be modified. */  modified, the three tables that follow must also be modified. */
114    
115  static const uschar coptable[] = {  static const pcre_uint8 coptable[] = {
116    0,                             /* End                                    */    0,                             /* End                                    */
117    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119    0, 0, 0,                       /* Any, AllAny, Anybyte                   */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0,                          /* \P, \p                                 */
121    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0,                             /* \X                                     */
123      0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
124    1,                             /* Char                                   */    1,                             /* Char                                   */
125    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
126    1,                             /* not                                    */    1,                             /* not                                    */
127      1,                             /* noti                                   */
128    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
129    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1+IMM2_SIZE,                   /* exact                                  */
132      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135      1+IMM2_SIZE,                   /* exact I                                */
136      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
138    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1+IMM2_SIZE,                   /* NOT exact                              */
141      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144      1+IMM2_SIZE,                   /* NOT exact I                            */
145      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
147    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149    1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */    1+IMM2_SIZE,                   /* Type exact                             */
150      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
152    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 143  static const uschar coptable[] = { Line 155  static const uschar coptable[] = {
155    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
156    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
157    0,                             /* REF                                    */    0,                             /* REF                                    */
158      0,                             /* REFI                                   */
159      0,                             /* DNREF                                  */
160      0,                             /* DNREFI                                 */
161    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
162    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
163    0,                             /* Alt                                    */    0,                             /* Alt                                    */
164    0,                             /* Ket                                    */    0,                             /* Ket                                    */
165    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
166    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
167      0,                             /* KetRpos                                */
168      0,                             /* Reverse                                */
169    0,                             /* Assert                                 */    0,                             /* Assert                                 */
170    0,                             /* Assert not                             */    0,                             /* Assert not                             */
171    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
172    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
173    0,                             /* Reverse                                */    0, 0,                          /* ONCE, ONCE_NC                          */
174    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
175    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
176    0,                             /* CREF                                   */    0, 0,                          /* CREF, DNCREF                           */
177    0,                             /* RREF                                   */    0, 0,                          /* RREF, DNRREF                           */
178    0,                             /* DEF                                    */    0,                             /* DEF                                    */
179    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
180    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
181    0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */    0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
182      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
183      0, 0                           /* CLOSE, SKIPZERO  */
184  };  };
185    
186  /* This table identifies those opcodes that inspect a character. It is used to  /* This table identifies those opcodes that inspect a character. It is used to
# Line 169  remember the fact that a character could Line 188  remember the fact that a character could
188  the subject is reached. ***NOTE*** If the start of this table is modified, the  the subject is reached. ***NOTE*** If the start of this table is modified, the
189  two tables that follow must also be modified. */  two tables that follow must also be modified. */
190    
191  static const uschar poptable[] = {  static const pcre_uint8 poptable[] = {
192    0,                             /* End                                    */    0,                             /* End                                    */
193    0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
194    1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */    1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
195    1, 1, 1,                       /* Any, AllAny, Anybyte                   */    1, 1, 1,                       /* Any, AllAny, Anybyte                   */
196    1, 1, 1,                       /* NOTPROP, PROP, EXTUNI                  */    1, 1,                          /* \P, \p                                 */
197    1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */    1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
198    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    1,                             /* \X                                     */
199      0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
200    1,                             /* Char                                   */    1,                             /* Char                                   */
201    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
202    1,                             /* not                                    */    1,                             /* not                                    */
203      1,                             /* noti                                   */
204    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
205    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
206    1, 1, 1,                       /* upto, minupto, exact                   */    1, 1, 1,                       /* upto, minupto, exact                   */
207    1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */    1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
208      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
209      1, 1, 1,                       /* upto I, minupto I, exact I             */
210      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
211    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
212    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
213    1, 1, 1,                       /* NOT upto, minupto, exact               */    1, 1, 1,                       /* NOT upto, minupto, exact               */
214    1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */    1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
215      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
216      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
217      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
218    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
219    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
220    1, 1, 1,                       /* Type upto, minupto, exact              */    1, 1, 1,                       /* Type upto, minupto, exact              */
# Line 199  static const uschar poptable[] = { Line 226  static const uschar poptable[] = {
226    1,                             /* NCLASS                                 */    1,                             /* NCLASS                                 */
227    1,                             /* XCLASS - variable length               */    1,                             /* XCLASS - variable length               */
228    0,                             /* REF                                    */    0,                             /* REF                                    */
229      0,                             /* REFI                                   */
230      0,                             /* DNREF                                  */
231      0,                             /* DNREFI                                 */
232    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
233    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
234    0,                             /* Alt                                    */    0,                             /* Alt                                    */
235    0,                             /* Ket                                    */    0,                             /* Ket                                    */
236    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
237    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
238      0,                             /* KetRpos                                */
239      0,                             /* Reverse                                */
240    0,                             /* Assert                                 */    0,                             /* Assert                                 */
241    0,                             /* Assert not                             */    0,                             /* Assert not                             */
242    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
243    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
244    0,                             /* Reverse                                */    0, 0,                          /* ONCE, ONCE_NC                          */
245    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
246    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
247    0,                             /* CREF                                   */    0, 0,                          /* CREF, DNCREF                           */
248    0,                             /* RREF                                   */    0, 0,                          /* RREF, DNRREF                           */
249    0,                             /* DEF                                    */    0,                             /* DEF                                    */
250    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
251    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
252    0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */    0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
253      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
254      0, 0                           /* CLOSE, SKIPZERO                        */
255  };  };
256    
257  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
258  and \w */  and \w */
259    
260  static const uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
261    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
262    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
263    ctype_space, ctype_space,    ctype_space, ctype_space,
# Line 231  static const uschar toptable1[] = { Line 265  static const uschar toptable1[] = {
265    0, 0                            /* OP_ANY, OP_ALLANY */    0, 0                            /* OP_ANY, OP_ALLANY */
266  };  };
267    
268  static const uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
269    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
270    ctype_digit, 0,    ctype_digit, 0,
271    ctype_space, 0,    ctype_space, 0,
# Line 248  these structures in, is a vector of ints Line 282  these structures in, is a vector of ints
282  typedef struct stateblock {  typedef struct stateblock {
283    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
284    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
285    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
286  } stateblock;  } stateblock;
287    
288  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
289    
290    
291  #ifdef DEBUG  #ifdef PCRE_DEBUG
292  /*************************************************  /*************************************************
293  *             Print character string             *  *             Print character string             *
294  *************************************************/  *************************************************/
# Line 271  Returns:       nothing Line 304  Returns:       nothing
304  */  */
305    
306  static void  static void
307  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
308  {  {
309  int c;  pcre_uint32 c;
310  while (length-- > 0)  while (length-- > 0)
311    {    {
312    if (isprint(c = *(p++)))    if (isprint(c = *(p++)))
313      fprintf(f, "%c", c);      fprintf(f, "%c", c);
314    else    else
315      fprintf(f, "\\x%02x", c);      fprintf(f, "\\x{%02x}", c);
316    }    }
317  }  }
318  #endif  #endif
# Line 304  Arguments: Line 337  Arguments:
337    offsetcount       size of same    offsetcount       size of same
338    workspace         vector of workspace    workspace         vector of workspace
339    wscount           size of same    wscount           size of same
   ims               the current ims flags  
340    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
341    
342  Returns:            > 0 => number of match offset pairs placed in offsets  Returns:            > 0 => number of match offset pairs placed in offsets
343                      = 0 => offsets overflowed; longest matches are present                      = 0 => offsets overflowed; longest matches are present
# Line 321  for the current character, one for the f Line 352  for the current character, one for the f
352      { \      { \
353      next_active_state->offset = (x); \      next_active_state->offset = (x); \
354      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
355      next_active_state++; \      next_active_state++; \
356      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
357      } \      } \
# Line 332  for the current character, one for the f Line 362  for the current character, one for the f
362      { \      { \
363      next_active_state->offset = (x); \      next_active_state->offset = (x); \
364      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
365      next_active_state->data   = (z); \      next_active_state->data   = (z); \
366      next_active_state++; \      next_active_state++; \
367      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 344  for the current character, one for the f Line 373  for the current character, one for the f
373      { \      { \
374      next_new_state->offset = (x); \      next_new_state->offset = (x); \
375      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
376      next_new_state++; \      next_new_state++; \
377      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
378      } \      } \
# Line 355  for the current character, one for the f Line 383  for the current character, one for the f
383      { \      { \
384      next_new_state->offset = (x); \      next_new_state->offset = (x); \
385      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
386      next_new_state->data   = (z); \      next_new_state->data   = (z); \
387      next_new_state++; \      next_new_state++; \
388      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
389          (x), (y), (z), __LINE__)); \
390      } \      } \
391    else return PCRE_ERROR_DFA_WSSIZE    else return PCRE_ERROR_DFA_WSSIZE
392    
# Line 367  for the current character, one for the f Line 395  for the current character, one for the f
395  static int  static int
396  internal_dfa_exec(  internal_dfa_exec(
397    dfa_match_data *md,    dfa_match_data *md,
398    const uschar *this_start_code,    const pcre_uchar *this_start_code,
399    const uschar *current_subject,    const pcre_uchar *current_subject,
400    int start_offset,    int start_offset,
401    int *offsets,    int *offsets,
402    int offsetcount,    int offsetcount,
403    int *workspace,    int *workspace,
404    int wscount,    int wscount,
405    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
406  {  {
407  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
408  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
409    
410  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
411  const uschar *ptr;  const pcre_uchar *ptr;
412  const uschar *end_code, *first_op;  const pcre_uchar *end_code, *first_op;
413    
414    dfa_recursion_info new_recursive;
415    
416  int active_count, new_count, match_count;  int active_count, new_count, match_count;
417    
418  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
419  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
420    
421  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
422  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
423  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
424    
425  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
426  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
427  #else  #else
428  BOOL utf8 = FALSE;  BOOL utf = FALSE;
429  #endif  #endif
430    
431    BOOL reset_could_continue = FALSE;
432    
433  rlevel++;  rlevel++;
434  offsetcount &= (-2);  offsetcount &= (-2);
435    
# Line 408  wscount = (wscount - (wscount % (INTS_PE Line 438  wscount = (wscount - (wscount % (INTS_PE
438            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
439    
440  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
441    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
442    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
443    
444  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
445  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 422  next_new_state = new_states = active_sta Line 452  next_new_state = new_states = active_sta
452  new_count = 0;  new_count = 0;
453    
454  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
455    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
456        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
457        ? IMM2_SIZE:0);
458    
459  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
460  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 450  if (*first_op == OP_REVERSE) Line 482  if (*first_op == OP_REVERSE)
482    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
483    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
484    
485  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
486    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
487    
488    if (utf8)    if (utf)
489      {      {
490      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
491        {        {
492        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
493        current_subject--;        current_subject--;
494        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
495        }        }
496      }      }
497    else    else
# Line 471  if (*first_op == OP_REVERSE) Line 501  if (*first_op == OP_REVERSE)
501    
502      {      {
503      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
504        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
505      current_subject -= gone_back;      current_subject -= gone_back;
506      }      }
507    
# Line 488  if (*first_op == OP_REVERSE) Line 518  if (*first_op == OP_REVERSE)
518      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
519      if (back <= gone_back)      if (back <= gone_back)
520        {        {
521        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
522        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
523        }        }
524      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 521  else Line 551  else
551    else    else
552      {      {
553      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
554        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
555            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
556            ? IMM2_SIZE:0);
557      do      do
558        {        {
559        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
560        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
561        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
562        }        }
# Line 534  else Line 566  else
566    
567  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
568    
569  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
570    
571  /* Loop for scanning the subject */  /* Loop for scanning the subject */
572    
# Line 543  for (;;) Line 575  for (;;)
575    {    {
576    int i, j;    int i, j;
577    int clen, dlen;    int clen, dlen;
578    unsigned int c, d;    pcre_uint32 c, d;
579    int forced_fail = 0;    int forced_fail = 0;
580    BOOL could_continue = FALSE;    BOOL partial_newline = FALSE;
581      BOOL could_continue = reset_could_continue;
582      reset_could_continue = FALSE;
583    
584    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
585    new state list. */    new state list. */
# Line 559  for (;;) Line 593  for (;;)
593    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
594    workspace[1] = active_count;    workspace[1] = active_count;
595    
596  #ifdef DEBUG  #ifdef PCRE_DEBUG
597    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
598    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
599    printf("\"\n");    printf("\"\n");
600    
601    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 581  for (;;) Line 615  for (;;)
615    
616    if (ptr < end_subject)    if (ptr < end_subject)
617      {      {
618      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of data items in the character */
619  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
620      if (utf8) { GETCHARLEN(c, ptr, clen); } else      GETCHARLENTEST(c, ptr, clen);
621  #endif  /* SUPPORT_UTF8 */  #else
622      c = *ptr;      c = *ptr;
623    #endif  /* SUPPORT_UTF */
624      }      }
625    else    else
626      {      {
# Line 601  for (;;) Line 636  for (;;)
636    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
637      {      {
638      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
639      const uschar *code;      BOOL caseless = FALSE;
640        const pcre_uchar *code;
641      int state_offset = current_state->offset;      int state_offset = current_state->offset;
642      int count, codevalue, rrc;      int codevalue, rrc;
643        int count;
644    
645  #ifdef DEBUG  #ifdef PCRE_DEBUG
646      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
647      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
648        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
649          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
650  #endif  #endif
651    
     /* This variable is referred to implicitly in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
652      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
653      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
654      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
655        state, arrange for it to be passed on. */
656    
657      if (state_offset < 0)      if (state_offset < 0)
658        {        {
# Line 627  for (;;) Line 661  for (;;)
661          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
662          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
663            current_state->data - 1);            current_state->data - 1);
664            if (could_continue) reset_could_continue = TRUE;
665          continue;          continue;
666          }          }
667        else        else
# Line 666  for (;;) Line 701  for (;;)
701      permitted.      permitted.
702    
703      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
704      argument that is not a data character - but is always one byte long. We      argument that is not a data character - but is always one byte long because
705      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in      the values are small. We have to take special action to deal with  \P, \p,
706      this case. To keep the other cases fast, convert these ones to new opcodes.      \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
707      */      these ones to new opcodes. */
708    
709      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
710        {        {
711        dlen = 1;        dlen = 1;
712  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
713        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
714  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
715        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
716        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
717          {          {
# Line 706  for (;;) Line 741  for (;;)
741    
742      switch (codevalue)      switch (codevalue)
743        {        {
744    /* ========================================================================== */
745          /* These cases are never obeyed. This is a fudge that causes a compile-
746          time error if the vectors coptable or poptable, which are indexed by
747          opcode, are not the correct length. It seems to be the only way to do
748          such a check at compile time, as the sizeof() operator does not work
749          in the C preprocessor. */
750    
751          case OP_TABLE_LENGTH:
752          case OP_TABLE_LENGTH +
753            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
754             (sizeof(poptable) == OP_TABLE_LENGTH)):
755          break;
756    
757  /* ========================================================================== */  /* ========================================================================== */
758        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
759        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
760          state. Note that KETRPOS will always be encountered at the end of the
761          subpattern, because the possessive subpattern repeats are always handled
762          using recursive calls. Thus, it never adds any new states.
763    
764          At the end of the (sub)pattern, unless we have an empty string and
765        PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the        PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
766        start of the subject, save the match data, shifting up all previous        start of the subject, save the match data, shifting up all previous
767        matches so we always have the longest first. */        matches so we always have the longest first. */
# Line 717  for (;;) Line 769  for (;;)
769        case OP_KET:        case OP_KET:
770        case OP_KETRMIN:        case OP_KETRMIN:
771        case OP_KETRMAX:        case OP_KETRMAX:
772          case OP_KETRPOS:
773        if (code != end_code)        if (code != end_code)
774          {          {
775          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 733  for (;;) Line 786  for (;;)
786                  current_subject > start_subject + md->start_offset)))                  current_subject > start_subject + md->start_offset)))
787            {            {
788            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
789              else if (match_count > 0 && ++match_count * 2 >= offsetcount)              else if (match_count > 0 && ++match_count * 2 > offsetcount)
790                match_count = 0;                match_count = 0;
791            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
792            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
793            if (offsetcount >= 2)            if (offsetcount >= 2)
794              {              {
795              offsets[0] = current_subject - start_subject;              offsets[0] = (int)(current_subject - start_subject);
796              offsets[1] = ptr - start_subject;              offsets[1] = (int)(ptr - start_subject);
797              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
798                offsets[1] - offsets[0], current_subject));                offsets[1] - offsets[0], (char *)current_subject));
799              }              }
800            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
801              {              {
# Line 762  for (;;) Line 815  for (;;)
815        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
816        case OP_ALT:        case OP_ALT:
817        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
818        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
819        break;        break;
820    
821        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 770  for (;;) Line 823  for (;;)
823        case OP_SBRA:        case OP_SBRA:
824        do        do
825          {          {
826          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
827          code += GET(code, 1);          code += GET(code, 1);
828          }          }
829        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 779  for (;;) Line 832  for (;;)
832        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
833        case OP_CBRA:        case OP_CBRA:
834        case OP_SCBRA:        case OP_SCBRA:
835        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
836        code += GET(code, 1);        code += GET(code, 1);
837        while (*code == OP_ALT)        while (*code == OP_ALT)
838          {          {
839          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
840          code += GET(code, 1);          code += GET(code, 1);
841          }          }
842        break;        break;
# Line 794  for (;;) Line 847  for (;;)
847        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
848        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
849        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
850        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
851        break;        break;
852    
853        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
854        case OP_SKIPZERO:        case OP_SKIPZERO:
855        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
856        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
857        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
858        break;        break;
859    
860        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
861        case OP_CIRC:        case OP_CIRC:
862        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
863          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
864        break;        break;
865    
866        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
867        case OP_EOD:        case OP_CIRCM:
868        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
869              (ptr != end_subject && WAS_NEWLINE(ptr)))
870            { ADD_ACTIVE(state_offset + 1, 0); }
871        break;        break;
872    
873        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
874        case OP_OPT:        case OP_EOD:
875        ims = code[1];        if (ptr >= end_subject)
876        ADD_ACTIVE(state_offset + 2, 0);          {
877            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
878              could_continue = TRUE;
879            else { ADD_ACTIVE(state_offset + 1, 0); }
880            }
881        break;        break;
882    
883        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 844  for (;;) Line 900  for (;;)
900        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
901        case OP_ANY:        case OP_ANY:
902        if (clen > 0 && !IS_NEWLINE(ptr))        if (clen > 0 && !IS_NEWLINE(ptr))
903          { ADD_NEW(state_offset + 1, 0); }          {
904            if (ptr + 1 >= md->end_subject &&
905                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
906                NLBLOCK->nltype == NLTYPE_FIXED &&
907                NLBLOCK->nllen == 2 &&
908                c == NLBLOCK->nl[0])
909              {
910              could_continue = partial_newline = TRUE;
911              }
912            else
913              {
914              ADD_NEW(state_offset + 1, 0);
915              }
916            }
917        break;        break;
918    
919        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 855  for (;;) Line 924  for (;;)
924    
925        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
926        case OP_EODN:        case OP_EODN:
927        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
928            could_continue = TRUE;
929          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
930          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
931        break;        break;
932    
# Line 863  for (;;) Line 934  for (;;)
934        case OP_DOLL:        case OP_DOLL:
935        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
936          {          {
937          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
938              could_continue = TRUE;
939            else if (clen == 0 ||
940              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
941                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 (ptr == end_subject - md->nllen)
942              ))              ))
943            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
944            else if (ptr + 1 >= md->end_subject &&
945                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
946                     NLBLOCK->nltype == NLTYPE_FIXED &&
947                     NLBLOCK->nllen == 2 &&
948                     c == NLBLOCK->nl[0])
949              {
950              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
951                {
952                reset_could_continue = TRUE;
953                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
954                }
955              else could_continue = partial_newline = TRUE;
956              }
957            }
958          break;
959    
960          /*-----------------------------------------------------------------*/
961          case OP_DOLLM:
962          if ((md->moptions & PCRE_NOTEOL) == 0)
963            {
964            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
965              could_continue = TRUE;
966            else if (clen == 0 ||
967                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
968              { ADD_ACTIVE(state_offset + 1, 0); }
969            else if (ptr + 1 >= md->end_subject &&
970                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
971                     NLBLOCK->nltype == NLTYPE_FIXED &&
972                     NLBLOCK->nllen == 2 &&
973                     c == NLBLOCK->nl[0])
974              {
975              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
976                {
977                reset_could_continue = TRUE;
978                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
979                }
980              else could_continue = partial_newline = TRUE;
981              }
982          }          }
983        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        else if (IS_NEWLINE(ptr))
984          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
985        break;        break;
986    
# Line 900  for (;;) Line 1011  for (;;)
1011    
1012          if (ptr > start_subject)          if (ptr > start_subject)
1013            {            {
1014            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
1015            if (temp < md->start_used_ptr) md->start_used_ptr = temp;            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1016  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1017            if (utf8) BACKCHAR(temp);            if (utf) { BACKCHAR(temp); }
1018  #endif  #endif
1019            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
1020    #ifdef SUPPORT_UCP
1021              if ((md->poptions & PCRE_UCP) != 0)
1022                {
1023                if (d == '_') left_word = TRUE; else
1024                  {
1025                  int cat = UCD_CATEGORY(d);
1026                  left_word = (cat == ucp_L || cat == ucp_N);
1027                  }
1028                }
1029              else
1030    #endif
1031            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1032            }            }
1033          else left_word = 0;          else left_word = FALSE;
1034    
1035          if (clen > 0)          if (clen > 0)
1036              {
1037    #ifdef SUPPORT_UCP
1038              if ((md->poptions & PCRE_UCP) != 0)
1039                {
1040                if (c == '_') right_word = TRUE; else
1041                  {
1042                  int cat = UCD_CATEGORY(c);
1043                  right_word = (cat == ucp_L || cat == ucp_N);
1044                  }
1045                }
1046              else
1047    #endif
1048            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1049          else right_word = 0;            }
1050            else right_word = FALSE;
1051    
1052          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1053            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 931  for (;;) Line 1066  for (;;)
1066        if (clen > 0)        if (clen > 0)
1067          {          {
1068          BOOL OK;          BOOL OK;
1069            const pcre_uint32 *cp;
1070          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1071          switch(code[1])          switch(code[1])
1072            {            {
# Line 939  for (;;) Line 1075  for (;;)
1075            break;            break;
1076    
1077            case PT_LAMP:            case PT_LAMP:
1078            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1079                   prop->chartype == ucp_Lt;
1080            break;            break;
1081    
1082            case PT_GC:            case PT_GC:
1083            OK = _pcre_ucp_gentype[prop->chartype] == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1084            break;            break;
1085    
1086            case PT_PC:            case PT_PC:
# Line 954  for (;;) Line 1091  for (;;)
1091            OK = prop->script == code[2];            OK = prop->script == code[2];
1092            break;            break;
1093    
1094              /* These are specials for combination cases. */
1095    
1096              case PT_ALNUM:
1097              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1098                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1099              break;
1100    
1101              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1102              which means that Perl space and POSIX space are now identical. PCRE
1103              was changed at release 8.34. */
1104    
1105              case PT_SPACE:    /* Perl space */
1106              case PT_PXSPACE:  /* POSIX space */
1107              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1108                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1109                   c == CHAR_FF || c == CHAR_CR;
1110              break;
1111    
1112              case PT_WORD:
1113              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1114                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1115                   c == CHAR_UNDERSCORE;
1116              break;
1117    
1118              case PT_CLIST:
1119              cp = PRIV(ucd_caseless_sets) + code[2];
1120              for (;;)
1121                {
1122                if (c < *cp) { OK = FALSE; break; }
1123                if (c == *cp++) { OK = TRUE; break; }
1124                }
1125              break;
1126    
1127              case PT_UCNC:
1128              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1129                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1130                   c >= 0xe000;
1131              break;
1132    
1133            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1134    
1135            default:            default:
# Line 981  for (;;) Line 1157  for (;;)
1157        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1158        if (clen > 0)        if (clen > 0)
1159          {          {
1160          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1161                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1162                NLBLOCK->nltype == NLTYPE_FIXED &&
1163                NLBLOCK->nllen == 2 &&
1164                c == NLBLOCK->nl[0])
1165              {
1166              could_continue = partial_newline = TRUE;
1167              }
1168            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1169              (c < 256 &&              (c < 256 &&
1170                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1171                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1004  for (;;) Line 1188  for (;;)
1188        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1189        if (clen > 0)        if (clen > 0)
1190          {          {
1191          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1192                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1193                NLBLOCK->nltype == NLTYPE_FIXED &&
1194                NLBLOCK->nllen == 2 &&
1195                c == NLBLOCK->nl[0])
1196              {
1197              could_continue = partial_newline = TRUE;
1198              }
1199            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1200              (c < 256 &&              (c < 256 &&
1201                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1202                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1026  for (;;) Line 1218  for (;;)
1218        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1219        if (clen > 0)        if (clen > 0)
1220          {          {
1221          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1222                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1223                NLBLOCK->nltype == NLTYPE_FIXED &&
1224                NLBLOCK->nllen == 2 &&
1225                c == NLBLOCK->nl[0])
1226              {
1227              could_continue = partial_newline = TRUE;
1228              }
1229            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1230              (c < 256 &&              (c < 256 &&
1231                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1232                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1046  for (;;) Line 1246  for (;;)
1246        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1247        if (clen > 0)        if (clen > 0)
1248          {          {
1249          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1250                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1251                NLBLOCK->nltype == NLTYPE_FIXED &&
1252                NLBLOCK->nllen == 2 &&
1253                c == NLBLOCK->nl[0])
1254              {
1255              could_continue = partial_newline = TRUE;
1256              }
1257            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1258              (c < 256 &&              (c < 256 &&
1259                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1260                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1261            {            {
1262            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1263              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1264            else            else
1265              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1266            }            }
# Line 1063  for (;;) Line 1271  for (;;)
1271        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1272        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1273        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1274        ADD_ACTIVE(state_offset + 4, 0);        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1275        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1276        if (clen > 0)        if (clen > 0)
1277          {          {
1278          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1279                (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1280                NLBLOCK->nltype == NLTYPE_FIXED &&
1281                NLBLOCK->nllen == 2 &&
1282                c == NLBLOCK->nl[0])
1283              {
1284              could_continue = partial_newline = TRUE;
1285              }
1286            else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1287              (c < 256 &&              (c < 256 &&
1288                (d != OP_ANY || !IS_NEWLINE(ptr)) &&                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1289                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
# Line 1077  for (;;) Line 1293  for (;;)
1293              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1294              next_active_state--;              next_active_state--;
1295              }              }
1296            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1297              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1298            else            else
1299              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1300            }            }
# Line 1100  for (;;) Line 1316  for (;;)
1316        if (clen > 0)        if (clen > 0)
1317          {          {
1318          BOOL OK;          BOOL OK;
1319            const pcre_uint32 *cp;
1320          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1321          switch(code[2])          switch(code[2])
1322            {            {
# Line 1108  for (;;) Line 1325  for (;;)
1325            break;            break;
1326    
1327            case PT_LAMP:            case PT_LAMP:
1328            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1329                prop->chartype == ucp_Lt;
1330            break;            break;
1331    
1332            case PT_GC:            case PT_GC:
1333            OK = _pcre_ucp_gentype[prop->chartype] == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1334            break;            break;
1335    
1336            case PT_PC:            case PT_PC:
# Line 1123  for (;;) Line 1341  for (;;)
1341            OK = prop->script == code[3];            OK = prop->script == code[3];
1342            break;            break;
1343    
1344              /* These are specials for combination cases. */
1345    
1346              case PT_ALNUM:
1347              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1348                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1349              break;
1350    
1351              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1352              which means that Perl space and POSIX space are now identical. PCRE
1353              was changed at release 8.34. */
1354    
1355              case PT_SPACE:    /* Perl space */
1356              case PT_PXSPACE:  /* POSIX space */
1357              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1358                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1359                   c == CHAR_FF || c == CHAR_CR;
1360              break;
1361    
1362              case PT_WORD:
1363              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1364                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1365                   c == CHAR_UNDERSCORE;
1366              break;
1367    
1368              case PT_CLIST:
1369              cp = PRIV(ucd_caseless_sets) + code[3];
1370              for (;;)
1371                {
1372                if (c < *cp) { OK = FALSE; break; }
1373                if (c == *cp++) { OK = TRUE; break; }
1374                }
1375              break;
1376    
1377              case PT_UCNC:
1378              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1379                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1380                   c >= 0xe000;
1381              break;
1382    
1383            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1384    
1385            default:            default:
# Line 1149  for (;;) Line 1406  for (;;)
1406        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1407        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1408        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1409        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1410          {          {
1411          const uschar *nptr = ptr + clen;          int lgb, rgb;
1412            const pcre_uchar *nptr = ptr + clen;
1413          int ncount = 0;          int ncount = 0;
1414          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1415            {            {
1416            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1417            next_active_state--;            next_active_state--;
1418            }            }
1419            lgb = UCD_GRAPHBREAK(c);
1420          while (nptr < end_subject)          while (nptr < end_subject)
1421            {            {
1422            int nd;            dlen = 1;
1423            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1424            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1425            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1426            ncount++;            ncount++;
1427            nptr += ndlen;            lgb = rgb;
1428              nptr += dlen;
1429            }            }
1430          count++;          count++;
1431          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
# Line 1184  for (;;) Line 1444  for (;;)
1444          int ncount = 0;          int ncount = 0;
1445          switch (c)          switch (c)
1446            {            {
1447            case 0x000b:            case CHAR_VT:
1448            case 0x000c:            case CHAR_FF:
1449            case 0x0085:            case CHAR_NEL:
1450    #ifndef EBCDIC
1451            case 0x2028:            case 0x2028:
1452            case 0x2029:            case 0x2029:
1453    #endif  /* Not EBCDIC */
1454            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1455            goto ANYNL01;            goto ANYNL01;
1456    
1457            case 0x000d:            case CHAR_CR:
1458            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1459            /* Fall through */            /* Fall through */
1460    
1461            ANYNL01:            ANYNL01:
1462            case 0x000a:            case CHAR_LF:
1463            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1464              {              {
1465              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1224  for (;;) Line 1486  for (;;)
1486          BOOL OK;          BOOL OK;
1487          switch (c)          switch (c)
1488            {            {
1489            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
1490            OK = TRUE;            OK = TRUE;
1491            break;            break;
1492    
# Line 1263  for (;;) Line 1519  for (;;)
1519          BOOL OK;          BOOL OK;
1520          switch (c)          switch (c)
1521            {            {
1522            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
1523            OK = TRUE;            OK = TRUE;
1524            break;            break;
1525    
# Line 1322  for (;;) Line 1560  for (;;)
1560        if (clen > 0)        if (clen > 0)
1561          {          {
1562          BOOL OK;          BOOL OK;
1563            const pcre_uint32 *cp;
1564          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1565          switch(code[2])          switch(code[2])
1566            {            {
# Line 1330  for (;;) Line 1569  for (;;)
1569            break;            break;
1570    
1571            case PT_LAMP:            case PT_LAMP:
1572            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1573                prop->chartype == ucp_Lt;
1574            break;            break;
1575    
1576            case PT_GC:            case PT_GC:
1577            OK = _pcre_ucp_gentype[prop->chartype] == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1578            break;            break;
1579    
1580            case PT_PC:            case PT_PC:
# Line 1345  for (;;) Line 1585  for (;;)
1585            OK = prop->script == code[3];            OK = prop->script == code[3];
1586            break;            break;
1587    
1588              /* These are specials for combination cases. */
1589    
1590              case PT_ALNUM:
1591              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1592                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1593              break;
1594    
1595              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1596              which means that Perl space and POSIX space are now identical. PCRE
1597              was changed at release 8.34. */
1598    
1599              case PT_SPACE:    /* Perl space */
1600              case PT_PXSPACE:  /* POSIX space */
1601              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1602                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1603                   c == CHAR_FF || c == CHAR_CR;
1604              break;
1605    
1606              case PT_WORD:
1607              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1608                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1609                   c == CHAR_UNDERSCORE;
1610              break;
1611    
1612              case PT_CLIST:
1613              cp = PRIV(ucd_caseless_sets) + code[3];
1614              for (;;)
1615                {
1616                if (c < *cp) { OK = FALSE; break; }
1617                if (c == *cp++) { OK = TRUE; break; }
1618                }
1619              break;
1620    
1621              case PT_UCNC:
1622              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1623                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1624                   c >= 0xe000;
1625              break;
1626    
1627            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1628    
1629            default:            default:
# Line 1380  for (;;) Line 1659  for (;;)
1659        QS2:        QS2:
1660    
1661        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1662        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1663          {          {
1664          const uschar *nptr = ptr + clen;          int lgb, rgb;
1665            const pcre_uchar *nptr = ptr + clen;
1666          int ncount = 0;          int ncount = 0;
1667          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1668              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
# Line 1390  for (;;) Line 1670  for (;;)
1670            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1671            next_active_state--;            next_active_state--;
1672            }            }
1673            lgb = UCD_GRAPHBREAK(c);
1674          while (nptr < end_subject)          while (nptr < end_subject)
1675            {            {
1676            int nd;            dlen = 1;
1677            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1678            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1679            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1680            ncount++;            ncount++;
1681            nptr += ndlen;            lgb = rgb;
1682              nptr += dlen;
1683            }            }
1684          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1685          }          }
# Line 1423  for (;;) Line 1705  for (;;)
1705          int ncount = 0;          int ncount = 0;
1706          switch (c)          switch (c)
1707            {            {
1708            case 0x000b:            case CHAR_VT:
1709            case 0x000c:            case CHAR_FF:
1710            case 0x0085:            case CHAR_NEL:
1711    #ifndef EBCDIC
1712            case 0x2028:            case 0x2028:
1713            case 0x2029:            case 0x2029:
1714    #endif  /* Not EBCDIC */
1715            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1716            goto ANYNL02;            goto ANYNL02;
1717    
1718            case 0x000d:            case CHAR_CR:
1719            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1720            /* Fall through */            /* Fall through */
1721    
1722            ANYNL02:            ANYNL02:
1723            case 0x000a:            case CHAR_LF:
1724            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1725                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1726              {              {
1727              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1728              next_active_state--;              next_active_state--;
1729              }              }
1730            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1731            break;            break;
1732    
1733            default:            default:
# Line 1471  for (;;) Line 1755  for (;;)
1755          BOOL OK;          BOOL OK;
1756          switch (c)          switch (c)
1757            {            {
1758            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
1759            OK = TRUE;            OK = TRUE;
1760            break;            break;
1761    
# Line 1493  for (;;) Line 1771  for (;;)
1771              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1772              next_active_state--;              next_active_state--;
1773              }              }
1774            ADD_NEW_DATA(-(state_offset + count), 0, 0);            ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1775            }            }
1776          }          }
1777        break;        break;
# Line 1517  for (;;) Line 1795  for (;;)
1795          BOOL OK;          BOOL OK;
1796          switch (c)          switch (c)
1797            {            {
1798            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
1799            OK = TRUE;            OK = TRUE;
1800            break;            break;
1801    
# Line 1552  for (;;) Line 1812  for (;;)
1812              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1813              next_active_state--;              next_active_state--;
1814              }              }
1815            ADD_NEW_DATA(-(state_offset + count), 0, 0);            ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1816            }            }
1817          }          }
1818        break;        break;
# Line 1564  for (;;) Line 1824  for (;;)
1824        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1825        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1826        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1827          { ADD_ACTIVE(state_offset + 6, 0); }          { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1828        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1829        if (clen > 0)        if (clen > 0)
1830          {          {
1831          BOOL OK;          BOOL OK;
1832            const pcre_uint32 *cp;
1833          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1834          switch(code[4])          switch(code[1 + IMM2_SIZE + 1])
1835            {            {
1836            case PT_ANY:            case PT_ANY:
1837            OK = TRUE;            OK = TRUE;
1838            break;            break;
1839    
1840            case PT_LAMP:            case PT_LAMP:
1841            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1842                prop->chartype == ucp_Lt;
1843            break;            break;
1844    
1845            case PT_GC:            case PT_GC:
1846            OK = _pcre_ucp_gentype[prop->chartype] == code[5];            OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1847            break;            break;
1848    
1849            case PT_PC:            case PT_PC:
1850            OK = prop->chartype == code[5];            OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1851            break;            break;
1852    
1853            case PT_SC:            case PT_SC:
1854            OK = prop->script == code[5];            OK = prop->script == code[1 + IMM2_SIZE + 2];
1855              break;
1856    
1857              /* These are specials for combination cases. */
1858    
1859              case PT_ALNUM:
1860              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1861                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1862              break;
1863    
1864              /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1865              which means that Perl space and POSIX space are now identical. PCRE
1866              was changed at release 8.34. */
1867    
1868              case PT_SPACE:    /* Perl space */
1869              case PT_PXSPACE:  /* POSIX space */
1870              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1871                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1872                   c == CHAR_FF || c == CHAR_CR;
1873              break;
1874    
1875              case PT_WORD:
1876              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1877                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1878                   c == CHAR_UNDERSCORE;
1879              break;
1880    
1881              case PT_CLIST:
1882              cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1883              for (;;)
1884                {
1885                if (c < *cp) { OK = FALSE; break; }
1886                if (c == *cp++) { OK = TRUE; break; }
1887                }
1888              break;
1889    
1890              case PT_UCNC:
1891              OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1892                   c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1893                   c >= 0xe000;
1894            break;            break;
1895    
1896            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1606  for (;;) Line 1907  for (;;)
1907              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1908              next_active_state--;              next_active_state--;
1909              }              }
1910            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1911              { ADD_NEW(state_offset + 6, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1912            else            else
1913              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1914            }            }
# Line 1620  for (;;) Line 1921  for (;;)
1921        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1922        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1923        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1924          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1925        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1926        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
1927          {          {
1928          const uschar *nptr = ptr + clen;          int lgb, rgb;
1929            const pcre_uchar *nptr = ptr + clen;
1930          int ncount = 0;          int ncount = 0;
1931          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1932            {            {
1933            active_count--;           /* Remove non-match possibility */            active_count--;           /* Remove non-match possibility */
1934            next_active_state--;            next_active_state--;
1935            }            }
1936            lgb = UCD_GRAPHBREAK(c);
1937          while (nptr < end_subject)          while (nptr < end_subject)
1938            {            {
1939            int nd;            dlen = 1;
1940            int ndlen = 1;            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1941            GETCHARLEN(nd, nptr, ndlen);            rgb = UCD_GRAPHBREAK(d);
1942            if (UCD_CATEGORY(nd) != ucp_M) break;            if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1943            ncount++;            ncount++;
1944            nptr += ndlen;            lgb = rgb;
1945              nptr += dlen;
1946            }            }
1947          if (++count >= GET2(code, 1))          if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1948            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              reset_could_continue = TRUE;
1949            if (++count >= (int)GET2(code, 1))
1950              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1951          else          else
1952            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1953          }          }
# Line 1654  for (;;) Line 1960  for (;;)
1960        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1961        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1962        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1963          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1964        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1965        if (clen > 0)        if (clen > 0)
1966          {          {
1967          int ncount = 0;          int ncount = 0;
1968          switch (c)          switch (c)
1969            {            {
1970            case 0x000b:            case CHAR_VT:
1971            case 0x000c:            case CHAR_FF:
1972            case 0x0085:            case CHAR_NEL:
1973    #ifndef EBCDIC
1974            case 0x2028:            case 0x2028:
1975            case 0x2029:            case 0x2029:
1976    #endif  /* Not EBCDIC */
1977            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1978            goto ANYNL03;            goto ANYNL03;
1979    
1980            case 0x000d:            case CHAR_CR:
1981            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1982            /* Fall through */            /* Fall through */
1983    
1984            ANYNL03:            ANYNL03:
1985            case 0x000a:            case CHAR_LF:
1986            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1987              {              {
1988              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1989              next_active_state--;              next_active_state--;
1990              }              }
1991            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
1992              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1993            else            else
1994              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, ncount); }
1995            break;            break;
# Line 1698  for (;;) Line 2006  for (;;)
2006        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2007        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2008        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2009          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2010        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2011        if (clen > 0)        if (clen > 0)
2012          {          {
2013          BOOL OK;          BOOL OK;
2014          switch (c)          switch (c)
2015            {            {
2016            case 0x000a:            VSPACE_CASES:
           case 0x000b:  
           case 0x000c:  
           case 0x000d:  
           case 0x0085:  
           case 0x2028:  
           case 0x2029:  
2017            OK = TRUE;            OK = TRUE;
2018            break;            break;
2019    
# Line 1726  for (;;) Line 2028  for (;;)
2028              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
2029              next_active_state--;              next_active_state--;
2030              }              }
2031            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
2032              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2033            else            else
2034              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
2035            }            }
# Line 1740  for (;;) Line 2042  for (;;)
2042        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2043        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2044        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2045          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2046        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2047        if (clen > 0)        if (clen > 0)
2048          {          {
2049          BOOL OK;          BOOL OK;
2050          switch (c)          switch (c)
2051            {            {
2052            case 0x09:      /* HT */            HSPACE_CASES:
           case 0x20:      /* SPACE */  
           case 0xa0:      /* NBSP */  
           case 0x1680:    /* OGHAM SPACE MARK */  
           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
           case 0x2000:    /* EN QUAD */  
           case 0x2001:    /* EM QUAD */  
           case 0x2002:    /* EN SPACE */  
           case 0x2003:    /* EM SPACE */  
           case 0x2004:    /* THREE-PER-EM SPACE */  
           case 0x2005:    /* FOUR-PER-EM SPACE */  
           case 0x2006:    /* SIX-PER-EM SPACE */  
           case 0x2007:    /* FIGURE SPACE */  
           case 0x2008:    /* PUNCTUATION SPACE */  
           case 0x2009:    /* THIN SPACE */  
           case 0x200A:    /* HAIR SPACE */  
           case 0x202f:    /* NARROW NO-BREAK SPACE */  
           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
           case 0x3000:    /* IDEOGRAPHIC SPACE */  
2053            OK = TRUE;            OK = TRUE;
2054            break;            break;
2055    
# Line 1781  for (;;) Line 2065  for (;;)
2065              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
2066              next_active_state--;              next_active_state--;
2067              }              }
2068            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
2069              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2070            else            else
2071              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
2072            }            }
# Line 1801  for (;;) Line 2085  for (;;)
2085        break;        break;
2086    
2087        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2088        case OP_CHARNC:        case OP_CHARI:
2089        if (clen == 0) break;        if (clen == 0) break;
2090    
2091  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2092        if (utf8)        if (utf)
2093          {          {
2094          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2095            {            {
2096            unsigned int othercase;            unsigned int othercase;
2097            if (c < 128) othercase = fcc[c]; else            if (c < 128)
2098                othercase = fcc[c];
2099            /* If we have Unicode property support, we can use it to test the            else
2100            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2101                other case of the character. */
2102  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2103            othercase = UCD_OTHERCASE(c);              othercase = UCD_OTHERCASE(c);
2104  #else  #else
2105            othercase = NOTACHAR;              othercase = NOTACHAR;
2106  #endif  #endif
2107    
2108            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2109            }            }
2110          }          }
2111        else        else
2112  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2113          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2114          {          {
2115          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2116              { ADD_NEW(state_offset + 2, 0); }
2117          }          }
2118        break;        break;
2119    
# Line 1841  for (;;) Line 2125  for (;;)
2125        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2126    
2127        case OP_EXTUNI:        case OP_EXTUNI:
2128        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0)
2129          {          {
2130          const uschar *nptr = ptr + clen;          int lgb, rgb;
2131            const pcre_uchar *nptr = ptr + clen;
2132          int ncount = 0;          int ncount = 0;
2133            lgb = UCD_GRAPHBREAK(c);
2134          while (nptr < end_subject)          while (nptr < end_subject)
2135            {            {
2136            int nclen = 1;            dlen = 1;
2137            GETCHARLEN(c, nptr, nclen);            if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2138            if (UCD_CATEGORY(c) != ucp_M) break;            rgb = UCD_GRAPHBREAK(d);
2139              if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2140            ncount++;            ncount++;
2141            nptr += nclen;            lgb = rgb;
2142              nptr += dlen;
2143            }            }
2144            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2145                reset_could_continue = TRUE;
2146          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2147          }          }
2148        break;        break;
# Line 1866  for (;;) Line 2156  for (;;)
2156        case OP_ANYNL:        case OP_ANYNL:
2157        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2158          {          {
2159          case 0x000b:          case CHAR_VT:
2160          case 0x000c:          case CHAR_FF:
2161          case 0x0085:          case CHAR_NEL:
2162    #ifndef EBCDIC
2163          case 0x2028:          case 0x2028:
2164          case 0x2029:          case 0x2029:
2165    #endif  /* Not EBCDIC */
2166          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2167    
2168          case 0x000a:          case CHAR_LF:
2169          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2170          break;          break;
2171    
2172          case 0x000d:          case CHAR_CR:
2173          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 >= end_subject)
2174              {
2175              ADD_NEW(state_offset + 1, 0);
2176              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2177                reset_could_continue = TRUE;
2178              }
2179            else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2180            {            {
2181            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2182            }            }
# Line 1894  for (;;) Line 2192  for (;;)
2192        case OP_NOT_VSPACE:        case OP_NOT_VSPACE:
2193        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2194          {          {
2195          case 0x000a:          VSPACE_CASES:
         case 0x000b:  
         case 0x000c:  
         case 0x000d:  
         case 0x0085:  
         case 0x2028:  
         case 0x2029:  
2196          break;          break;
2197    
2198          default:          default:
# Line 1913  for (;;) Line 2205  for (;;)
2205        case OP_VSPACE:        case OP_VSPACE:
2206        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2207          {          {
2208          case 0x000a:          VSPACE_CASES:
         case 0x000b:  
         case 0x000c:  
         case 0x000d:  
         case 0x0085:  
         case 0x2028:  
         case 0x2029:  
2209          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2210          break;          break;
2211    
2212          default: break;          default:
2213            break;
2214          }          }
2215        break;        break;
2216    
# Line 1931  for (;;) Line 2218  for (;;)
2218        case OP_NOT_HSPACE:        case OP_NOT_HSPACE:
2219        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2220          {          {
2221          case 0x09:      /* HT */          HSPACE_CASES:
         case 0x20:      /* SPACE */  
         case 0xa0:      /* NBSP */  
         case 0x1680:    /* OGHAM SPACE MARK */  
         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
         case 0x2000:    /* EN QUAD */  
         case 0x2001:    /* EM QUAD */  
         case 0x2002:    /* EN SPACE */  
         case 0x2003:    /* EM SPACE */  
         case 0x2004:    /* THREE-PER-EM SPACE */  
         case 0x2005:    /* FOUR-PER-EM SPACE */  
         case 0x2006:    /* SIX-PER-EM SPACE */  
         case 0x2007:    /* FIGURE SPACE */  
         case 0x2008:    /* PUNCTUATION SPACE */  
         case 0x2009:    /* THIN SPACE */  
         case 0x200A:    /* HAIR SPACE */  
         case 0x202f:    /* NARROW NO-BREAK SPACE */  
         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
         case 0x3000:    /* IDEOGRAPHIC SPACE */  
2222          break;          break;
2223    
2224          default:          default:
# Line 1962  for (;;) Line 2231  for (;;)
2231        case OP_HSPACE:        case OP_HSPACE:
2232        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2233          {          {
2234          case 0x09:      /* HT */          HSPACE_CASES:
         case 0x20:      /* SPACE */  
         case 0xa0:      /* NBSP */  
         case 0x1680:    /* OGHAM SPACE MARK */  
         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */  
         case 0x2000:    /* EN QUAD */  
         case 0x2001:    /* EM QUAD */  
         case 0x2002:    /* EN SPACE */  
         case 0x2003:    /* EM SPACE */  
         case 0x2004:    /* THREE-PER-EM SPACE */  
         case 0x2005:    /* FOUR-PER-EM SPACE */  
         case 0x2006:    /* SIX-PER-EM SPACE */  
         case 0x2007:    /* FIGURE SPACE */  
         case 0x2008:    /* PUNCTUATION SPACE */  
         case 0x2009:    /* THIN SPACE */  
         case 0x200A:    /* HAIR SPACE */  
         case 0x202f:    /* NARROW NO-BREAK SPACE */  
         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */  
         case 0x3000:    /* IDEOGRAPHIC SPACE */  
2235          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2236          break;          break;
2237    
2238            default:
2239            break;
2240          }          }
2241        break;        break;
2242    
2243        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2244        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character casefully. */
       characters, that is, we know that d < 256. The character we are  
       checking (c) can be multibyte. */  
2245    
2246        case OP_NOT:        case OP_NOT:
2247          if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2248          break;
2249    
2250          /*-----------------------------------------------------------------*/
2251          /* Match a negated single character caselessly. */
2252    
2253          case OP_NOTI:
2254        if (clen > 0)        if (clen > 0)
2255          {          {
2256          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;          unsigned int otherd;
2257          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }  #ifdef SUPPORT_UTF
2258            if (utf && d >= 128)
2259              {
2260    #ifdef SUPPORT_UCP
2261              otherd = UCD_OTHERCASE(d);
2262    #endif  /* SUPPORT_UCP */
2263              }
2264            else
2265    #endif  /* SUPPORT_UTF */
2266            otherd = TABLE_GET(d, fcc, d);
2267            if (c != d && c != otherd)
2268              { ADD_NEW(state_offset + dlen + 1, 0); }
2269          }          }
2270        break;        break;
2271    
2272        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2273          case OP_PLUSI:
2274          case OP_MINPLUSI:
2275          case OP_POSPLUSI:
2276          case OP_NOTPLUSI:
2277          case OP_NOTMINPLUSI:
2278          case OP_NOTPOSPLUSI:
2279          caseless = TRUE;
2280          codevalue -= OP_STARI - OP_STAR;
2281    
2282          /* Fall through */
2283        case OP_PLUS:        case OP_PLUS:
2284        case OP_MINPLUS:        case OP_MINPLUS:
2285        case OP_POSPLUS:        case OP_POSPLUS:
# Line 2010  for (;;) Line 2290  for (;;)
2290        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2291        if (clen > 0)        if (clen > 0)
2292          {          {
2293          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2294          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2295            {            {
2296  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2297            if (utf8 && d >= 128)            if (utf && d >= 128)
2298              {              {
2299  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2300              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2301  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2302              }              }
2303            else            else
2304  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2305            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2306            }            }
2307          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2308            {            {
# Line 2039  for (;;) Line 2319  for (;;)
2319        break;        break;
2320    
2321        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2322          case OP_QUERYI:
2323          case OP_MINQUERYI:
2324          case OP_POSQUERYI:
2325          case OP_NOTQUERYI:
2326          case OP_NOTMINQUERYI:
2327          case OP_NOTPOSQUERYI:
2328          caseless = TRUE;
2329          codevalue -= OP_STARI - OP_STAR;
2330          /* Fall through */
2331        case OP_QUERY:        case OP_QUERY:
2332        case OP_MINQUERY:        case OP_MINQUERY:
2333        case OP_POSQUERY:        case OP_POSQUERY:
# Line 2048  for (;;) Line 2337  for (;;)
2337        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2338        if (clen > 0)        if (clen > 0)
2339          {          {
2340          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2341          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2342            {            {
2343  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2344            if (utf8 && d >= 128)            if (utf && d >= 128)
2345              {              {
2346  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2347              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2348  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2349              }              }
2350            else            else
2351  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2352            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2353            }            }
2354          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2355            {            {
# Line 2075  for (;;) Line 2364  for (;;)
2364        break;        break;
2365    
2366        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2367          case OP_STARI:
2368          case OP_MINSTARI:
2369          case OP_POSSTARI:
2370          case OP_NOTSTARI:
2371          case OP_NOTMINSTARI:
2372          case OP_NOTPOSSTARI:
2373          caseless = TRUE;
2374          codevalue -= OP_STARI - OP_STAR;
2375          /* Fall through */
2376        case OP_STAR:        case OP_STAR:
2377        case OP_MINSTAR:        case OP_MINSTAR:
2378        case OP_POSSTAR:        case OP_POSSTAR:
# Line 2084  for (;;) Line 2382  for (;;)
2382        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
2383        if (clen > 0)        if (clen > 0)
2384          {          {
2385          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2386          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2387            {            {
2388  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2389            if (utf8 && d >= 128)            if (utf && d >= 128)
2390              {              {
2391  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2392              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2393  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2394              }              }
2395            else            else
2396  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2397            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2398            }            }
2399          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2400            {            {
# Line 2111  for (;;) Line 2409  for (;;)
2409        break;        break;
2410    
2411        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2412          case OP_EXACTI:
2413          case OP_NOTEXACTI:
2414          caseless = TRUE;
2415          codevalue -= OP_STARI - OP_STAR;
2416          /* Fall through */
2417        case OP_EXACT:        case OP_EXACT:
2418        case OP_NOTEXACT:        case OP_NOTEXACT:
2419        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2420        if (clen > 0)        if (clen > 0)
2421          {          {
2422          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2423          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2424            {            {
2425  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2426            if (utf8 && d >= 128)            if (utf && d >= 128)
2427              {              {
2428  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2429              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2430  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2431              }              }
2432            else            else
2433  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2434            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2435            }            }
2436          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2437            {            {
2438            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
2439              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2440            else            else
2441              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2442            }            }
# Line 2141  for (;;) Line 2444  for (;;)
2444        break;        break;
2445    
2446        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2447          case OP_UPTOI:
2448          case OP_MINUPTOI:
2449          case OP_POSUPTOI:
2450          case OP_NOTUPTOI:
2451          case OP_NOTMINUPTOI:
2452          case OP_NOTPOSUPTOI:
2453          caseless = TRUE;
2454          codevalue -= OP_STARI - OP_STAR;
2455          /* Fall through */
2456        case OP_UPTO:        case OP_UPTO:
2457        case OP_MINUPTO:        case OP_MINUPTO:
2458        case OP_POSUPTO:        case OP_POSUPTO:
2459        case OP_NOTUPTO:        case OP_NOTUPTO:
2460        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2461        case OP_NOTPOSUPTO:        case OP_NOTPOSUPTO:
2462        ADD_ACTIVE(state_offset + dlen + 3, 0);        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2463        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2464        if (clen > 0)        if (clen > 0)
2465          {          {
2466          unsigned int otherd = NOTACHAR;          pcre_uint32 otherd = NOTACHAR;
2467          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2468            {            {
2469  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2470            if (utf8 && d >= 128)            if (utf && d >= 128)
2471              {              {
2472  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2473              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2474  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2475              }              }
2476            else            else
2477  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2478            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2479            }            }
2480          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2481            {            {
# Line 2172  for (;;) Line 2484  for (;;)
2484              active_count--;             /* Remove non-match possibility */              active_count--;             /* Remove non-match possibility */
2485              next_active_state--;              next_active_state--;
2486              }              }
2487            if (++count >= GET2(code, 1))            if (++count >= (int)GET2(code, 1))
2488              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2489            else            else
2490              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2491            }            }
# Line 2190  for (;;) Line 2502  for (;;)
2502          {          {
2503          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2504          int next_state_offset;          int next_state_offset;
2505          const uschar *ecode;          const pcre_uchar *ecode;
2506    
2507          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2508          can set isinclass from it. */          can set isinclass from it. */
2509    
2510          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2511            {            {
2512            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2513            if (clen > 0)            if (clen > 0)
2514              {              {
2515              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2516                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2517              }              }
2518            }            }
2519    
# Line 2212  for (;;) Line 2524  for (;;)
2524          else          else
2525           {           {
2526           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2527           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2528           }           }
2529    
2530          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2531          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2532          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2533    
2534          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2535    
2536          switch (*ecode)          switch (*ecode)
2537            {            {
# Line 2245  for (;;) Line 2557  for (;;)
2557            case OP_CRRANGE:            case OP_CRRANGE:
2558            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2559            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2560            if (count >= GET2(ecode, 1))            if (count >= (int)GET2(ecode, 1))
2561              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2562            if (isinclass)            if (isinclass)
2563              {              {
2564              int max = GET2(ecode, 3);              int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2565              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2566                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2567              else              else
2568                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2569              }              }
# Line 2282  for (;;) Line 2594  for (;;)
2594          int rc;          int rc;
2595          int local_offsets[2];          int local_offsets[2];
2596          int local_workspace[1000];          int local_workspace[1000];
2597          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2598    
2599          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2600    
# Line 2290  for (;;) Line 2602  for (;;)
2602            md,                                   /* static match data */            md,                                   /* static match data */
2603            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2604            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2605            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2606            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2607            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2608            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2609            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2610            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
2611            rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
   
2612          if (rc == PCRE_ERROR_DFA_UITEM) return rc;          if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2613          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2614              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2615          }          }
2616        break;        break;
2617    
# Line 2321  for (;;) Line 2631  for (;;)
2631          if (code[LINK_SIZE+1] == OP_CALLOUT)          if (code[LINK_SIZE+1] == OP_CALLOUT)
2632            {            {
2633            rrc = 0;            rrc = 0;
2634            if (pcre_callout != NULL)            if (PUBL(callout) != NULL)
2635              {              {
2636              pcre_callout_block cb;              PUBL(callout_block) cb;
2637              cb.version          = 1;   /* Version 1 of the callout block */              cb.version          = 1;   /* Version 1 of the callout block */
2638              cb.callout_number   = code[LINK_SIZE+2];              cb.callout_number   = code[LINK_SIZE+2];
2639              cb.offset_vector    = offsets;              cb.offset_vector    = offsets;
2640    #if defined COMPILE_PCRE8
2641              cb.subject          = (PCRE_SPTR)start_subject;              cb.subject          = (PCRE_SPTR)start_subject;
2642              cb.subject_length   = end_subject - start_subject;  #elif defined COMPILE_PCRE16
2643              cb.start_match      = current_subject - start_subject;              cb.subject          = (PCRE_SPTR16)start_subject;
2644              cb.current_position = ptr - start_subject;  #elif defined COMPILE_PCRE32
2645                cb.subject          = (PCRE_SPTR32)start_subject;
2646    #endif
2647                cb.subject_length   = (int)(end_subject - start_subject);
2648                cb.start_match      = (int)(current_subject - start_subject);
2649                cb.current_position = (int)(ptr - start_subject);
2650              cb.pattern_position = GET(code, LINK_SIZE + 3);              cb.pattern_position = GET(code, LINK_SIZE + 3);
2651              cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);              cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2652              cb.capture_top      = 1;              cb.capture_top      = 1;
2653              cb.capture_last     = -1;              cb.capture_last     = -1;
2654              cb.callout_data     = md->callout_data;              cb.callout_data     = md->callout_data;
2655              if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */              cb.mark             = NULL;   /* No (*MARK) support */
2656                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2657              }              }
2658            if (rrc > 0) break;                      /* Fail this thread */            if (rrc > 0) break;                      /* Fail this thread */
2659            code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */            code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2660            }            }
2661    
2662          condcode = code[LINK_SIZE+1];          condcode = code[LINK_SIZE+1];
2663    
2664          /* Back reference conditions are not supported */          /* Back reference conditions and duplicate named recursion conditions
2665            are not supported */
2666    
2667          if (condcode == OP_CREF || condcode == OP_NCREF)          if (condcode == OP_CREF || condcode == OP_DNCREF ||
2668                condcode == OP_DNRREF)
2669            return PCRE_ERROR_DFA_UCOND;            return PCRE_ERROR_DFA_UCOND;
2670    
2671          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
# Line 2358  for (;;) Line 2677  for (;;)
2677          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2678          recursed groups. */          recursed groups. */
2679    
2680          else if (condcode == OP_RREF || condcode == OP_NRREF)          else if (condcode == OP_RREF)
2681            {            {
2682            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE + 2);
2683            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2684            if (recursing > 0)            if (md->recursive != NULL)
2685              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2686            else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }            else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2687            }            }
2688    
# Line 2372  for (;;) Line 2691  for (;;)
2691          else          else
2692            {            {
2693            int rc;            int rc;
2694            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2695            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2696    
2697            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2698    
# Line 2381  for (;;) Line 2700  for (;;)
2700              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2701              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2702              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2703              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2704              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2705              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2706              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2707              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2708              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2709    
2710            if (rc == PCRE_ERROR_DFA_UITEM) return rc;            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2711            if ((rc >= 0) ==            if ((rc >= 0) ==
2712                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2713              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2714            else            else
2715              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2716            }            }
# Line 2403  for (;;) Line 2720  for (;;)
2720        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2721        case OP_RECURSE:        case OP_RECURSE:
2722          {          {
2723            dfa_recursion_info *ri;
2724          int local_offsets[1000];          int local_offsets[1000];
2725          int local_workspace[1000];          int local_workspace[1000];
2726            const pcre_uchar *callpat = start_code + GET(code, 1);
2727            int recno = (callpat == md->start_code)? 0 :
2728              GET2(callpat, 1 + LINK_SIZE);
2729          int rc;          int rc;
2730    
2731          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2732            recursing + 1));  
2733            /* Check for repeating a recursion without advancing the subject
2734            pointer. This should catch convoluted mutual recursions. (Some simple
2735            cases are caught at compile time.) */
2736    
2737            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2738              if (recno == ri->group_num && ptr == ri->subject_position)
2739                return PCRE_ERROR_RECURSELOOP;
2740    
2741            /* Remember this recursion and where we started it so as to
2742            catch infinite loops. */
2743    
2744            new_recursive.group_num = recno;
2745            new_recursive.subject_position = ptr;
2746            new_recursive.prevrec = md->recursive;
2747            md->recursive = &new_recursive;
2748    
2749          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2750            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2751            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2752            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2753            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2754            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2755            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2756            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2757            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2758            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing + 1);                       /* regex recurse level */  
2759    
2760          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2761            recursing + 1, rc));  
2762            DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2763              rc));
2764    
2765          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2766    
# Line 2438  for (;;) Line 2774  for (;;)
2774            {            {
2775            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2776              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2777              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2778              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2779                if (utf)
2780                  {
2781                  const pcre_uchar *p = start_subject + local_offsets[rc];
2782                  const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2783                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2784                  }
2785    #endif
2786              if (charcount > 0)              if (charcount > 0)
2787                {                {
2788                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 2457  for (;;) Line 2798  for (;;)
2798        break;        break;
2799    
2800        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2801          case OP_BRAPOS:
2802          case OP_SBRAPOS:
2803          case OP_CBRAPOS:
2804          case OP_SCBRAPOS:
2805          case OP_BRAPOSZERO:
2806            {
2807            int charcount, matched_count;
2808            const pcre_uchar *local_ptr = ptr;
2809            BOOL allow_zero;
2810    
2811            if (codevalue == OP_BRAPOSZERO)
2812              {
2813              allow_zero = TRUE;
2814              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2815              }
2816            else allow_zero = FALSE;
2817    
2818            /* Loop to match the subpattern as many times as possible as if it were
2819            a complete pattern. */
2820    
2821            for (matched_count = 0;; matched_count++)
2822              {
2823              int local_offsets[2];
2824              int local_workspace[1000];
2825    
2826              int rc = internal_dfa_exec(
2827                md,                                   /* fixed match data */
2828                code,                                 /* this subexpression's code */
2829                local_ptr,                            /* where we currently are */
2830                (int)(ptr - start_subject),           /* start offset */
2831                local_offsets,                        /* offset vector */
2832                sizeof(local_offsets)/sizeof(int),    /* size of same */
2833                local_workspace,                      /* workspace vector */
2834                sizeof(local_workspace)/sizeof(int),  /* size of same */
2835                rlevel);                              /* function recursion level */
2836    
2837              /* Failed to match */
2838    
2839              if (rc < 0)
2840                {
2841                if (rc != PCRE_ERROR_NOMATCH) return rc;
2842                break;
2843                }
2844    
2845              /* Matched: break the loop if zero characters matched. */
2846    
2847              charcount = local_offsets[1] - local_offsets[0];
2848              if (charcount == 0) break;
2849              local_ptr += charcount;    /* Advance temporary position ptr */
2850              }
2851    
2852            /* At this point we have matched the subpattern matched_count
2853            times, and local_ptr is pointing to the character after the end of the
2854            last match. */
2855    
2856            if (matched_count > 0 || allow_zero)
2857              {
2858              const pcre_uchar *end_subpattern = code;
2859              int next_state_offset;
2860    
2861              do { end_subpattern += GET(end_subpattern, 1); }
2862                while (*end_subpattern == OP_ALT);
2863              next_state_offset =
2864                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2865    
2866              /* Optimization: if there are no more active states, and there
2867              are no new states yet set up, then skip over the subject string
2868              right here, to save looping. Otherwise, set up the new state to swing
2869              into action when the end of the matched substring is reached. */
2870    
2871              if (i + 1 >= active_count && new_count == 0)
2872                {
2873                ptr = local_ptr;
2874                clen = 0;
2875                ADD_NEW(next_state_offset, 0);
2876                }
2877              else
2878                {
2879                const pcre_uchar *p = ptr;
2880                const pcre_uchar *pp = local_ptr;
2881                charcount = (int)(pp - p);
2882    #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2883                if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2884    #endif
2885                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2886                }
2887              }
2888            }
2889          break;
2890    
2891          /*-----------------------------------------------------------------*/
2892        case OP_ONCE:        case OP_ONCE:
2893          case OP_ONCE_NC:
2894          {          {
2895          int local_offsets[2];          int local_offsets[2];
2896          int local_workspace[1000];          int local_workspace[1000];
# Line 2466  for (;;) Line 2899  for (;;)
2899            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2900            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2901            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2902            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2903            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2904            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2905            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2906            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2907            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2908    
2909          if (rc >= 0)          if (rc >= 0)
2910            {            {
2911            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2912            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2913            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2914    
2915            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2916              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2917            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2918                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2919    
2920            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2921            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 2491  for (;;) Line 2923  for (;;)
2923    
2924            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2925                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2926              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2927    
2928            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2929            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 2506  for (;;) Line 2938  for (;;)
2938            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2939            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2940            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2941            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2942    
2943            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2944              {              {
# Line 2529  for (;;) Line 2961  for (;;)
2961              }              }
2962            else            else
2963              {              {
2964              const uschar *p = start_subject + local_offsets[0];  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2965              const uschar *pp = start_subject + local_offsets[1];              if (utf)
2966              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;                {
2967                  const pcre_uchar *p = start_subject + local_offsets[0];
2968                  const pcre_uchar *pp = start_subject + local_offsets[1];
2969                  while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2970                  }
2971    #endif
2972              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2973              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2974                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2975              }              }
   
2976            }            }
2977          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2978          }          }
# Line 2548  for (;;) Line 2984  for (;;)
2984    
2985        case OP_CALLOUT:        case OP_CALLOUT:
2986        rrc = 0;        rrc = 0;
2987        if (pcre_callout != NULL)        if (PUBL(callout) != NULL)
2988          {          {
2989          pcre_callout_block cb;          PUBL(callout_block) cb;
2990          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2991          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2992          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2993    #if defined COMPILE_PCRE8
2994          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2995          cb.subject_length   = end_subject - start_subject;  #elif defined COMPILE_PCRE16
2996          cb.start_match      = current_subject - start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
2997          cb.current_position = ptr - start_subject;  #elif defined COMPILE_PCRE32
2998            cb.subject          = (PCRE_SPTR32)start_subject;
2999    #endif
3000            cb.subject_length   = (int)(end_subject - start_subject);
3001            cb.start_match      = (int)(current_subject - start_subject);
3002            cb.current_position = (int)(ptr - start_subject);
3003          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
3004          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
3005          cb.capture_top      = 1;          cb.capture_top      = 1;
3006          cb.capture_last     = -1;          cb.capture_last     = -1;
3007          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
3008          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
3009            if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3010          }          }
3011        if (rrc == 0)        if (rrc == 0)
3012          { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }          { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3013        break;        break;
3014    
3015    
# Line 2595  for (;;) Line 3038  for (;;)
3038    if (new_count <= 0)    if (new_count <= 0)
3039      {      {
3040      if (rlevel == 1 &&                               /* Top level, and */      if (rlevel == 1 &&                               /* Top level, and */
3041          could_continue &&                            /* Some could go on */          could_continue &&                            /* Some could go on, and */
3042          forced_fail != workspace[1] &&               /* Not all forced fail & */          forced_fail != workspace[1] &&               /* Not all forced fail & */
3043          (                                            /* either... */          (                                            /* either... */
3044          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
# Line 2603  for (;;) Line 3046  for (;;)
3046          ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */          ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3047           match_count < 0)                            /* no matches */           match_count < 0)                            /* no matches */
3048          ) &&                                         /* And... */          ) &&                                         /* And... */
3049          ptr >= end_subject &&                     /* Reached end of subject */          (
3050          ptr > current_subject)                    /* Matched non-empty string */          partial_newline ||                           /* Either partial NL */
3051        {            (                                          /* or ... */
3052        if (offsetcount >= 2)            ptr >= end_subject &&                /* End of subject and */
3053          {            ptr > md->start_used_ptr)            /* Inspected non-empty string */
3054          offsets[0] = md->start_used_ptr - start_subject;            )
3055          offsets[1] = end_subject - start_subject;          )
         }  
3056        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
       }  
   
3057      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3058        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3059        rlevel*2-2, SP));        rlevel*2-2, SP));
# Line 2663  Returns:          > 0 => number of match Line 3103  Returns:          > 0 => number of match
3103                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3104  */  */
3105    
3106    #if defined COMPILE_PCRE8
3107  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3108  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3109    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3110    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3111    #elif defined COMPILE_PCRE16
3112    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3113    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3114      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3115      int offsetcount, int *workspace, int wscount)
3116    #elif defined COMPILE_PCRE32
3117    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3118    pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3119      PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3120      int offsetcount, int *workspace, int wscount)
3121    #endif
3122  {  {
3123  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3124  dfa_match_data match_block;  dfa_match_data match_block;
3125  dfa_match_data *md = &match_block;  dfa_match_data *md = &match_block;
3126  BOOL utf8, anchored, startline, firstline;  BOOL utf, anchored, startline, firstline;
3127  const uschar *current_subject, *end_subject, *lcc;  const pcre_uchar *current_subject, *end_subject;
   
 pcre_study_data internal_study;  
3128  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3129    
3130  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3131  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3132  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3133  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3134  int first_byte = -1;  pcre_uchar first_char = 0;
3135  int req_byte = -1;  pcre_uchar first_char2 = 0;
3136  int req_byte2 = -1;  pcre_uchar req_char = 0;
3137    pcre_uchar req_char2 = 0;
3138  int newline;  int newline;
3139    
3140  /* Plausibility checks */  /* Plausibility checks */
# Line 2694  if (re == NULL || subject == NULL || wor Line 3144  if (re == NULL || subject == NULL || wor
3144     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3145  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3146  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3147    if (length < 0) return PCRE_ERROR_BADLENGTH;
3148    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3149    
3150    /* Check that the first field in the block is the magic number. If it is not,
3151    return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3152    REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3153    means that the pattern is likely compiled with different endianness. */
3154    
3155    if (re->magic_number != MAGIC_NUMBER)
3156      return re->magic_number == REVERSED_MAGIC_NUMBER?
3157        PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3158    if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3159    
3160    /* If restarting after a partial match, do some sanity checks on the contents
3161    of the workspace. */
3162    
3163    if ((options & PCRE_DFA_RESTART) != 0)
3164      {
3165      if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3166        workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3167          return PCRE_ERROR_DFA_BADRESTART;
3168      }
3169    
3170  /* We need to find the pointer to any study data before we test for byte  /* Set up study, callout, and table data */
 flipping, so we scan the extra_data block first. This may set two fields in the  
 match block, so we must initialize them beforehand. However, the other fields  
 in the match block must not be set until after the byte flipping. */  
3171    
3172  md->tables = re->tables;  md->tables = re->tables;
3173  md->callout_data = NULL;  md->callout_data = NULL;
# Line 2717  if (extra_data != NULL) Line 3186  if (extra_data != NULL)
3186      md->tables = extra_data->tables;      md->tables = extra_data->tables;
3187    }    }
3188    
 /* Check that the first field in the block is the magic number. If it is not,  
 test for a regex that was compiled on a host of opposite endianness. If this is  
 the case, flipped values are put in internal_re and internal_study if there was  
 study data too. */  
   
 if (re->magic_number != MAGIC_NUMBER)  
   {  
   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);  
   if (re == NULL) return PCRE_ERROR_BADMAGIC;  
   if (study != NULL) study = &internal_study;  
   }  
   
3189  /* Set some local values */  /* Set some local values */
3190    
3191  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3192  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3193  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3194    
3195  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3196  utf8 = (re->options & PCRE_UTF8) != 0;  /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3197    utf = (re->options & PCRE_UTF8) != 0;
3198  #else  #else
3199  utf8 = FALSE;  utf = FALSE;
3200  #endif  #endif
3201    
3202  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
# Line 2746  anchored = (options & (PCRE_ANCHORED|PCR Line 3204  anchored = (options & (PCRE_ANCHORED|PCR
3204    
3205  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3206    
3207  md->start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3208      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3209  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3210  md->end_subject = end_subject;  md->end_subject = end_subject;
3211  md->start_offset = start_offset;  md->start_offset = start_offset;
3212  md->moptions = options;  md->moptions = options;
# Line 2809  else Line 3267  else
3267  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3268  back the character offset. */  back the character offset. */
3269    
3270  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3271  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3272    {    {
3273    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3274      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3275    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3276      {      {
3277      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3278        {        {
3279        tb &= 0xc0;        offsets[0] = erroroffset;
3280        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3281        }        }
3282    #if defined COMPILE_PCRE8
3283        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3284          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3285    #elif defined COMPILE_PCRE16
3286        return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3287          PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3288    #elif defined COMPILE_PCRE32
3289        return PCRE_ERROR_BADUTF32;
3290    #endif
3291      }      }
3292    #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3293      if (start_offset > 0 && start_offset < length &&
3294            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3295        return PCRE_ERROR_BADUTF8_OFFSET;
3296    #endif
3297    }    }
3298  #endif  #endif
3299    
# Line 2830  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3301  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3301  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3302  in other programs later. */  in other programs later. */
3303    
3304  if (md->tables == NULL) md->tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3305    
3306  /* The lower casing table and the "must be at the start of a line" flag are  /* The "must be at the start of a line" flags are used in a loop when finding
3307  used in a loop when finding where to start. */  where to start. */
3308    
 lcc = md->tables + lcc_offset;  
3309  startline = (re->flags & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
3310  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3311    
# Line 2849  if (!anchored) Line 3319  if (!anchored)
3319    {    {
3320    if ((re->flags & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3321      {      {
3322      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3323      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = (pcre_uchar)(re->first_char);
3324        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3325          {
3326          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3327    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3328          if (utf && first_char > 127)
3329            first_char2 = UCD_OTHERCASE(first_char);
3330    #endif
3331          }
3332      }      }
3333    else    else
3334      {      {
# Line 2866  character" set. */ Line 3343  character" set. */
3343    
3344  if ((re->flags & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3345    {    {
3346    req_byte = re->req_byte & 255;    has_req_char = TRUE;
3347    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_char = req_char2 = (pcre_uchar)(re->req_char);
3348    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3349        {
3350        req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3351    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3352        if (utf && req_char > 127)
3353          req_char2 = UCD_OTHERCASE(req_char);
3354    #endif
3355        }
3356    }    }
3357    
3358  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
# Line 2881  for (;;) Line 3365  for (;;)
3365    
3366    if ((options & PCRE_DFA_RESTART) == 0)    if ((options & PCRE_DFA_RESTART) == 0)
3367      {      {
3368      const uschar *save_end_subject = end_subject;      const pcre_uchar *save_end_subject = end_subject;
3369    
3370      /* If firstline is TRUE, the start of the match is constrained to the first      /* If firstline is TRUE, the start of the match is constrained to the first
3371      line of a multiline string. Implement this by temporarily adjusting      line of a multiline string. Implement this by temporarily adjusting
# Line 2890  for (;;) Line 3374  for (;;)
3374    
3375      if (firstline)      if (firstline)
3376        {        {
3377        USPTR t = current_subject;        PCRE_PUCHAR t = current_subject;
3378  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3379        if (utf8)        if (utf)
3380          {          {
3381          while (t < md->end_subject && !IS_NEWLINE(t))          while (t < md->end_subject && !IS_NEWLINE(t))
3382            {            {
3383            t++;            t++;
3384            while (t < end_subject && (*t & 0xc0) == 0x80) t++;            ACROSSCHAR(t < end_subject, *t, t++);
3385            }            }
3386          }          }
3387        else        else
# Line 2908  for (;;) Line 3392  for (;;)
3392    
3393      /* There are some optimizations that avoid running the match if a known      /* There are some optimizations that avoid running the match if a known
3394      starting point is not found. However, there is an option that disables      starting point is not found. However, there is an option that disables
3395      these, for testing and for ensuring that all callouts do actually occur. */      these, for testing and for ensuring that all callouts do actually occur.
3396        The option can be set in the regex by (*NO_START_OPT) or passed in
3397        match-time options. */
3398    
3399      if ((options & PCRE_NO_START_OPTIMIZE) == 0)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3400        {        {
3401        /* Advance to a known first byte. */        /* Advance to a known first char. */
3402    
3403        if (first_byte >= 0)        if (has_first_char)
3404          {          {
3405          if (first_byte_caseless)          if (first_char != first_char2)
3406              {
3407              pcre_uchar csc;
3408            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3409                   lcc[*current_subject] != first_byte)                   (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3410              current_subject++;              current_subject++;
3411              }
3412          else          else
3413            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3414                   *current_subject != first_byte)                   RAWUCHARTEST(current_subject) != first_char)
3415              current_subject++;              current_subject++;
3416          }          }
3417    
# Line 2932  for (;;) Line 3421  for (;;)
3421          {          {
3422          if (current_subject > md->start_subject + start_offset)          if (current_subject > md->start_subject + start_offset)
3423            {            {
3424  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3425            if (utf8)            if (utf)
3426              {              {
3427              while (current_subject < end_subject &&              while (current_subject < end_subject &&
3428                     !WAS_NEWLINE(current_subject))                     !WAS_NEWLINE(current_subject))
3429                {                {
3430                current_subject++;                current_subject++;
3431                while(current_subject < end_subject &&                ACROSSCHAR(current_subject < end_subject, *current_subject,
3432                      (*current_subject & 0xc0) == 0x80)                  current_subject++);
                 current_subject++;  
3433                }                }
3434              }              }
3435            else            else
# Line 2953  for (;;) Line 3441  for (;;)
3441            ANYCRLF, and we are now at a LF, advance the match position by one            ANYCRLF, and we are now at a LF, advance the match position by one
3442            more character. */            more character. */
3443    
3444            if (current_subject[-1] == CHAR_CR &&            if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3445                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3446                 current_subject < end_subject &&                 current_subject < end_subject &&
3447                 *current_subject == CHAR_NL)                 RAWUCHARTEST(current_subject) == CHAR_NL)
3448              current_subject++;              current_subject++;
3449            }            }
3450          }          }
# Line 2967  for (;;) Line 3455  for (;;)
3455          {          {
3456          while (current_subject < end_subject)          while (current_subject < end_subject)
3457            {            {
3458            register unsigned int c = *current_subject;            register pcre_uint32 c = RAWUCHARTEST(current_subject);
3459            if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;  #ifndef COMPILE_PCRE8
3460              else break;            if (c > 255) c = 255;
3461    #endif
3462              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3463                {
3464                current_subject++;
3465    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3466                /* In non 8-bit mode, the iteration will stop for
3467                characters > 255 at the beginning or not stop at all. */
3468                if (utf)
3469                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3470                    current_subject++);
3471    #endif
3472                }
3473              else break;
3474            }            }
3475          }          }
3476        }        }
# Line 2982  for (;;) Line 3483  for (;;)
3483      disabling is explicitly requested (and of course, by the test above, this      disabling is explicitly requested (and of course, by the test above, this
3484      code is not obeyed when restarting after a partial match). */      code is not obeyed when restarting after a partial match). */
3485    
3486      if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3487          (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)          (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3488        {        {
3489        /* If the pattern was studied, a minimum subject length may be set. This        /* If the pattern was studied, a minimum subject length may be set. This
# Line 2991  for (;;) Line 3492  for (;;)
3492        bytes to avoid spending too much time in this optimization. */        bytes to avoid spending too much time in this optimization. */
3493    
3494        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3495            end_subject - current_subject < study->minlength)            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3496          return PCRE_ERROR_NOMATCH;          return PCRE_ERROR_NOMATCH;
3497    
3498        /* If req_byte is set, we know that that character must appear in the        /* If req_char is set, we know that that character must appear in the
3499        subject for the match to succeed. If the first character is set, req_byte        subject for the match to succeed. If the first character is set, req_char
3500        must be later in the subject; otherwise the test starts at the match        must be later in the subject; otherwise the test starts at the match
3501        point. This optimization can save a huge amount of work in patterns with        point. This optimization can save a huge amount of work in patterns with
3502        nested unlimited repeats that aren't going to match. Writing separate        nested unlimited repeats that aren't going to match. Writing separate
# Line 3007  for (;;) Line 3508  for (;;)
3508        patterns. This showed up when somebody was matching /^C/ on a 32-megabyte        patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3509        string... so we don't do this when the string is sufficiently long. */        string... so we don't do this when the string is sufficiently long. */
3510    
3511        if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)        if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3512          {          {
3513          register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);          register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3514    
3515          /* We don't need to repeat the search if we haven't yet reached the          /* We don't need to repeat the search if we haven't yet reached the
3516          place we found it at last time. */          place we found it at last time. */
3517    
3518          if (p > req_byte_ptr)          if (p > req_char_ptr)
3519            {            {
3520            if (req_byte_caseless)            if (req_char != req_char2)
3521              {              {
3522              while (p < end_subject)              while (p < end_subject)
3523                {                {
3524                register int pp = *p++;                register pcre_uint32 pp = RAWUCHARINCTEST(p);
3525                if (pp == req_byte || pp == req_byte2) { p--; break; }                if (pp == req_char || pp == req_char2) { p--; break; }
3526                }                }
3527              }              }
3528            else            else
3529              {              {
3530              while (p < end_subject)              while (p < end_subject)
3531                {                {
3532                if (*p++ == req_byte) { p--; break; }                if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3533                }                }
3534              }              }
3535    
# Line 3041  for (;;) Line 3542  for (;;)
3542            found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3543            the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3544    
3545            req_byte_ptr = p;            req_char_ptr = p;
3546            }            }
3547          }          }
3548        }        }
# Line 3050  for (;;) Line 3551  for (;;)
3551    /* OK, now we can do the business */    /* OK, now we can do the business */
3552    
3553    md->start_used_ptr = current_subject;    md->start_used_ptr = current_subject;
3554      md->recursive = NULL;
3555    
3556    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3557      md,                                /* fixed match data */      md,                                /* fixed match data */
# Line 3060  for (;;) Line 3562  for (;;)
3562      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3563      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3564      wscount,                           /* size of same */      wscount,                           /* size of same */
3565      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3566    
3567    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3568    on only if not anchored. */    on only if not anchored. */
3569    
3570    if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;    if (rc != PCRE_ERROR_NOMATCH || anchored)
3571        {
3572        if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3573          {
3574          offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3575          offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3576          if (offsetcount > 2)
3577            offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3578          }
3579        return rc;
3580        }
3581    
3582    /* Advance to the next subject character unless we are at the end of a line    /* Advance to the next subject character unless we are at the end of a line
3583    and firstline is set. */    and firstline is set. */
3584    
3585    if (firstline && IS_NEWLINE(current_subject)) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3586    current_subject++;    current_subject++;
3587    if (utf8)  #ifdef SUPPORT_UTF
3588      if (utf)
3589      {      {
3590      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      ACROSSCHAR(current_subject < end_subject, *current_subject,
3591        current_subject++;        current_subject++);
3592      }      }
3593    #endif
3594    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3595    
3596    /* If we have just passed a CR and we are now at a LF, and the pattern does    /* If we have just passed a CR and we are now at a LF, and the pattern does
3597    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
3598    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
3599    
3600    if (current_subject[-1] == CHAR_CR &&    if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3601        current_subject < end_subject &&        current_subject < end_subject &&
3602        *current_subject == CHAR_NL &&        RAWUCHARTEST(current_subject) == CHAR_NL &&
3603        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
3604          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
3605           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.473  
changed lines
  Added in v.1365

  ViewVC Help
Powered by ViewVC 1.1.5