/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

Left column: revision 341 by ph10, Sat Apr 19 16:41:04 2008 UTC | Right column: revision 850 by zherczeg, Wed Jan 4 17:29:11 2012 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl-compatible, but Line 45  FSM). This is NOT Perl-compatible, but
45  applications. */  applications. */
46    
47    
48    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49    the performance of his patterns greatly. I could not use it as it stood, as it
50    was not thread safe, and made assumptions about pattern sizes. Also, it caused
51    test 7 to loop, and test 9 to crash with a segfault.
52    
53    The issue is the check for duplicate states, which is done by a simple linear
54    search up the state list. (Grep for "duplicate" below to find the code.) For
55    many patterns, there will never be many states active at one time, so a simple
56    linear search is fine. In patterns that have many active states, it might be a
57    bottleneck. The suggested code used an indexing scheme to remember which states
58    had previously been used for each character, and avoided the linear search when
59    it knew there was no chance of a duplicate. This was implemented when adding
60    states to the state lists.
61    
62    I wrote some thread-safe, not-limited code to try something similar at the time
63    of checking for duplicates (instead of when adding states), using index vectors
64    on the stack. It did give a 13% improvement with one specially constructed
65    pattern for certain subject strings, but on other strings and on many of the
66    simpler patterns in the test suite it did worse. The major problem, I think,
67    was the extra time to initialize the index. This had to be done for each call
68    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69    only once - I suspect this was the cause of the problems with the tests.)
70    
71    Overall, I concluded that the gains in some cases did not outweigh the losses
72    in others, so I abandoned this code. */
73    
74    
75    
76  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
77  #include "config.h"  #include "config.h"
78  #endif  #endif
# Line 60  applications. */ Line 89  applications. */
89  #define SP "                   "  #define SP "                   "
90    
91    
   
92  /*************************************************  /*************************************************
93  *      Code parameters and static tables         *  *      Code parameters and static tables         *
94  *************************************************/  *************************************************/
# Line 78  never stored, so we push them well clear Line 106  never stored, so we push them well clear
106    
107    
108  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
109  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
110  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
111  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. Non-zero values in the table are the offsets from the opcode where
113  that follow must also be modified. */  the character is to be found. ***NOTE*** If the start of this table is
114    modified, the three tables that follow must also be modified. */
115    
116  static const uschar coptable[] = {  static const pcre_uint8 coptable[] = {
117    0,                             /* End                                    */    0,                             /* End                                    */
118    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120    0, 0, 0,                       /* Any, AllAny, Anybyte                   */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0,                          /* \P, \p                                 */
122    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0,                             /* \X                                     */
124      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
125    1,                             /* Char                                   */    1,                             /* Char                                   */
126    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
127    1,                             /* not                                    */    1,                             /* not                                    */
128      1,                             /* noti                                   */
129    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
130    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1+IMM2_SIZE,                   /* exact                                  */
133      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136      1+IMM2_SIZE,                   /* exact I                                */
137      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
139    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1+IMM2_SIZE,                   /* NOT exact                              */
142      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145      1+IMM2_SIZE,                   /* NOT exact I                            */
146      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
148    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150    1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */    1+IMM2_SIZE,                   /* Type exact                             */
151      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
153    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 114  static const uschar coptable[] = { Line 156  static const uschar coptable[] = {
156    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
157    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
158    0,                             /* REF                                    */    0,                             /* REF                                    */
159      0,                             /* REFI                                   */
160    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
161    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
162    0,                             /* Alt                                    */    0,                             /* Alt                                    */
163    0,                             /* Ket                                    */    0,                             /* Ket                                    */
164    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
165    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
166      0,                             /* KetRpos                                */
167      0,                             /* Reverse                                */
168    0,                             /* Assert                                 */    0,                             /* Assert                                 */
169    0,                             /* Assert not                             */    0,                             /* Assert not                             */
170    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
171    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
172      0, 0,                          /* ONCE, ONCE_NC                          */
173      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
174      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
175      0, 0,                          /* CREF, NCREF                            */
176      0, 0,                          /* RREF, NRREF                            */
177      0,                             /* DEF                                    */
178      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
179      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
180      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
181      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
182      0, 0                           /* CLOSE, SKIPZERO  */
183    };
184    
185    /* This table identifies those opcodes that inspect a character. It is used to
186    remember the fact that a character could have been inspected when the end of
187    the subject is reached. ***NOTE*** If the start of this table is modified, the
188    two tables that follow must also be modified. */
189    
190    static const pcre_uint8 poptable[] = {
191      0,                             /* End                                    */
192      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
193      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
194      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
195      1, 1,                          /* \P, \p                                 */
196      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
197      1,                             /* \X                                     */
198      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
199      1,                             /* Char                                   */
200      1,                             /* Chari                                  */
201      1,                             /* not                                    */
202      1,                             /* noti                                   */
203      /* Positive single-char repeats                                          */
204      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
205      1, 1, 1,                       /* upto, minupto, exact                   */
206      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
207      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
208      1, 1, 1,                       /* upto I, minupto I, exact I             */
209      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
210      /* Negative single-char repeats - only for chars < 256                   */
211      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
212      1, 1, 1,                       /* NOT upto, minupto, exact               */
213      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
214      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
215      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
216      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
217      /* Positive type repeats                                                 */
218      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
219      1, 1, 1,                       /* Type upto, minupto, exact              */
220      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
221      /* Character class & ref repeats                                         */
222      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
223      1, 1,                          /* CRRANGE, CRMINRANGE                    */
224      1,                             /* CLASS                                  */
225      1,                             /* NCLASS                                 */
226      1,                             /* XCLASS - variable length               */
227      0,                             /* REF                                    */
228      0,                             /* REFI                                   */
229      0,                             /* RECURSE                                */
230      0,                             /* CALLOUT                                */
231      0,                             /* Alt                                    */
232      0,                             /* Ket                                    */
233      0,                             /* KetRmax                                */
234      0,                             /* KetRmin                                */
235      0,                             /* KetRpos                                */
236    0,                             /* Reverse                                */    0,                             /* Reverse                                */
237    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0,                             /* Assert                                 */
238    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0,                             /* Assert not                             */
239    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
240    0,                             /* RREF                                   */    0,                             /* Assert behind not                      */
241      0, 0,                          /* ONCE, ONCE_NC                          */
242      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
243      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
244      0, 0,                          /* CREF, NCREF                            */
245      0, 0,                          /* RREF, NRREF                            */
246    0,                             /* DEF                                    */    0,                             /* DEF                                    */
247    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
248    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
249    0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */    0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
250      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
251      0, 0                           /* CLOSE, SKIPZERO                        */
252  };  };
253    
254  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
255  and \w */  and \w */
256    
257  static const uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
258    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
259    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
260    ctype_space, ctype_space,    ctype_space, ctype_space,
# Line 146  static const uschar toptable1[] = { Line 262  static const uschar toptable1[] = {
262    0, 0                            /* OP_ANY, OP_ALLANY */    0, 0                            /* OP_ANY, OP_ALLANY */
263  };  };
264    
265  static const uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
266    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
267    ctype_digit, 0,    ctype_digit, 0,
268    ctype_space, 0,    ctype_space, 0,
# Line 163  these structures in, is a vector of ints Line 279  these structures in, is a vector of ints
279  typedef struct stateblock {  typedef struct stateblock {
280    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
281    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
282    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
283  } stateblock;  } stateblock;
284    
285  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
286    
287    
288  #ifdef DEBUG  #ifdef PCRE_DEBUG
289  /*************************************************  /*************************************************
290  *             Print character string             *  *             Print character string             *
291  *************************************************/  *************************************************/
# Line 186  Returns:       nothing Line 301  Returns:       nothing
301  */  */
302    
303  static void  static void
304  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
305  {  {
306  int c;  int c;
307  while (length-- > 0)  while (length-- > 0)
# Line 219  Arguments: Line 334  Arguments:
334    offsetcount       size of same    offsetcount       size of same
335    workspace         vector of workspace    workspace         vector of workspace
336    wscount           size of same    wscount           size of same
   ims               the current ims flags  
337    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
338    
339  Returns:            > 0 => number of match offset pairs placed in offsets  Returns:            > 0 => number of match offset pairs placed in offsets
340                      = 0 => offsets overflowed; longest matches are present                      = 0 => offsets overflowed; longest matches are present
341                       -1 => failed to match                       -1 => failed to match
342                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
# Line 236  for the current character, one for the f Line 349  for the current character, one for the f
349      { \      { \
350      next_active_state->offset = (x); \      next_active_state->offset = (x); \
351      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
352      next_active_state++; \      next_active_state++; \
353      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
354      } \      } \
# Line 247  for the current character, one for the f Line 359  for the current character, one for the f
359      { \      { \
360      next_active_state->offset = (x); \      next_active_state->offset = (x); \
361      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
362      next_active_state->data   = (z); \      next_active_state->data   = (z); \
363      next_active_state++; \      next_active_state++; \
364      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 259  for the current character, one for the f Line 370  for the current character, one for the f
370      { \      { \
371      next_new_state->offset = (x); \      next_new_state->offset = (x); \
372      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
373      next_new_state++; \      next_new_state++; \
374      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
375      } \      } \
# Line 270  for the current character, one for the f Line 380  for the current character, one for the f
380      { \      { \
381      next_new_state->offset = (x); \      next_new_state->offset = (x); \
382      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
383      next_new_state->data   = (z); \      next_new_state->data   = (z); \
384      next_new_state++; \      next_new_state++; \
385      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 282  for the current character, one for the f Line 391  for the current character, one for the f
391  static int  static int
392  internal_dfa_exec(  internal_dfa_exec(
393    dfa_match_data *md,    dfa_match_data *md,
394    const uschar *this_start_code,    const pcre_uchar *this_start_code,
395    const uschar *current_subject,    const pcre_uchar *current_subject,
396    int start_offset,    int start_offset,
397    int *offsets,    int *offsets,
398    int offsetcount,    int offsetcount,
399    int *workspace,    int *workspace,
400    int wscount,    int wscount,
401    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
402  {  {
403  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
404  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
405    
406  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
407  const uschar *ptr;  const pcre_uchar *ptr;
408  const uschar *end_code, *first_op;  const pcre_uchar *end_code, *first_op;
409    
410    dfa_recursion_info new_recursive;
411    
412  int active_count, new_count, match_count;  int active_count, new_count, match_count;
413    
414  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
415  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
416    
417  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
418  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
419  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
420    
421  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
422  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423  #else  #else
424  BOOL utf8 = FALSE;  BOOL utf = FALSE;
425  #endif  #endif
426    
427  rlevel++;  rlevel++;
# Line 323  wscount = (wscount - (wscount % (INTS_PE Line 432  wscount = (wscount - (wscount % (INTS_PE
432            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
433    
434  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
435    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
436    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
437    
438  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
439  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 337  next_new_state = new_states = active_sta Line 446  next_new_state = new_states = active_sta
446  new_count = 0;  new_count = 0;
447    
448  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
449    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
450        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
451        ? IMM2_SIZE:0);
452    
453  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
454  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 365  if (*first_op == OP_REVERSE) Line 476  if (*first_op == OP_REVERSE)
476    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
477    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
478    
479  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
480    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
481    
482    if (utf8)    if (utf)
483      {      {
484      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
485        {        {
486        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
487        current_subject--;        current_subject--;
488        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
489        }        }
490      }      }
491    else    else
# Line 386  if (*first_op == OP_REVERSE) Line 495  if (*first_op == OP_REVERSE)
495    
496      {      {
497      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
498        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
499      current_subject -= gone_back;      current_subject -= gone_back;
500      }      }
501    
502      /* Save the earliest consulted character */
503    
504      if (current_subject < md->start_used_ptr)
505        md->start_used_ptr = current_subject;
506    
507    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
508    
509    end_code = this_start_code;    end_code = this_start_code;
# Line 398  if (*first_op == OP_REVERSE) Line 512  if (*first_op == OP_REVERSE)
512      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
513      if (back <= gone_back)      if (back <= gone_back)
514        {        {
515        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
516        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
517        }        }
518      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 431  else Line 545  else
545    else    else
546      {      {
547      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
548        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
549            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
550            ? IMM2_SIZE:0);
551      do      do
552        {        {
553        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
554        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
555        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
556        }        }
# Line 444  else Line 560  else
560    
561  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
562    
563  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
564    
565  /* Loop for scanning the subject */  /* Loop for scanning the subject */
566    
# Line 454  for (;;) Line 570  for (;;)
570    int i, j;    int i, j;
571    int clen, dlen;    int clen, dlen;
572    unsigned int c, d;    unsigned int c, d;
573      int forced_fail = 0;
574      BOOL could_continue = FALSE;
575    
576    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
577    new state list. */    new state list. */
# Line 467  for (;;) Line 585  for (;;)
585    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
586    workspace[1] = active_count;    workspace[1] = active_count;
587    
588  #ifdef DEBUG  #ifdef PCRE_DEBUG
589    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
590    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
591    printf("\"\n");    printf("\"\n");
592    
593    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 490  for (;;) Line 608  for (;;)
608    if (ptr < end_subject)    if (ptr < end_subject)
609      {      {
610      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of bytes in the character */
611  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
612      if (utf8) { GETCHARLEN(c, ptr, clen); } else      if (utf) { GETCHARLEN(c, ptr, clen); } else
613  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
614      c = *ptr;      c = *ptr;
615      }      }
616    else    else
# Line 509  for (;;) Line 627  for (;;)
627    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
628      {      {
629      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
630      const uschar *code;      BOOL caseless = FALSE;
631        const pcre_uchar *code;
632      int state_offset = current_state->offset;      int state_offset = current_state->offset;
633      int count, codevalue;      int count, codevalue, rrc;
 #ifdef SUPPORT_UCP  
     int chartype, script;  
 #endif  
634    
635  #ifdef DEBUG  #ifdef PCRE_DEBUG
636      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
637      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
638        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
639          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
640  #endif  #endif
641    
     /* This variable is referred to implicity in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
642      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
643      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
644      been skipped". */      been skipped". */
# Line 546  for (;;) Line 658  for (;;)
658          }          }
659        }        }
660    
661      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
662        See the note at the head of this module about the possibility of improving
663        performance here. */
664    
665      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
666        {        {
# Line 563  for (;;) Line 677  for (;;)
677      code = start_code + state_offset;      code = start_code + state_offset;
678      codevalue = *code;      codevalue = *code;
679    
680        /* If this opcode inspects a character, but we are at the end of the
681        subject, remember the fact for use when testing for a partial match. */
682    
683        if (clen == 0 && poptable[codevalue] != 0)
684          could_continue = TRUE;
685    
686      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
687      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
688      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
# Line 577  for (;;) Line 697  for (;;)
697      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
698        {        {
699        dlen = 1;        dlen = 1;
700  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
701        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
702  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
703        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
704        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
705          {          {
# Line 609  for (;;) Line 729  for (;;)
729    
730      switch (codevalue)      switch (codevalue)
731        {        {
732    /* ========================================================================== */
733          /* These cases are never obeyed. This is a fudge that causes a compile-
734          time error if the vectors coptable or poptable, which are indexed by
735          opcode, are not the correct length. It seems to be the only way to do
736          such a check at compile time, as the sizeof() operator does not work
737          in the C preprocessor. */
738    
739          case OP_TABLE_LENGTH:
740          case OP_TABLE_LENGTH +
741            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
742             (sizeof(poptable) == OP_TABLE_LENGTH)):
743          break;
744    
745  /* ========================================================================== */  /* ========================================================================== */
746        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
747        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
748        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
749          subpattern, because the possessive subpattern repeats are always handled
750          using recursive calls. Thus, it never adds any new states.
751    
752          At the end of the (sub)pattern, unless we have an empty string and
753          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
754          start of the subject, save the match data, shifting up all previous
755        matches so we always have the longest first. */        matches so we always have the longest first. */
756    
757        case OP_KET:        case OP_KET:
758        case OP_KETRMIN:        case OP_KETRMIN:
759        case OP_KETRMAX:        case OP_KETRMAX:
760          case OP_KETRPOS:
761        if (code != end_code)        if (code != end_code)
762          {          {
763          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 627  for (;;) Line 766  for (;;)
766            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
767            }            }
768          }          }
769        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
770          {          {
771          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
772            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
773              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
774          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
775          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
776          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
777            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
778            offsets[0] = current_subject - start_subject;                match_count = 0;
779            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
780            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
781              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
782            }              {
783          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
784            {              offsets[1] = (int)(ptr - start_subject);
785            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
786              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], current_subject));
787              match_count, rlevel*2-2, SP));              }
788            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
789                {
790                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
791                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
792                  match_count, rlevel*2-2, SP));
793                return match_count;
794                }
795            }            }
796          }          }
797        break;        break;
# Line 658  for (;;) Line 803  for (;;)
803        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
804        case OP_ALT:        case OP_ALT:
805        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
806        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
807        break;        break;
808    
809        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 666  for (;;) Line 811  for (;;)
811        case OP_SBRA:        case OP_SBRA:
812        do        do
813          {          {
814          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
815          code += GET(code, 1);          code += GET(code, 1);
816          }          }
817        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 675  for (;;) Line 820  for (;;)
820        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
821        case OP_CBRA:        case OP_CBRA:
822        case OP_SCBRA:        case OP_SCBRA:
823        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
824        code += GET(code, 1);        code += GET(code, 1);
825        while (*code == OP_ALT)        while (*code == OP_ALT)
826          {          {
827          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
828          code += GET(code, 1);          code += GET(code, 1);
829          }          }
830        break;        break;
# Line 690  for (;;) Line 835  for (;;)
835        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
836        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
837        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
838        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
839        break;        break;
840    
841        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
842        case OP_SKIPZERO:        case OP_SKIPZERO:
843        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
844        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
845        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
846        break;        break;
847    
848        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
849        case OP_CIRC:        case OP_CIRC:
850        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
851          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
852        break;        break;
853    
854        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
855        case OP_EOD:        case OP_CIRCM:
856        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
857              (ptr != end_subject && WAS_NEWLINE(ptr)))
858            { ADD_ACTIVE(state_offset + 1, 0); }
859        break;        break;
860    
861        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
862        case OP_OPT:        case OP_EOD:
863        ims = code[1];        if (ptr >= end_subject)
864        ADD_ACTIVE(state_offset + 2, 0);          {
865            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
866              could_continue = TRUE;
867            else { ADD_ACTIVE(state_offset + 1, 0); }
868            }
869        break;        break;
870    
871        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 739  for (;;) Line 887  for (;;)
887    
888        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
889        case OP_ANY:        case OP_ANY:
890        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
891          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
892        break;        break;
893    
# Line 751  for (;;) Line 899  for (;;)
899    
900        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
901        case OP_EODN:        case OP_EODN:
902        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
903            could_continue = TRUE;
904          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
905          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
906        break;        break;
907    
# Line 759  for (;;) Line 909  for (;;)
909        case OP_DOLL:        case OP_DOLL:
910        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
911          {          {
912          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
913              (IS_NEWLINE(ptr) &&            could_continue = TRUE;
914                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)          else if (clen == 0 ||
915                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
916                   (ptr == end_subject - md->nllen)
917              ))              ))
918            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
919          }          }
920        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        break;
921    
922          /*-----------------------------------------------------------------*/
923          case OP_DOLLM:
924          if ((md->moptions & PCRE_NOTEOL) == 0)
925            {
926            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
927              could_continue = TRUE;
928            else if (clen == 0 ||
929                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
930              { ADD_ACTIVE(state_offset + 1, 0); }
931            }
932          else if (IS_NEWLINE(ptr))
933          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
934        break;        break;
935    
# Line 796  for (;;) Line 960  for (;;)
960    
961          if (ptr > start_subject)          if (ptr > start_subject)
962            {            {
963            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
964  #ifdef SUPPORT_UTF8            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
965            if (utf8) BACKCHAR(temp);  #ifdef SUPPORT_UTF
966              if (utf) { BACKCHAR(temp); }
967  #endif  #endif
968            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
969    #ifdef SUPPORT_UCP
970              if ((md->poptions & PCRE_UCP) != 0)
971                {
972                if (d == '_') left_word = TRUE; else
973                  {
974                  int cat = UCD_CATEGORY(d);
975                  left_word = (cat == ucp_L || cat == ucp_N);
976                  }
977                }
978              else
979    #endif
980            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
981            }            }
982          else left_word = 0;          else left_word = FALSE;
983    
984          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
985            else right_word = 0;            {
986    #ifdef SUPPORT_UCP
987              if ((md->poptions & PCRE_UCP) != 0)
988                {
989                if (c == '_') right_word = TRUE; else
990                  {
991                  int cat = UCD_CATEGORY(c);
992                  right_word = (cat == ucp_L || cat == ucp_N);
993                  }
994                }
995              else
996    #endif
997              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
998              }
999            else right_word = FALSE;
1000    
1001          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1002            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 825  for (;;) Line 1015  for (;;)
1015        if (clen > 0)        if (clen > 0)
1016          {          {
1017          BOOL OK;          BOOL OK;
1018          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1019          switch(code[1])          switch(code[1])
1020            {            {
1021            case PT_ANY:            case PT_ANY:
# Line 833  for (;;) Line 1023  for (;;)
1023            break;            break;
1024    
1025            case PT_LAMP:            case PT_LAMP:
1026            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1027                   prop->chartype == ucp_Lt;
1028            break;            break;
1029    
1030            case PT_GC:            case PT_GC:
1031            OK = category == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1032            break;            break;
1033    
1034            case PT_PC:            case PT_PC:
1035            OK = chartype == code[2];            OK = prop->chartype == code[2];
1036            break;            break;
1037    
1038            case PT_SC:            case PT_SC:
1039            OK = script == code[2];            OK = prop->script == code[2];
1040              break;
1041    
1042              /* These are specials for combination cases. */
1043    
1044              case PT_ALNUM:
1045              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1046                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1047              break;
1048    
1049              case PT_SPACE:    /* Perl space */
1050              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1051                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1052              break;
1053    
1054              case PT_PXSPACE:  /* POSIX space */
1055              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1056                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1057                   c == CHAR_FF || c == CHAR_CR;
1058              break;
1059    
1060              case PT_WORD:
1061              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1062                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1063                   c == CHAR_UNDERSCORE;
1064            break;            break;
1065    
1066            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 877  for (;;) Line 1092  for (;;)
1092          {          {
1093          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1094              (c < 256 &&              (c < 256 &&
1095                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1096                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1097            {            {
1098            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 903  for (;;) Line 1115  for (;;)
1115          {          {
1116          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1117              (c < 256 &&              (c < 256 &&
1118                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1119                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1120            {            {
1121            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 928  for (;;) Line 1137  for (;;)
1137          {          {
1138          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1139              (c < 256 &&              (c < 256 &&
1140                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1141                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1142            {            {
1143            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 951  for (;;) Line 1157  for (;;)
1157          {          {
1158          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1159              (c < 256 &&              (c < 256 &&
1160                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1161                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1162            {            {
1163            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1164              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1165            else            else
1166              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1167            }            }
# Line 969  for (;;) Line 1172  for (;;)
1172        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1173        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1174        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1175        ADD_ACTIVE(state_offset + 4, 0);        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1176        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1177        if (clen > 0)        if (clen > 0)
1178          {          {
1179          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1180              (c < 256 &&              (c < 256 &&
1181                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1182                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1183            {            {
1184            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 987  for (;;) Line 1187  for (;;)
1187              next_active_state--;              next_active_state--;
1188              }              }
1189            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1190              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1191            else            else
1192              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1193            }            }
# Line 1009  for (;;) Line 1209  for (;;)
1209        if (clen > 0)        if (clen > 0)
1210          {          {
1211          BOOL OK;          BOOL OK;
1212          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1213          switch(code[2])          switch(code[2])
1214            {            {
1215            case PT_ANY:            case PT_ANY:
# Line 1017  for (;;) Line 1217  for (;;)
1217            break;            break;
1218    
1219            case PT_LAMP:            case PT_LAMP:
1220            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1221                prop->chartype == ucp_Lt;
1222            break;            break;
1223    
1224            case PT_GC:            case PT_GC:
1225            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1226            break;            break;
1227    
1228            case PT_PC:            case PT_PC:
1229            OK = chartype == code[3];            OK = prop->chartype == code[3];
1230            break;            break;
1231    
1232            case PT_SC:            case PT_SC:
1233            OK = script == code[3];            OK = prop->script == code[3];
1234              break;
1235    
1236              /* These are specials for combination cases. */
1237    
1238              case PT_ALNUM:
1239              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1240                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1241              break;
1242    
1243              case PT_SPACE:    /* Perl space */
1244              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1245                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1246              break;
1247    
1248              case PT_PXSPACE:  /* POSIX space */
1249              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1250                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1251                   c == CHAR_FF || c == CHAR_CR;
1252              break;
1253    
1254              case PT_WORD:
1255              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1256                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1257                   c == CHAR_UNDERSCORE;
1258            break;            break;
1259    
1260            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1058  for (;;) Line 1283  for (;;)
1283        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1284        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1285        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1286        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1287          {          {
1288          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1289          int ncount = 0;          int ncount = 0;
1290          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1291            {            {
# Line 1072  for (;;) Line 1297  for (;;)
1297            int nd;            int nd;
1298            int ndlen = 1;            int ndlen = 1;
1299            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1300            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1301            ncount++;            ncount++;
1302            nptr += ndlen;            nptr += ndlen;
1303            }            }
# Line 1231  for (;;) Line 1456  for (;;)
1456        if (clen > 0)        if (clen > 0)
1457          {          {
1458          BOOL OK;          BOOL OK;
1459          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1460          switch(code[2])          switch(code[2])
1461            {            {
1462            case PT_ANY:            case PT_ANY:
# Line 1239  for (;;) Line 1464  for (;;)
1464            break;            break;
1465    
1466            case PT_LAMP:            case PT_LAMP:
1467            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1468                prop->chartype == ucp_Lt;
1469            break;            break;
1470    
1471            case PT_GC:            case PT_GC:
1472            OK = category == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1473            break;            break;
1474    
1475            case PT_PC:            case PT_PC:
1476            OK = chartype == code[3];            OK = prop->chartype == code[3];
1477            break;            break;
1478    
1479            case PT_SC:            case PT_SC:
1480            OK = script == code[3];            OK = prop->script == code[3];
1481              break;
1482    
1483              /* These are specials for combination cases. */
1484    
1485              case PT_ALNUM:
1486              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1487                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1488              break;
1489    
1490              case PT_SPACE:    /* Perl space */
1491              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1492                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1493              break;
1494    
1495              case PT_PXSPACE:  /* POSIX space */
1496              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1497                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1498                   c == CHAR_FF || c == CHAR_CR;
1499              break;
1500    
1501              case PT_WORD:
1502              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1503                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1504                   c == CHAR_UNDERSCORE;
1505            break;            break;
1506    
1507            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1289  for (;;) Line 1539  for (;;)
1539        QS2:        QS2:
1540    
1541        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1542        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1543          {          {
1544          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1545          int ncount = 0;          int ncount = 0;
1546          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1547              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
# Line 1304  for (;;) Line 1554  for (;;)
1554            int nd;            int nd;
1555            int ndlen = 1;            int ndlen = 1;
1556            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1557            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1558            ncount++;            ncount++;
1559            nptr += ndlen;            nptr += ndlen;
1560            }            }
# Line 1473  for (;;) Line 1723  for (;;)
1723        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1724        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1725        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1726          { ADD_ACTIVE(state_offset + 6, 0); }          { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1727        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1728        if (clen > 0)        if (clen > 0)
1729          {          {
1730          BOOL OK;          BOOL OK;
1731          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1732          switch(code[4])          switch(code[1 + IMM2_SIZE + 1])
1733            {            {
1734            case PT_ANY:            case PT_ANY:
1735            OK = TRUE;            OK = TRUE;
1736            break;            break;
1737    
1738            case PT_LAMP:            case PT_LAMP:
1739            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1740                prop->chartype == ucp_Lt;
1741            break;            break;
1742    
1743            case PT_GC:            case PT_GC:
1744            OK = category == code[5];            OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1745            break;            break;
1746    
1747            case PT_PC:            case PT_PC:
1748            OK = chartype == code[5];            OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1749            break;            break;
1750    
1751            case PT_SC:            case PT_SC:
1752            OK = script == code[5];            OK = prop->script == code[1 + IMM2_SIZE + 2];
1753              break;
1754    
1755              /* These are specials for combination cases. */
1756    
1757              case PT_ALNUM:
1758              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1759                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1760              break;
1761    
1762              case PT_SPACE:    /* Perl space */
1763              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1764                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1765              break;
1766    
1767              case PT_PXSPACE:  /* POSIX space */
1768              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1769                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1770                   c == CHAR_FF || c == CHAR_CR;
1771              break;
1772    
1773              case PT_WORD:
1774              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1775                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1776                   c == CHAR_UNDERSCORE;
1777            break;            break;
1778    
1779            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1516  for (;;) Line 1791  for (;;)
1791              next_active_state--;              next_active_state--;
1792              }              }
1793            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1794              { ADD_NEW(state_offset + 6, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1795            else            else
1796              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1797            }            }
# Line 1529  for (;;) Line 1804  for (;;)
1804        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1805        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1806        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1807          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1808        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1809        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1810          {          {
1811          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1812          int ncount = 0;          int ncount = 0;
1813          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1814            {            {
# Line 1545  for (;;) Line 1820  for (;;)
1820            int nd;            int nd;
1821            int ndlen = 1;            int ndlen = 1;
1822            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1823            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1824            ncount++;            ncount++;
1825            nptr += ndlen;            nptr += ndlen;
1826            }            }
1827          if (++count >= GET2(code, 1))          if (++count >= GET2(code, 1))
1828            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1829          else          else
1830            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1831          }          }
# Line 1563  for (;;) Line 1838  for (;;)
1838        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1839        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1840        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1841          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1842        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1843        if (clen > 0)        if (clen > 0)
1844          {          {
# Line 1590  for (;;) Line 1865  for (;;)
1865              next_active_state--;              next_active_state--;
1866              }              }
1867            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1868              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1869            else            else
1870              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, ncount); }
1871            break;            break;
# Line 1607  for (;;) Line 1882  for (;;)
1882        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1883        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1884        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1885          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1886        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1887        if (clen > 0)        if (clen > 0)
1888          {          {
# Line 1636  for (;;) Line 1911  for (;;)
1911              next_active_state--;              next_active_state--;
1912              }              }
1913            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1914              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1915            else            else
1916              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1917            }            }
# Line 1649  for (;;) Line 1924  for (;;)
1924        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1925        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1926        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1927          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1928        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1929        if (clen > 0)        if (clen > 0)
1930          {          {
# Line 1691  for (;;) Line 1966  for (;;)
1966              next_active_state--;              next_active_state--;
1967              }              }
1968            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1969              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1970            else            else
1971              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1972            }            }
# Line 1710  for (;;) Line 1985  for (;;)
1985        break;        break;
1986    
1987        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1988        case OP_CHARNC:        case OP_CHARI:
1989        if (clen == 0) break;        if (clen == 0) break;
1990    
1991  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1992        if (utf8)        if (utf)
1993          {          {
1994          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1995            {            {
1996            unsigned int othercase;            unsigned int othercase;
1997            if (c < 128) othercase = fcc[c]; else            if (c < 128)
1998                othercase = fcc[c];
1999            /* If we have Unicode property support, we can use it to test the            else
2000            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2001                other case of the character. */
2002  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2003            othercase = _pcre_ucp_othercase(c);              othercase = UCD_OTHERCASE(c);
2004  #else  #else
2005            othercase = NOTACHAR;              othercase = NOTACHAR;
2006  #endif  #endif
2007    
2008            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2009            }            }
2010          }          }
2011        else        else
2012  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2013          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2014          {          {
2015          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2016              { ADD_NEW(state_offset + 2, 0); }
2017          }          }
2018        break;        break;
2019    
# Line 1750  for (;;) Line 2025  for (;;)
2025        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2026    
2027        case OP_EXTUNI:        case OP_EXTUNI:
2028        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2029          {          {
2030          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
2031          int ncount = 0;          int ncount = 0;
2032          while (nptr < end_subject)          while (nptr < end_subject)
2033            {            {
2034            int nclen = 1;            int nclen = 1;
2035            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
2036            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
2037            ncount++;            ncount++;
2038            nptr += nclen;            nptr += nclen;
2039            }            }
# Line 1896  for (;;) Line 2171  for (;;)
2171        break;        break;
2172    
2173        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2174        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character casefully. This is only used for
2175        characters, that is, we know that d < 256. The character we are        one-byte characters, that is, we know that d < 256. The character we are
2176        checking (c) can be multibyte. */        checking (c) can be multibyte. */
2177    
2178        case OP_NOT:        case OP_NOT:
2179        if (clen > 0)        if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2180          {        break;
2181          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;  
2182          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }        /*-----------------------------------------------------------------*/
2183          }        /* Match a negated single character caselessly. This is only used for
2184          one-byte characters, that is, we know that d < 256. The character we are
2185          checking (c) can be multibyte. */
2186    
2187          case OP_NOTI:
2188          if (clen > 0 && c != d && c != fcc[d])
2189            { ADD_NEW(state_offset + dlen + 1, 0); }
2190        break;        break;
2191    
2192        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2193          case OP_PLUSI:
2194          case OP_MINPLUSI:
2195          case OP_POSPLUSI:
2196          case OP_NOTPLUSI:
2197          case OP_NOTMINPLUSI:
2198          case OP_NOTPOSPLUSI:
2199          caseless = TRUE;
2200          codevalue -= OP_STARI - OP_STAR;
2201    
2202          /* Fall through */
2203        case OP_PLUS:        case OP_PLUS:
2204        case OP_MINPLUS:        case OP_MINPLUS:
2205        case OP_POSPLUS:        case OP_POSPLUS:
# Line 1920  for (;;) Line 2211  for (;;)
2211        if (clen > 0)        if (clen > 0)
2212          {          {
2213          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2214          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2215            {            {
2216  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2217            if (utf8 && d >= 128)            if (utf && d >= 128)
2218              {              {
2219  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2220              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2221  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2222              }              }
2223            else            else
2224  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2225            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2226            }            }
2227          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2228            {            {
# Line 1948  for (;;) Line 2239  for (;;)
2239        break;        break;
2240    
2241        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2242          case OP_QUERYI:
2243          case OP_MINQUERYI:
2244          case OP_POSQUERYI:
2245          case OP_NOTQUERYI:
2246          case OP_NOTMINQUERYI:
2247          case OP_NOTPOSQUERYI:
2248          caseless = TRUE;
2249          codevalue -= OP_STARI - OP_STAR;
2250          /* Fall through */
2251        case OP_QUERY:        case OP_QUERY:
2252        case OP_MINQUERY:        case OP_MINQUERY:
2253        case OP_POSQUERY:        case OP_POSQUERY:
# Line 1958  for (;;) Line 2258  for (;;)
2258        if (clen > 0)        if (clen > 0)
2259          {          {
2260          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2261          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2262            {            {
2263  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2264            if (utf8 && d >= 128)            if (utf && d >= 128)
2265              {              {
2266  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2267              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2268  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2269              }              }
2270            else            else
2271  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2272            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2273            }            }
2274          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2275            {            {
# Line 1984  for (;;) Line 2284  for (;;)
2284        break;        break;
2285    
2286        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2287          case OP_STARI:
2288          case OP_MINSTARI:
2289          case OP_POSSTARI:
2290          case OP_NOTSTARI:
2291          case OP_NOTMINSTARI:
2292          case OP_NOTPOSSTARI:
2293          caseless = TRUE;
2294          codevalue -= OP_STARI - OP_STAR;
2295          /* Fall through */
2296        case OP_STAR:        case OP_STAR:
2297        case OP_MINSTAR:        case OP_MINSTAR:
2298        case OP_POSSTAR:        case OP_POSSTAR:
# Line 1994  for (;;) Line 2303  for (;;)
2303        if (clen > 0)        if (clen > 0)
2304          {          {
2305          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2306          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2307            {            {
2308  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2309            if (utf8 && d >= 128)            if (utf && d >= 128)
2310              {              {
2311  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2312              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2313  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2314              }              }
2315            else            else
2316  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2317            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2318            }            }
2319          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2320            {            {
# Line 2020  for (;;) Line 2329  for (;;)
2329        break;        break;
2330    
2331        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2332          case OP_EXACTI:
2333          case OP_NOTEXACTI:
2334          caseless = TRUE;
2335          codevalue -= OP_STARI - OP_STAR;
2336          /* Fall through */
2337        case OP_EXACT:        case OP_EXACT:
2338        case OP_NOTEXACT:        case OP_NOTEXACT:
2339        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2340        if (clen > 0)        if (clen > 0)
2341          {          {
2342          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2343          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2344            {            {
2345  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2346            if (utf8 && d >= 128)            if (utf && d >= 128)
2347              {              {
2348  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2349              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2350  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2351              }              }
2352            else            else
2353  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2354            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2355            }            }
2356          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2357            {            {
2358            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2359              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2360            else            else
2361              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2362            }            }
# Line 2050  for (;;) Line 2364  for (;;)
2364        break;        break;
2365    
2366        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2367          case OP_UPTOI:
2368          case OP_MINUPTOI:
2369          case OP_POSUPTOI:
2370          case OP_NOTUPTOI:
2371          case OP_NOTMINUPTOI:
2372          case OP_NOTPOSUPTOI:
2373          caseless = TRUE;
2374          codevalue -= OP_STARI - OP_STAR;
2375          /* Fall through */
2376        case OP_UPTO:        case OP_UPTO:
2377        case OP_MINUPTO:        case OP_MINUPTO:
2378        case OP_POSUPTO:        case OP_POSUPTO:
2379        case OP_NOTUPTO:        case OP_NOTUPTO:
2380        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2381        case OP_NOTPOSUPTO:        case OP_NOTPOSUPTO:
2382        ADD_ACTIVE(state_offset + dlen + 3, 0);        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2383        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2384        if (clen > 0)        if (clen > 0)
2385          {          {
2386          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2387          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2388            {            {
2389  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2390            if (utf8 && d >= 128)            if (utf && d >= 128)
2391              {              {
2392  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2393              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2394  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2395              }              }
2396            else            else
2397  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2398            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2399            }            }
2400          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2401            {            {
# Line 2082  for (;;) Line 2405  for (;;)
2405              next_active_state--;              next_active_state--;
2406              }              }
2407            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2408              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2409            else            else
2410              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2411            }            }
# Line 2099  for (;;) Line 2422  for (;;)
2422          {          {
2423          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2424          int next_state_offset;          int next_state_offset;
2425          const uschar *ecode;          const pcre_uchar *ecode;
2426    
2427          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2428          can set isinclass from it. */          can set isinclass from it. */
2429    
2430          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2431            {            {
2432            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2433            if (clen > 0)            if (clen > 0)
2434              {              {
2435              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2436                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2437              }              }
2438            }            }
2439    
# Line 2121  for (;;) Line 2444  for (;;)
2444          else          else
2445           {           {
2446           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2447           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2448           }           }
2449    
2450          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2451          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2452          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2453    
2454          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2455    
2456          switch (*ecode)          switch (*ecode)
2457            {            {
# Line 2155  for (;;) Line 2478  for (;;)
2478            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2479            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2480            if (count >= GET2(ecode, 1))            if (count >= GET2(ecode, 1))
2481              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2482            if (isinclass)            if (isinclass)
2483              {              {
2484              int max = GET2(ecode, 3);              int max = GET2(ecode, 1 + IMM2_SIZE);
2485              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2486                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2487              else              else
2488                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2489              }              }
# Line 2175  for (;;) Line 2498  for (;;)
2498    
2499  /* ========================================================================== */  /* ========================================================================== */
2500        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2501        to use recursion in order to handle them. The "always failing" assersion        to use recursion in order to handle them. The "always failing" assertion
2502        (?!) is optimised when compiling to OP_FAIL, so we have to support that,        (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2503        though the other "backtracking verbs" are not supported. */        though the other "backtracking verbs" are not supported. */
2504    
2505        case OP_FAIL:        case OP_FAIL:
2506        break;        forced_fail++;    /* Count FAILs for multiple states */
2507          break;
2508    
2509        case OP_ASSERT:        case OP_ASSERT:
2510        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 2190  for (;;) Line 2514  for (;;)
2514          int rc;          int rc;
2515          int local_offsets[2];          int local_offsets[2];
2516          int local_workspace[1000];          int local_workspace[1000];
2517          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2518    
2519          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2520    
# Line 2198  for (;;) Line 2522  for (;;)
2522            md,                                   /* static match data */            md,                                   /* static match data */
2523            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2524            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2525            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2526            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2527            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2528            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2529            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2530            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2531    
2532            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2533          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2534              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2535          }          }
2536        break;        break;
2537    
# Line 2218  for (;;) Line 2541  for (;;)
2541          {          {
2542          int local_offsets[1000];          int local_offsets[1000];
2543          int local_workspace[1000];          int local_workspace[1000];
2544          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2545            int condcode;
2546    
2547            /* Because of the way auto-callout works during compile, a callout item
2548            is inserted between OP_COND and an assertion condition. This does not
2549            happen for the other conditions. */
2550    
2551            if (code[LINK_SIZE+1] == OP_CALLOUT)
2552              {
2553              rrc = 0;
2554              if (PUBL(callout) != NULL)
2555                {
2556                PUBL(callout_block) cb;
2557                cb.version          = 1;   /* Version 1 of the callout block */
2558                cb.callout_number   = code[LINK_SIZE+2];
2559                cb.offset_vector    = offsets;
2560                cb.subject          = (PCRE_SPTR)start_subject;
2561                cb.subject_length   = (int)(end_subject - start_subject);
2562                cb.start_match      = (int)(current_subject - start_subject);
2563                cb.current_position = (int)(ptr - start_subject);
2564                cb.pattern_position = GET(code, LINK_SIZE + 3);
2565                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2566                cb.capture_top      = 1;
2567                cb.capture_last     = -1;
2568                cb.callout_data     = md->callout_data;
2569                cb.mark             = NULL;   /* No (*MARK) support */
2570                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2571                }
2572              if (rrc > 0) break;                      /* Fail this thread */
2573              code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2574              }
2575    
2576            condcode = code[LINK_SIZE+1];
2577    
2578          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2579    
2580          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2581              return PCRE_ERROR_DFA_UCOND;
2582    
2583          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2584    
2585          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2586            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2587    
2588          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2589          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2590          recursed groups. */          recursed groups. */
2591    
2592          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2593            {            {
2594            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE + 2);
2595            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2596            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (md->recursive != NULL)
2597              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2598              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2599            }            }
2600    
2601          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2248  for (;;) Line 2603  for (;;)
2603          else          else
2604            {            {
2605            int rc;            int rc;
2606            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2607            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2608    
2609            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2610    
# Line 2257  for (;;) Line 2612  for (;;)
2612              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2613              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2614              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2615              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2616              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2617              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2618              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2619              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2620              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2621    
2622              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2623            if ((rc >= 0) ==            if ((rc >= 0) ==
2624                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2625              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2626            else            else
2627              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2628            }            }
2629          }          }
2630        break;        break;
# Line 2278  for (;;) Line 2632  for (;;)
2632        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2633        case OP_RECURSE:        case OP_RECURSE:
2634          {          {
2635            dfa_recursion_info *ri;
2636          int local_offsets[1000];          int local_offsets[1000];
2637          int local_workspace[1000];          int local_workspace[1000];
2638            const pcre_uchar *callpat = start_code + GET(code, 1);
2639            int recno = (callpat == md->start_code)? 0 :
2640              GET2(callpat, 1 + LINK_SIZE);
2641          int rc;          int rc;
2642    
2643          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2644            recursing + 1));  
2645            /* Check for repeating a recursion without advancing the subject
2646            pointer. This should catch convoluted mutual recursions. (Some simple
2647            cases are caught at compile time.) */
2648    
2649            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2650              if (recno == ri->group_num && ptr == ri->subject_position)
2651                return PCRE_ERROR_RECURSELOOP;
2652    
2653            /* Remember this recursion and where we started it so as to
2654            catch infinite loops. */
2655    
2656            new_recursive.group_num = recno;
2657            new_recursive.subject_position = ptr;
2658            new_recursive.prevrec = md->recursive;
2659            md->recursive = &new_recursive;
2660    
2661          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2662            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2663            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2664            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2665            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2666            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2667            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2668            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2669            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2670            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing + 1);                       /* regex recurse level */  
2671    
2672          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2673            recursing + 1, rc));  
2674            DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2675              rc));
2676    
2677          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2678    
# Line 2313  for (;;) Line 2686  for (;;)
2686            {            {
2687            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2688              {              {
2689              const uschar *p = start_subject + local_offsets[rc];              const pcre_uchar *p = start_subject + local_offsets[rc];
2690              const uschar *pp = start_subject + local_offsets[rc+1];              const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2691              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2692              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #ifdef SUPPORT_UTF
2693                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2694    #endif
2695              if (charcount > 0)              if (charcount > 0)
2696                {                {
2697                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 2332  for (;;) Line 2707  for (;;)
2707        break;        break;
2708    
2709        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2710          case OP_BRAPOS:
2711          case OP_SBRAPOS:
2712          case OP_CBRAPOS:
2713          case OP_SCBRAPOS:
2714          case OP_BRAPOSZERO:
2715            {
2716            int charcount, matched_count;
2717            const pcre_uchar *local_ptr = ptr;
2718            BOOL allow_zero;
2719    
2720            if (codevalue == OP_BRAPOSZERO)
2721              {
2722              allow_zero = TRUE;
2723              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2724              }
2725            else allow_zero = FALSE;
2726    
2727            /* Loop to match the subpattern as many times as possible as if it were
2728            a complete pattern. */
2729    
2730            for (matched_count = 0;; matched_count++)
2731              {
2732              int local_offsets[2];
2733              int local_workspace[1000];
2734    
2735              int rc = internal_dfa_exec(
2736                md,                                   /* fixed match data */
2737                code,                                 /* this subexpression's code */
2738                local_ptr,                            /* where we currently are */
2739                (int)(ptr - start_subject),           /* start offset */
2740                local_offsets,                        /* offset vector */
2741                sizeof(local_offsets)/sizeof(int),    /* size of same */
2742                local_workspace,                      /* workspace vector */
2743                sizeof(local_workspace)/sizeof(int),  /* size of same */
2744                rlevel);                              /* function recursion level */
2745    
2746              /* Failed to match */
2747    
2748              if (rc < 0)
2749                {
2750                if (rc != PCRE_ERROR_NOMATCH) return rc;
2751                break;
2752                }
2753    
2754              /* Matched: break the loop if zero characters matched. */
2755    
2756              charcount = local_offsets[1] - local_offsets[0];
2757              if (charcount == 0) break;
2758              local_ptr += charcount;    /* Advance temporary position ptr */
2759              }
2760    
2761            /* At this point we have matched the subpattern matched_count
2762            times, and local_ptr is pointing to the character after the end of the
2763            last match. */
2764    
2765            if (matched_count > 0 || allow_zero)
2766              {
2767              const pcre_uchar *end_subpattern = code;
2768              int next_state_offset;
2769    
2770              do { end_subpattern += GET(end_subpattern, 1); }
2771                while (*end_subpattern == OP_ALT);
2772              next_state_offset =
2773                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2774    
2775              /* Optimization: if there are no more active states, and there
2776              are no new states yet set up, then skip over the subject string
2777              right here, to save looping. Otherwise, set up the new state to swing
2778              into action when the end of the matched substring is reached. */
2779    
2780              if (i + 1 >= active_count && new_count == 0)
2781                {
2782                ptr = local_ptr;
2783                clen = 0;
2784                ADD_NEW(next_state_offset, 0);
2785                }
2786              else
2787                {
2788                const pcre_uchar *p = ptr;
2789                const pcre_uchar *pp = local_ptr;
2790                charcount = (int)(pp - p);
2791    #ifdef SUPPORT_UTF
2792                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2793    #endif
2794                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2795                }
2796              }
2797            }
2798          break;
2799    
2800          /*-----------------------------------------------------------------*/
2801        case OP_ONCE:        case OP_ONCE:
2802          case OP_ONCE_NC:
2803          {          {
2804          int local_offsets[2];          int local_offsets[2];
2805          int local_workspace[1000];          int local_workspace[1000];
# Line 2341  for (;;) Line 2808  for (;;)
2808            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2809            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2810            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2811            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2812            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2813            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2814            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2815            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2816            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2817    
2818          if (rc >= 0)          if (rc >= 0)
2819            {            {
2820            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2821            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2822            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2823    
2824            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2825              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2826            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2827                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2828    
2829            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2830            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 2366  for (;;) Line 2832  for (;;)
2832    
2833            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2834                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2835              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2836    
2837            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2838            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 2381  for (;;) Line 2847  for (;;)
2847            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2848            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2849            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2850            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2851    
2852            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2853              {              {
# Line 2404  for (;;) Line 2870  for (;;)
2870              }              }
2871            else            else
2872              {              {
2873              const uschar *p = start_subject + local_offsets[0];  #ifdef SUPPORT_UTF
2874              const uschar *pp = start_subject + local_offsets[1];              const pcre_uchar *p = start_subject + local_offsets[0];
2875              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;              const pcre_uchar *pp = start_subject + local_offsets[1];
2876                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2877    #endif
2878              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2879              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2880                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2881              }              }
   
2882            }            }
2883          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2884          }          }
# Line 2422  for (;;) Line 2889  for (;;)
2889        /* Handle callouts */        /* Handle callouts */
2890    
2891        case OP_CALLOUT:        case OP_CALLOUT:
2892        if (pcre_callout != NULL)        rrc = 0;
2893          if (PUBL(callout) != NULL)
2894          {          {
2895          int rrc;          PUBL(callout_block) cb;
         pcre_callout_block cb;  
2896          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2897          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2898          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2899          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2900          cb.subject_length   = end_subject - start_subject;          cb.subject_length   = (int)(end_subject - start_subject);
2901          cb.start_match      = current_subject - start_subject;          cb.start_match      = (int)(current_subject - start_subject);
2902          cb.current_position = ptr - start_subject;          cb.current_position = (int)(ptr - start_subject);
2903          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
2904          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
2905          cb.capture_top      = 1;          cb.capture_top      = 1;
2906          cb.capture_last     = -1;          cb.capture_last     = -1;
2907          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2908          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
2909          if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }          if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2910          }          }
2911          if (rrc == 0)
2912            { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2913        break;        break;
2914    
2915    
# Line 2456  for (;;) Line 2925  for (;;)
2925    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2926    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2927    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2928    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
2929    
2930      The "forced_fail" variable counts the number of (*F) encountered for the
2931      character. If it is equal to the original active_count (saved in
2932      workspace[1]) it means that (*F) was found on every active state. In this
2933      case we don't want to give a partial match.
2934    
2935      The "could_continue" variable is true if a state could have continued but
2936      for the fact that the end of the subject was reached. */
2937    
2938    if (new_count <= 0)    if (new_count <= 0)
2939      {      {
2940      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2941          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on */
2942          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2943            (                                            /* either... */
2944            (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2945            ||                                           /* or... */
2946            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2947             match_count < 0)                            /* no matches */
2948            ) &&                                         /* And... */
2949          ptr >= end_subject &&                  /* Reached end of subject */          ptr >= end_subject &&                  /* Reached end of subject */
2950          ptr > current_subject)                 /* Matched non-empty string */          ptr > md->start_used_ptr)              /* Inspected non-empty string */
2951        {        {
2952        if (offsetcount >= 2)        if (offsetcount >= 2)
2953          {          {
2954          offsets[0] = current_subject - start_subject;          offsets[0] = (int)(md->start_used_ptr - start_subject);
2955          offsets[1] = end_subject - start_subject;          offsets[1] = (int)(end_subject - start_subject);
2956          }          }
2957        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
2958        }        }
# Line 2523  Returns:          > 0 => number of match Line 3006  Returns:          > 0 => number of match
3006                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3007  */  */
3008    
3009  PCRE_EXP_DEFN int  #ifdef COMPILE_PCRE8
3010    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3011  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3012    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3013    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3014    #else
3015    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3016    pcre16_dfa_exec(const pcre *argument_re, const pcre16_extra *extra_data,
3017      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3018      int offsetcount, int *workspace, int wscount)
3019    #endif
3020  {  {
3021  real_pcre *re = (real_pcre *)argument_re;  real_pcre *re = (real_pcre *)argument_re;
3022  dfa_match_data match_block;  dfa_match_data match_block;
3023  dfa_match_data *md = &match_block;  dfa_match_data *md = &match_block;
3024  BOOL utf8, anchored, startline, firstline;  BOOL utf, anchored, startline, firstline;
3025  const uschar *current_subject, *end_subject, *lcc;  const pcre_uchar *current_subject, *end_subject;
3026    const pcre_uint8 *lcc;
3027    
 pcre_study_data internal_study;  
3028  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3029    
3030  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3031  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3032  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3033  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3034  int first_byte = -1;  pcre_uchar first_char = 0;
3035  int req_byte = -1;  pcre_uchar first_char2 = 0;
3036  int req_byte2 = -1;  pcre_uchar req_char = 0;
3037    pcre_uchar req_char2 = 0;
3038  int newline;  int newline;
3039    
3040  /* Plausibility checks */  /* Plausibility checks */
# Line 2554  if (re == NULL || subject == NULL || wor Line 3044  if (re == NULL || subject == NULL || wor
3044     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3045  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3046  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3047    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3048    
3049  /* We need to find the pointer to any study data before we test for byte  /* We need to find the pointer to any study data before we test for byte
3050  flipping, so we scan the extra_data block first. This may set two fields in the  flipping, so we scan the extra_data block first. This may set two fields in the
# Line 2578  if (extra_data != NULL) Line 3069  if (extra_data != NULL)
3069    }    }
3070    
3071  /* Check that the first field in the block is the magic number. If it is not,  /* Check that the first field in the block is the magic number. If it is not,
3072  test for a regex that was compiled on a host of opposite endianness. If this is  return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3073  the case, flipped values are put in internal_re and internal_study if there was  REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3074  study data too. */  means that the pattern is likely compiled with different endianness. */
3075    
3076  if (re->magic_number != MAGIC_NUMBER)  if (re->magic_number != MAGIC_NUMBER)
3077    {    return re->magic_number == REVERSED_MAGIC_NUMBER?
3078    re = _pcre_try_flipped(re, &internal_re, study, &internal_study);      PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3079    if (re == NULL) return PCRE_ERROR_BADMAGIC;  if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
   if (study != NULL) study = &internal_study;  
   }  
3080    
3081  /* Set some local values */  /* Set some local values */
3082    
3083  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3084  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3085  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3086    
3087  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3088  utf8 = (re->options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3089    utf = (re->options & PCRE_UTF8) != 0;
3090  #else  #else
3091  utf8 = FALSE;  utf = FALSE;
3092  #endif  #endif
3093    
3094  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
# Line 2606  anchored = (options & (PCRE_ANCHORED|PCR Line 3096  anchored = (options & (PCRE_ANCHORED|PCR
3096    
3097  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3098    
3099  md->start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3100      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3101  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3102  md->end_subject = end_subject;  md->end_subject = end_subject;
3103    md->start_offset = start_offset;
3104  md->moptions = options;  md->moptions = options;
3105  md->poptions = re->options;  md->poptions = re->options;
3106    
# Line 2632  switch ((((options & PCRE_NEWLINE_BITS) Line 3123  switch ((((options & PCRE_NEWLINE_BITS)
3123           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
3124    {    {
3125    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
3126    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3127    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3128    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
3129         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3130    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
3131    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3132    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2668  else Line 3159  else
3159  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3160  back the character offset. */  back the character offset. */
3161    
3162  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3163  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3164    {    {
3165    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3166      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3167    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3168      {      {
3169      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3170        {        {
3171        tb &= 0xc0;        offsets[0] = erroroffset;
3172        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3173        }        }
3174        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3175          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3176      }      }
3177      if (start_offset > 0 && start_offset < length &&
3178            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3179        return PCRE_ERROR_BADUTF8_OFFSET;
3180    }    }
3181  #endif  #endif
3182    
# Line 2689  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3184  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3184  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3185  in other programs later. */  in other programs later. */
3186    
3187  if (md->tables == NULL) md->tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3188    
3189  /* The lower casing table and the "must be at the start of a line" flag are  /* The lower casing table and the "must be at the start of a line" flag are
3190  used in a loop when finding where to start. */  used in a loop when finding where to start. */
# Line 2708  if (!anchored) Line 3203  if (!anchored)
3203    {    {
3204    if ((re->flags & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3205      {      {
3206      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3207      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = re->first_char;
3208        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3209          {
3210          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3211    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3212          if (utf && first_char > 127)
3213            first_char2 = UCD_OTHERCASE(first_char);
3214    #endif
3215          }
3216      }      }
3217    else    else
3218      {      {
3219      if (startline && study != NULL &&      if (!startline && study != NULL &&
3220           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3221        start_bits = study->start_bits;        start_bits = study->start_bits;
3222      }      }
3223    }    }
# Line 2725  character" set. */ Line 3227  character" set. */
3227    
3228  if ((re->flags & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3229    {    {
3230    req_byte = re->req_byte & 255;    has_req_char = TRUE;
3231    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_char = req_char2 = re->req_char;
3232    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3233        {
3234        req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3235    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3236        if (utf && req_char > 127)
3237          req_char2 = UCD_OTHERCASE(req_char);
3238    #endif
3239        }
3240    }    }
3241    
3242  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
3243  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
3244  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
3245    
3246  for (;;)  for (;;)
3247    {    {
# Line 2741  for (;;) Line 3249  for (;;)
3249    
3250    if ((options & PCRE_DFA_RESTART) == 0)    if ((options & PCRE_DFA_RESTART) == 0)
3251      {      {
3252      const uschar *save_end_subject = end_subject;      const pcre_uchar *save_end_subject = end_subject;
3253    
3254      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
3255      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
3256      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
3257      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
3258    
3259      if (firstline)      if (firstline)
3260        {        {
3261        const uschar *t = current_subject;        PCRE_PUCHAR t = current_subject;
3262    #ifdef SUPPORT_UTF
3263          if (utf)
3264            {
3265            while (t < md->end_subject && !IS_NEWLINE(t))
3266              {
3267              t++;
3268              ACROSSCHAR(t < end_subject, *t, t++);
3269              }
3270            }
3271          else
3272    #endif
3273        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3274        end_subject = t;        end_subject = t;
3275        }        }
3276    
3277      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
3278        starting point is not found. However, there is an option that disables
3279        these, for testing and for ensuring that all callouts do actually occur.
3280        The option can be set in the regex by (*NO_START_OPT) or passed in
3281        match-time options. */
3282    
3283        if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3284        {        {
3285        if (first_byte_caseless)        /* Advance to a known first char. */
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
3286    
3287      /* Or to just after a linebreak for a multiline match if possible */        if (has_first_char)
3288            {
3289            if (first_char != first_char2)
3290              while (current_subject < end_subject &&
3291                  *current_subject != first_char && *current_subject != first_char2)
3292                current_subject++;
3293            else
3294              while (current_subject < end_subject &&
3295                     *current_subject != first_char)
3296                current_subject++;
3297            }
3298    
3299      else if (startline)        /* Or to just after a linebreak for a multiline match if possible */
3300        {  
3301        if (current_subject > md->start_subject + start_offset)        else if (startline)
3302          {          {
3303          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (current_subject > md->start_subject + start_offset)
3304            current_subject++;            {
3305    #ifdef SUPPORT_UTF
3306              if (utf)
3307                {
3308                while (current_subject < end_subject &&
3309                       !WAS_NEWLINE(current_subject))
3310                  {
3311                  current_subject++;
3312                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3313                    current_subject++);
3314                  }
3315                }
3316              else
3317    #endif
3318              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3319                current_subject++;
3320    
3321          /* If we have just passed a CR and the newline option is ANY or            /* If we have just passed a CR and the newline option is ANY or
3322          ANYCRLF, and we are now at a LF, advance the match position by one more            ANYCRLF, and we are now at a LF, advance the match position by one
3323          character. */            more character. */
3324    
3325          if (current_subject[-1] == '\r' &&            if (current_subject[-1] == CHAR_CR &&
3326               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3327               current_subject < end_subject &&                 current_subject < end_subject &&
3328               *current_subject == '\n')                 *current_subject == CHAR_NL)
3329            current_subject++;              current_subject++;
3330              }
3331          }          }
       }  
3332    
3333      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
3334    
3335      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
3336          {          {
3337          register unsigned int c = *current_subject;          while (current_subject < end_subject)
3338          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
3339              register unsigned int c = *current_subject;
3340    #ifndef COMPILE_PCRE8
3341              if (c > 255) c = 255;
3342    #endif
3343              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3344                {
3345                current_subject++;
3346    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3347                /* In non 8-bit mode, the iteration will stop for
3348                characters > 255 at the beginning or not stop at all. */
3349                if (utf)
3350                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3351                    current_subject++);
3352    #endif
3353                }
3354            else break;            else break;
3355              }
3356          }          }
3357        }        }
3358    
3359      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3360    
3361      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
   
   /* If req_byte is set, we know that that character must appear in the subject  
   for the match to succeed. If the first character is set, req_byte must be  
   later in the subject; otherwise the test starts at the match point. This  
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested.  
   */  
   
   if (req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
3362    
3363      /* We don't need to repeat the search if we haven't yet reached the      /* The following two optimizations are disabled for partial matching or if
3364      place we found it at last time. */      disabling is explicitly requested (and of course, by the test above, this
3365        code is not obeyed when restarting after a partial match). */
3366    
3367      if (p > req_byte_ptr)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3368            (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3369        {        {
3370        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3371          {        is a lower bound; no actual string of that length may actually match the
3372          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3373            {        bytes to avoid spending too much time in this optimization. */
3374            register int pp = *p++;  
3375            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3376            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3377          }          return PCRE_ERROR_NOMATCH;
3378        else  
3379          /* If req_char is set, we know that that character must appear in the
3380          subject for the match to succeed. If the first character is set, req_char
3381          must be later in the subject; otherwise the test starts at the match
3382          point. This optimization can save a huge amount of work in patterns with
3383          nested unlimited repeats that aren't going to match. Writing separate
3384          code for cased/caseless versions makes it go faster, as does using an
3385          autoincrement and backing off on a match.
3386    
3387          HOWEVER: when the subject string is very, very long, searching to its end
3388          can take a long time, and give bad performance on quite ordinary
3389          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3390          string... so we don't do this when the string is sufficiently long. */
3391    
3392          if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3393          {          {
3394          while (p < end_subject)          register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3395    
3396            /* We don't need to repeat the search if we haven't yet reached the
3397            place we found it at last time. */
3398    
3399            if (p > req_char_ptr)
3400            {            {
3401            if (*p++ == req_byte) { p--; break; }            if (req_char != req_char2)
3402            }              {
3403          }              while (p < end_subject)
3404                  {
3405                  register int pp = *p++;
3406                  if (pp == req_char || pp == req_char2) { p--; break; }
3407                  }
3408                }
3409              else
3410                {
3411                while (p < end_subject)
3412                  {
3413                  if (*p++ == req_char) { p--; break; }
3414                  }
3415                }
3416    
3417        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3418        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3419    
3420        if (p >= end_subject) break;            if (p >= end_subject) break;
3421    
3422        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3423        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3424        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3425    
3426        req_byte_ptr = p;            req_char_ptr = p;
3427              }
3428            }
3429        }        }
3430      }      }   /* End of optimizations that are done when not restarting */
3431    
3432    /* OK, now we can do the business */    /* OK, now we can do the business */
3433    
3434      md->start_used_ptr = current_subject;
3435      md->recursive = NULL;
3436    
3437    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3438      md,                                /* fixed match data */      md,                                /* fixed match data */
3439      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2872  for (;;) Line 3443  for (;;)
3443      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3444      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3445      wscount,                           /* size of same */      wscount,                           /* size of same */
3446      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3447    
3448    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3449    on only if not anchored. */    on only if not anchored. */
# Line 2886  for (;;) Line 3455  for (;;)
3455    
3456    if (firstline && IS_NEWLINE(current_subject)) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3457    current_subject++;    current_subject++;
3458    if (utf8)  #ifdef SUPPORT_UTF
3459      if (utf)
3460      {      {
3461      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      ACROSSCHAR(current_subject < end_subject, *current_subject,
3462        current_subject++;        current_subject++);
3463      }      }
3464    #endif
3465    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3466    
3467    /* If we have just passed a CR and we are now at a LF, and the pattern does    /* If we have just passed a CR and we are now at a LF, and the pattern does
3468    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
3469    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
3470    
3471    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
3472        current_subject < end_subject &&        current_subject < end_subject &&
3473        *current_subject == '\n' &&        *current_subject == CHAR_NL &&
3474        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
3475          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
3476           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.341  
changed lines
  Added in v.850

  ViewVC Help
Powered by ViewVC 1.1.5