/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 406 by ph10, Mon Mar 23 12:05:43 2009 UTC revision 916 by ph10, Wed Feb 15 09:50:53 2012 UTC
# Line 7  and semantics are as close as possible t Line 7  and semantics are as close as possible t
7  below for why this module is different).  below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2009 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 45  FSM). This is NOT Perl- compatible, but Line 45  FSM). This is NOT Perl- compatible, but
45  applications. */  applications. */
46    
47    
48    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49    the performance of his patterns greatly. I could not use it as it stood, as it
50    was not thread safe, and made assumptions about pattern sizes. Also, it caused
51    test 7 to loop, and test 9 to crash with a segfault.
52    
53    The issue is the check for duplicate states, which is done by a simple linear
54    search up the state list. (Grep for "duplicate" below to find the code.) For
55    many patterns, there will never be many states active at one time, so a simple
56    linear search is fine. In patterns that have many active states, it might be a
57    bottleneck. The suggested code used an indexing scheme to remember which states
58    had previously been used for each character, and avoided the linear search when
59    it knew there was no chance of a duplicate. This was implemented when adding
60    states to the state lists.
61    
62    I wrote some thread-safe, not-limited code to try something similar at the time
63    of checking for duplicates (instead of when adding states), using index vectors
64    on the stack. It did give a 13% improvement with one specially constructed
65    pattern for certain subject strings, but on other strings and on many of the
66    simpler patterns in the test suite it did worse. The major problem, I think,
67    was the extra time to initialize the index. This had to be done for each call
68    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69    only once - I suspect this was the cause of the problems with the tests.)
70    
71    Overall, I concluded that the gains in some cases did not outweigh the losses
72    in others, so I abandoned this code. */
73    
74    
75    
76  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
77  #include "config.h"  #include "config.h"
78  #endif  #endif
# Line 78  never stored, so we push them well clear Line 106  never stored, so we push them well clear
106    
107    
108  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
109  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
110  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
111  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. Non-zero values in the table are the offsets from the opcode where
113  that follow must also be modified. */  the character is to be found. ***NOTE*** If the start of this table is
114    modified, the three tables that follow must also be modified. */
115    
116  static const uschar coptable[] = {  static const pcre_uint8 coptable[] = {
117    0,                             /* End                                    */    0,                             /* End                                    */
118    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120    0, 0, 0,                       /* Any, AllAny, Anybyte                   */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0,                          /* \P, \p                                 */
122    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0,                             /* \X                                     */
124      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
125    1,                             /* Char                                   */    1,                             /* Char                                   */
126    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
127    1,                             /* not                                    */    1,                             /* not                                    */
128      1,                             /* noti                                   */
129    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
130    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1+IMM2_SIZE,                   /* exact                                  */
133      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136      1+IMM2_SIZE,                   /* exact I                                */
137      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
139    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1+IMM2_SIZE,                   /* NOT exact                              */
142      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145      1+IMM2_SIZE,                   /* NOT exact I                            */
146      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
148    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150    1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */    1+IMM2_SIZE,                   /* Type exact                             */
151      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
153    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 114  static const uschar coptable[] = { Line 156  static const uschar coptable[] = {
156    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
157    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
158    0,                             /* REF                                    */    0,                             /* REF                                    */
159      0,                             /* REFI                                   */
160    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
161    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
162    0,                             /* Alt                                    */    0,                             /* Alt                                    */
163    0,                             /* Ket                                    */    0,                             /* Ket                                    */
164    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
165    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
166      0,                             /* KetRpos                                */
167      0,                             /* Reverse                                */
168    0,                             /* Assert                                 */    0,                             /* Assert                                 */
169    0,                             /* Assert not                             */    0,                             /* Assert not                             */
170    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
171    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
172      0, 0,                          /* ONCE, ONCE_NC                          */
173      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
174      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
175      0, 0,                          /* CREF, NCREF                            */
176      0, 0,                          /* RREF, NRREF                            */
177      0,                             /* DEF                                    */
178      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
179      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
180      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
181      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
182      0, 0                           /* CLOSE, SKIPZERO  */
183    };
184    
185    /* This table identifies those opcodes that inspect a character. It is used to
186    remember the fact that a character could have been inspected when the end of
187    the subject is reached. ***NOTE*** If the start of this table is modified, the
188    two tables that follow must also be modified. */
189    
190    static const pcre_uint8 poptable[] = {
191      0,                             /* End                                    */
192      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
193      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
194      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
195      1, 1,                          /* \P, \p                                 */
196      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
197      1,                             /* \X                                     */
198      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
199      1,                             /* Char                                   */
200      1,                             /* Chari                                  */
201      1,                             /* not                                    */
202      1,                             /* noti                                   */
203      /* Positive single-char repeats                                          */
204      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
205      1, 1, 1,                       /* upto, minupto, exact                   */
206      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
207      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
208      1, 1, 1,                       /* upto I, minupto I, exact I             */
209      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
210      /* Negative single-char repeats - only for chars < 256                   */
211      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
212      1, 1, 1,                       /* NOT upto, minupto, exact               */
213      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
214      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
215      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
216      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
217      /* Positive type repeats                                                 */
218      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
219      1, 1, 1,                       /* Type upto, minupto, exact              */
220      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
221      /* Character class & ref repeats                                         */
222      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
223      1, 1,                          /* CRRANGE, CRMINRANGE                    */
224      1,                             /* CLASS                                  */
225      1,                             /* NCLASS                                 */
226      1,                             /* XCLASS - variable length               */
227      0,                             /* REF                                    */
228      0,                             /* REFI                                   */
229      0,                             /* RECURSE                                */
230      0,                             /* CALLOUT                                */
231      0,                             /* Alt                                    */
232      0,                             /* Ket                                    */
233      0,                             /* KetRmax                                */
234      0,                             /* KetRmin                                */
235      0,                             /* KetRpos                                */
236    0,                             /* Reverse                                */    0,                             /* Reverse                                */
237    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0,                             /* Assert                                 */
238    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0,                             /* Assert not                             */
239    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
240    0,                             /* RREF                                   */    0,                             /* Assert behind not                      */
241      0, 0,                          /* ONCE, ONCE_NC                          */
242      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
243      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
244      0, 0,                          /* CREF, NCREF                            */
245      0, 0,                          /* RREF, NRREF                            */
246    0,                             /* DEF                                    */    0,                             /* DEF                                    */
247    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
248    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
249    0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */    0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
250      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
251      0, 0                           /* CLOSE, SKIPZERO                        */
252  };  };
253    
254  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
255  and \w */  and \w */
256    
257  static const uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
258    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
259    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
260    ctype_space, ctype_space,    ctype_space, ctype_space,
# Line 146  static const uschar toptable1[] = { Line 262  static const uschar toptable1[] = {
262    0, 0                            /* OP_ANY, OP_ALLANY */    0, 0                            /* OP_ANY, OP_ALLANY */
263  };  };
264    
265  static const uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
266    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
267    ctype_digit, 0,    ctype_digit, 0,
268    ctype_space, 0,    ctype_space, 0,
# Line 163  these structures in, is a vector of ints Line 279  these structures in, is a vector of ints
279  typedef struct stateblock {  typedef struct stateblock {
280    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
281    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
282    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
283  } stateblock;  } stateblock;
284    
285  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
286    
287    
288  #ifdef DEBUG  #ifdef PCRE_DEBUG
289  /*************************************************  /*************************************************
290  *             Print character string             *  *             Print character string             *
291  *************************************************/  *************************************************/
# Line 186  Returns:       nothing Line 301  Returns:       nothing
301  */  */
302    
303  static void  static void
304  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
305  {  {
306  int c;  int c;
307  while (length-- > 0)  while (length-- > 0)
# Line 219  Arguments: Line 334  Arguments:
334    offsetcount       size of same    offsetcount       size of same
335    workspace         vector of workspace    workspace         vector of workspace
336    wscount           size of same    wscount           size of same
   ims               the current ims flags  
337    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
338    
339  Returns:            > 0 => number of match offset pairs placed in offsets  Returns:            > 0 => number of match offset pairs placed in offsets
340                      = 0 => offsets overflowed; longest matches are present                      = 0 => offsets overflowed; longest matches are present
# Line 236  for the current character, one for the f Line 349  for the current character, one for the f
349      { \      { \
350      next_active_state->offset = (x); \      next_active_state->offset = (x); \
351      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
352      next_active_state++; \      next_active_state++; \
353      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
354      } \      } \
# Line 247  for the current character, one for the f Line 359  for the current character, one for the f
359      { \      { \
360      next_active_state->offset = (x); \      next_active_state->offset = (x); \
361      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
362      next_active_state->data   = (z); \      next_active_state->data   = (z); \
363      next_active_state++; \      next_active_state++; \
364      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 259  for the current character, one for the f Line 370  for the current character, one for the f
370      { \      { \
371      next_new_state->offset = (x); \      next_new_state->offset = (x); \
372      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
373      next_new_state++; \      next_new_state++; \
374      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
375      } \      } \
# Line 270  for the current character, one for the f Line 380  for the current character, one for the f
380      { \      { \
381      next_new_state->offset = (x); \      next_new_state->offset = (x); \
382      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
383      next_new_state->data   = (z); \      next_new_state->data   = (z); \
384      next_new_state++; \      next_new_state++; \
385      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 282  for the current character, one for the f Line 391  for the current character, one for the f
391  static int  static int
392  internal_dfa_exec(  internal_dfa_exec(
393    dfa_match_data *md,    dfa_match_data *md,
394    const uschar *this_start_code,    const pcre_uchar *this_start_code,
395    const uschar *current_subject,    const pcre_uchar *current_subject,
396    int start_offset,    int start_offset,
397    int *offsets,    int *offsets,
398    int offsetcount,    int offsetcount,
399    int *workspace,    int *workspace,
400    int wscount,    int wscount,
401    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
402  {  {
403  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
404  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
405    
406  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
407  const uschar *ptr;  const pcre_uchar *ptr;
408  const uschar *end_code, *first_op;  const pcre_uchar *end_code, *first_op;
409    
410    dfa_recursion_info new_recursive;
411    
412  int active_count, new_count, match_count;  int active_count, new_count, match_count;
413    
414  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
415  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
416    
417  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
418  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
419  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
420    
421  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
422  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423  #else  #else
424  BOOL utf8 = FALSE;  BOOL utf = FALSE;
425  #endif  #endif
426    
427    BOOL reset_could_continue = FALSE;
428    
429  rlevel++;  rlevel++;
430  offsetcount &= (-2);  offsetcount &= (-2);
431    
# Line 323  wscount = (wscount - (wscount % (INTS_PE Line 434  wscount = (wscount - (wscount % (INTS_PE
434            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
435    
436  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
437    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
438    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439    
440  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
441  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 337  next_new_state = new_states = active_sta Line 448  next_new_state = new_states = active_sta
448  new_count = 0;  new_count = 0;
449    
450  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
451    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453        ? IMM2_SIZE:0);
454    
455  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 365  if (*first_op == OP_REVERSE) Line 478  if (*first_op == OP_REVERSE)
478    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
479    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
480    
481  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
482    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
483    
484    if (utf8)    if (utf)
485      {      {
486      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
487        {        {
488        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
489        current_subject--;        current_subject--;
490        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
491        }        }
492      }      }
493    else    else
# Line 386  if (*first_op == OP_REVERSE) Line 497  if (*first_op == OP_REVERSE)
497    
498      {      {
499      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
500        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
501      current_subject -= gone_back;      current_subject -= gone_back;
502      }      }
503    
504      /* Save the earliest consulted character */
505    
506      if (current_subject < md->start_used_ptr)
507        md->start_used_ptr = current_subject;
508    
509    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
510    
511    end_code = this_start_code;    end_code = this_start_code;
# Line 398  if (*first_op == OP_REVERSE) Line 514  if (*first_op == OP_REVERSE)
514      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
515      if (back <= gone_back)      if (back <= gone_back)
516        {        {
517        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
519        }        }
520      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 431  else Line 547  else
547    else    else
548      {      {
549      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
550        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552            ? IMM2_SIZE:0);
553      do      do
554        {        {
555        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
556        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
557        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
558        }        }
# Line 444  else Line 562  else
562    
563  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
564    
565  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
566    
567  /* Loop for scanning the subject */  /* Loop for scanning the subject */
568    
# Line 454  for (;;) Line 572  for (;;)
572    int i, j;    int i, j;
573    int clen, dlen;    int clen, dlen;
574    unsigned int c, d;    unsigned int c, d;
575      int forced_fail = 0;
576      BOOL partial_newline = FALSE;
577      BOOL could_continue = reset_could_continue;
578      reset_could_continue = FALSE;
579    
580    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
581    new state list. */    new state list. */
582    
# Line 467  for (;;) Line 589  for (;;)
589    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
590    workspace[1] = active_count;    workspace[1] = active_count;
591    
592  #ifdef DEBUG  #ifdef PCRE_DEBUG
593    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
595    printf("\"\n");    printf("\"\n");
596    
597    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 490  for (;;) Line 612  for (;;)
612    if (ptr < end_subject)    if (ptr < end_subject)
613      {      {
614      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of bytes in the character */
615  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
616      if (utf8) { GETCHARLEN(c, ptr, clen); } else      if (utf) { GETCHARLEN(c, ptr, clen); } else
617  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
618      c = *ptr;      c = *ptr;
619      }      }
620    else    else
# Line 509  for (;;) Line 631  for (;;)
631    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
632      {      {
633      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
634      const uschar *code;      BOOL caseless = FALSE;
635        const pcre_uchar *code;
636      int state_offset = current_state->offset;      int state_offset = current_state->offset;
637      int count, codevalue, rrc;      int count, codevalue, rrc;
638    
639  #ifdef DEBUG  #ifdef PCRE_DEBUG
640      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
641      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
642        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
643          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
644  #endif  #endif
645    
     /* This variable is referred to implicitly in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
646      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
647      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
648      been skipped". */      been skipped". If the could_continue flag was passed over from a previous
649        state, arrange for it to be passed on. */
650    
651      if (state_offset < 0)      if (state_offset < 0)
652        {        {
# Line 535  for (;;) Line 655  for (;;)
655          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));          DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
656          ADD_NEW_DATA(state_offset, current_state->count,          ADD_NEW_DATA(state_offset, current_state->count,
657            current_state->data - 1);            current_state->data - 1);
658            if (could_continue) reset_could_continue = TRUE;
659          continue;          continue;
660          }          }
661        else        else
# Line 543  for (;;) Line 664  for (;;)
664          }          }
665        }        }
666    
667      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
668        See the note at the head of this module about the possibility of improving
669        performance here. */
670    
671      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
672        {        {
# Line 560  for (;;) Line 683  for (;;)
683      code = start_code + state_offset;      code = start_code + state_offset;
684      codevalue = *code;      codevalue = *code;
685    
686        /* If this opcode inspects a character, but we are at the end of the
687        subject, remember the fact for use when testing for a partial match. */
688    
689        if (clen == 0 && poptable[codevalue] != 0)
690          could_continue = TRUE;
691    
692      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
693      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
694      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
# Line 574  for (;;) Line 703  for (;;)
703      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
704        {        {
705        dlen = 1;        dlen = 1;
706  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
707        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
708  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
709        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
710        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
711          {          {
# Line 606  for (;;) Line 735  for (;;)
735    
736      switch (codevalue)      switch (codevalue)
737        {        {
738    /* ========================================================================== */
739          /* These cases are never obeyed. This is a fudge that causes a compile-
740          time error if the vectors coptable or poptable, which are indexed by
741          opcode, are not the correct length. It seems to be the only way to do
742          such a check at compile time, as the sizeof() operator does not work
743          in the C preprocessor. */
744    
745          case OP_TABLE_LENGTH:
746          case OP_TABLE_LENGTH +
747            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
748             (sizeof(poptable) == OP_TABLE_LENGTH)):
749          break;
750    
751  /* ========================================================================== */  /* ========================================================================== */
752        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
753        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
754        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
755          subpattern, because the possessive subpattern repeats are always handled
756          using recursive calls. Thus, it never adds any new states.
757    
758          At the end of the (sub)pattern, unless we have an empty string and
759          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
760          start of the subject, save the match data, shifting up all previous
761        matches so we always have the longest first. */        matches so we always have the longest first. */
762    
763        case OP_KET:        case OP_KET:
764        case OP_KETRMIN:        case OP_KETRMIN:
765        case OP_KETRMAX:        case OP_KETRMAX:
766          case OP_KETRPOS:
767        if (code != end_code)        if (code != end_code)
768          {          {
769          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 624  for (;;) Line 772  for (;;)
772            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
773            }            }
774          }          }
775        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
776          {          {
777          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
778            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
779              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
780          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
781          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
782          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
783            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
784            offsets[0] = current_subject - start_subject;                match_count = 0;
785            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
786            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
787              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
788            }              {
789          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
790            {              offsets[1] = (int)(ptr - start_subject);
791            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
792              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], current_subject));
793              match_count, rlevel*2-2, SP));              }
794            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
795                {
796                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
797                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
798                  match_count, rlevel*2-2, SP));
799                return match_count;
800                }
801            }            }
802          }          }
803        break;        break;
# Line 655  for (;;) Line 809  for (;;)
809        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
810        case OP_ALT:        case OP_ALT:
811        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
812        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
813        break;        break;
814    
815        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 663  for (;;) Line 817  for (;;)
817        case OP_SBRA:        case OP_SBRA:
818        do        do
819          {          {
820          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
821          code += GET(code, 1);          code += GET(code, 1);
822          }          }
823        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 672  for (;;) Line 826  for (;;)
826        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
827        case OP_CBRA:        case OP_CBRA:
828        case OP_SCBRA:        case OP_SCBRA:
829        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
830        code += GET(code, 1);        code += GET(code, 1);
831        while (*code == OP_ALT)        while (*code == OP_ALT)
832          {          {
833          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
834          code += GET(code, 1);          code += GET(code, 1);
835          }          }
836        break;        break;
# Line 687  for (;;) Line 841  for (;;)
841        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
842        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
843        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
844        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
845        break;        break;
846    
847        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
848        case OP_SKIPZERO:        case OP_SKIPZERO:
849        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
850        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
851        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
852        break;        break;
853    
854        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
855        case OP_CIRC:        case OP_CIRC:
856        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
857          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
858        break;        break;
859    
860        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
861        case OP_EOD:        case OP_CIRCM:
862        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
863              (ptr != end_subject && WAS_NEWLINE(ptr)))
864            { ADD_ACTIVE(state_offset + 1, 0); }
865        break;        break;
866    
867        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
868        case OP_OPT:        case OP_EOD:
869        ims = code[1];        if (ptr >= end_subject)
870        ADD_ACTIVE(state_offset + 2, 0);          {
871            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
872              could_continue = TRUE;
873            else { ADD_ACTIVE(state_offset + 1, 0); }
874            }
875        break;        break;
876    
877        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 748  for (;;) Line 905  for (;;)
905    
906        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
907        case OP_EODN:        case OP_EODN:
908        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
909            could_continue = TRUE;
910          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
911          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
912        break;        break;
913    
# Line 756  for (;;) Line 915  for (;;)
915        case OP_DOLL:        case OP_DOLL:
916        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
917          {          {
918          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
919              could_continue = TRUE;
920            else if (clen == 0 ||
921              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
922                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 (ptr == end_subject - md->nllen)
923              ))              ))
924            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
925            else if (ptr + 1 >= md->end_subject &&
926                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
927                     NLBLOCK->nltype == NLTYPE_FIXED &&
928                     NLBLOCK->nllen == 2 &&
929                     c == NLBLOCK->nl[0])
930              {
931              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
932                {
933                reset_could_continue = TRUE;
934                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
935                }
936              else could_continue = partial_newline = TRUE;
937              }
938            }
939          break;
940    
941          /*-----------------------------------------------------------------*/
942          case OP_DOLLM:
943          if ((md->moptions & PCRE_NOTEOL) == 0)
944            {
945            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
946              could_continue = TRUE;
947            else if (clen == 0 ||
948                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
949              { ADD_ACTIVE(state_offset + 1, 0); }
950            else if (ptr + 1 >= md->end_subject &&
951                     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
952                     NLBLOCK->nltype == NLTYPE_FIXED &&
953                     NLBLOCK->nllen == 2 &&
954                     c == NLBLOCK->nl[0])
955              {
956              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
957                {
958                reset_could_continue = TRUE;
959                ADD_NEW_DATA(-(state_offset + 1), 0, 1);
960                }
961              else could_continue = partial_newline = TRUE;
962              }
963          }          }
964        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        else if (IS_NEWLINE(ptr))
965          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
966        break;        break;
967    
# Line 793  for (;;) Line 992  for (;;)
992    
993          if (ptr > start_subject)          if (ptr > start_subject)
994            {            {
995            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
996  #ifdef SUPPORT_UTF8            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
997            if (utf8) BACKCHAR(temp);  #ifdef SUPPORT_UTF
998              if (utf) { BACKCHAR(temp); }
999  #endif  #endif
1000            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
1001    #ifdef SUPPORT_UCP
1002              if ((md->poptions & PCRE_UCP) != 0)
1003                {
1004                if (d == '_') left_word = TRUE; else
1005                  {
1006                  int cat = UCD_CATEGORY(d);
1007                  left_word = (cat == ucp_L || cat == ucp_N);
1008                  }
1009                }
1010              else
1011    #endif
1012            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1013            }            }
1014          else left_word = 0;          else left_word = FALSE;
1015    
1016          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
1017            else right_word = 0;            {
1018    #ifdef SUPPORT_UCP
1019              if ((md->poptions & PCRE_UCP) != 0)
1020                {
1021                if (c == '_') right_word = TRUE; else
1022                  {
1023                  int cat = UCD_CATEGORY(c);
1024                  right_word = (cat == ucp_L || cat == ucp_N);
1025                  }
1026                }
1027              else
1028    #endif
1029              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1030              }
1031            else right_word = FALSE;
1032    
1033          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1034            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 830  for (;;) Line 1055  for (;;)
1055            break;            break;
1056    
1057            case PT_LAMP:            case PT_LAMP:
1058            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1059                   prop->chartype == ucp_Lt;
1060            break;            break;
1061    
1062            case PT_GC:            case PT_GC:
1063            OK = _pcre_ucp_gentype[prop->chartype] == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1064            break;            break;
1065    
1066            case PT_PC:            case PT_PC:
# Line 845  for (;;) Line 1071  for (;;)
1071            OK = prop->script == code[2];            OK = prop->script == code[2];
1072            break;            break;
1073    
1074              /* These are specials for combination cases. */
1075    
1076              case PT_ALNUM:
1077              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1078                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1079              break;
1080    
1081              case PT_SPACE:    /* Perl space */
1082              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1083                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1084              break;
1085    
1086              case PT_PXSPACE:  /* POSIX space */
1087              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1088                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1089                   c == CHAR_FF || c == CHAR_CR;
1090              break;
1091    
1092              case PT_WORD:
1093              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1094                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1095                   c == CHAR_UNDERSCORE;
1096              break;
1097    
1098            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1099    
1100            default:            default:
# Line 943  for (;;) Line 1193  for (;;)
1193                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1194            {            {
1195            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1196              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1197            else            else
1198              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1199            }            }
# Line 954  for (;;) Line 1204  for (;;)
1204        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1205        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1206        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1207        ADD_ACTIVE(state_offset + 4, 0);        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1208        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1209        if (clen > 0)        if (clen > 0)
1210          {          {
# Line 969  for (;;) Line 1219  for (;;)
1219              next_active_state--;              next_active_state--;
1220              }              }
1221            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1222              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1223            else            else
1224              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1225            }            }
# Line 999  for (;;) Line 1249  for (;;)
1249            break;            break;
1250    
1251            case PT_LAMP:            case PT_LAMP:
1252            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1253                prop->chartype == ucp_Lt;
1254            break;            break;
1255    
1256            case PT_GC:            case PT_GC:
1257            OK = _pcre_ucp_gentype[prop->chartype] == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1258            break;            break;
1259    
1260            case PT_PC:            case PT_PC:
# Line 1014  for (;;) Line 1265  for (;;)
1265            OK = prop->script == code[3];            OK = prop->script == code[3];
1266            break;            break;
1267    
1268              /* These are specials for combination cases. */
1269    
1270              case PT_ALNUM:
1271              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1272                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1273              break;
1274    
1275              case PT_SPACE:    /* Perl space */
1276              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1277                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1278              break;
1279    
1280              case PT_PXSPACE:  /* POSIX space */
1281              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1282                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1283                   c == CHAR_FF || c == CHAR_CR;
1284              break;
1285    
1286              case PT_WORD:
1287              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1288                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1289                   c == CHAR_UNDERSCORE;
1290              break;
1291    
1292            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1293    
1294            default:            default:
# Line 1042  for (;;) Line 1317  for (;;)
1317        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1318        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1319          {          {
1320          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1321          int ncount = 0;          int ncount = 0;
1322          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1323            {            {
# Line 1221  for (;;) Line 1496  for (;;)
1496            break;            break;
1497    
1498            case PT_LAMP:            case PT_LAMP:
1499            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1500                prop->chartype == ucp_Lt;
1501            break;            break;
1502    
1503            case PT_GC:            case PT_GC:
1504            OK = _pcre_ucp_gentype[prop->chartype] == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1505            break;            break;
1506    
1507            case PT_PC:            case PT_PC:
# Line 1236  for (;;) Line 1512  for (;;)
1512            OK = prop->script == code[3];            OK = prop->script == code[3];
1513            break;            break;
1514    
1515              /* These are specials for combination cases. */
1516    
1517              case PT_ALNUM:
1518              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1519                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1520              break;
1521    
1522              case PT_SPACE:    /* Perl space */
1523              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1524                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1525              break;
1526    
1527              case PT_PXSPACE:  /* POSIX space */
1528              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1529                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1530                   c == CHAR_FF || c == CHAR_CR;
1531              break;
1532    
1533              case PT_WORD:
1534              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1535                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1536                   c == CHAR_UNDERSCORE;
1537              break;
1538    
1539            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1540    
1541            default:            default:
# Line 1273  for (;;) Line 1573  for (;;)
1573        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1574        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1575          {          {
1576          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1577          int ncount = 0;          int ncount = 0;
1578          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1579              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
# Line 1455  for (;;) Line 1755  for (;;)
1755        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1756        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1757        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1758          { ADD_ACTIVE(state_offset + 6, 0); }          { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1759        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1760        if (clen > 0)        if (clen > 0)
1761          {          {
1762          BOOL OK;          BOOL OK;
1763          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1764          switch(code[4])          switch(code[1 + IMM2_SIZE + 1])
1765            {            {
1766            case PT_ANY:            case PT_ANY:
1767            OK = TRUE;            OK = TRUE;
1768            break;            break;
1769    
1770            case PT_LAMP:            case PT_LAMP:
1771            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1772                prop->chartype == ucp_Lt;
1773            break;            break;
1774    
1775            case PT_GC:            case PT_GC:
1776            OK = _pcre_ucp_gentype[prop->chartype] == code[5];            OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1777            break;            break;
1778    
1779            case PT_PC:            case PT_PC:
1780            OK = prop->chartype == code[5];            OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1781            break;            break;
1782    
1783            case PT_SC:            case PT_SC:
1784            OK = prop->script == code[5];            OK = prop->script == code[1 + IMM2_SIZE + 2];
1785              break;
1786    
1787              /* These are specials for combination cases. */
1788    
1789              case PT_ALNUM:
1790              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1791                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1792              break;
1793    
1794              case PT_SPACE:    /* Perl space */
1795              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1796                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1797              break;
1798    
1799              case PT_PXSPACE:  /* POSIX space */
1800              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1801                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1802                   c == CHAR_FF || c == CHAR_CR;
1803              break;
1804    
1805              case PT_WORD:
1806              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1807                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1808                   c == CHAR_UNDERSCORE;
1809            break;            break;
1810    
1811            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1498  for (;;) Line 1823  for (;;)
1823              next_active_state--;              next_active_state--;
1824              }              }
1825            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1826              { ADD_NEW(state_offset + 6, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1827            else            else
1828              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1829            }            }
# Line 1511  for (;;) Line 1836  for (;;)
1836        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1837        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1838        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1839          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1840        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1841        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1842          {          {
1843          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1844          int ncount = 0;          int ncount = 0;
1845          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1846            {            {
# Line 1531  for (;;) Line 1856  for (;;)
1856            ncount++;            ncount++;
1857            nptr += ndlen;            nptr += ndlen;
1858            }            }
1859            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1860                reset_could_continue = TRUE;
1861          if (++count >= GET2(code, 1))          if (++count >= GET2(code, 1))
1862            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1863          else          else
1864            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1865          }          }
# Line 1545  for (;;) Line 1872  for (;;)
1872        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1873        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1874        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1875          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1876        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1877        if (clen > 0)        if (clen > 0)
1878          {          {
# Line 1572  for (;;) Line 1899  for (;;)
1899              next_active_state--;              next_active_state--;
1900              }              }
1901            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1902              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1903            else            else
1904              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, ncount); }
1905            break;            break;
# Line 1589  for (;;) Line 1916  for (;;)
1916        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1917        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1918        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1919          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1920        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1921        if (clen > 0)        if (clen > 0)
1922          {          {
# Line 1618  for (;;) Line 1945  for (;;)
1945              next_active_state--;              next_active_state--;
1946              }              }
1947            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1948              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1949            else            else
1950              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1951            }            }
# Line 1631  for (;;) Line 1958  for (;;)
1958        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1959        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1960        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1961          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1962        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1963        if (clen > 0)        if (clen > 0)
1964          {          {
# Line 1673  for (;;) Line 2000  for (;;)
2000              next_active_state--;              next_active_state--;
2001              }              }
2002            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2003              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2004            else            else
2005              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
2006            }            }
# Line 1692  for (;;) Line 2019  for (;;)
2019        break;        break;
2020    
2021        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2022        case OP_CHARNC:        case OP_CHARI:
2023        if (clen == 0) break;        if (clen == 0) break;
2024    
2025  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2026        if (utf8)        if (utf)
2027          {          {
2028          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2029            {            {
2030            unsigned int othercase;            unsigned int othercase;
2031            if (c < 128) othercase = fcc[c]; else            if (c < 128)
2032                othercase = fcc[c];
2033            /* If we have Unicode property support, we can use it to test the            else
2034            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2035                other case of the character. */
2036  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2037            othercase = UCD_OTHERCASE(c);              othercase = UCD_OTHERCASE(c);
2038  #else  #else
2039            othercase = NOTACHAR;              othercase = NOTACHAR;
2040  #endif  #endif
2041    
2042            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2043            }            }
2044          }          }
2045        else        else
2046  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2047          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2048          {          {
2049          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2050              { ADD_NEW(state_offset + 2, 0); }
2051          }          }
2052        break;        break;
2053    
# Line 1734  for (;;) Line 2061  for (;;)
2061        case OP_EXTUNI:        case OP_EXTUNI:
2062        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2063          {          {
2064          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
2065          int ncount = 0;          int ncount = 0;
2066          while (nptr < end_subject)          while (nptr < end_subject)
2067            {            {
# Line 1744  for (;;) Line 2071  for (;;)
2071            ncount++;            ncount++;
2072            nptr += nclen;            nptr += nclen;
2073            }            }
2074            if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2075                reset_could_continue = TRUE;
2076          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);          ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2077          }          }
2078        break;        break;
# Line 1769  for (;;) Line 2098  for (;;)
2098          break;          break;
2099    
2100          case 0x000d:          case 0x000d:
2101          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 >= end_subject)
2102              {
2103              ADD_NEW(state_offset + 1, 0);
2104              if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2105                reset_could_continue = TRUE;
2106              }
2107            else if (ptr[1] == 0x0a)
2108            {            {
2109            ADD_NEW_DATA(-(state_offset + 1), 0, 1);            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2110            }            }
2111          else          else
2112            {            {
2113            ADD_NEW(state_offset + 1, 0);            ADD_NEW(state_offset + 1, 0);
2114            }            }
2115          break;          break;
2116          }          }
2117        break;        break;
# Line 1878  for (;;) Line 2213  for (;;)
2213        break;        break;
2214    
2215        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2216        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character casefully. This is only used for
2217        characters, that is, we know that d < 256. The character we are        one-byte characters, that is, we know that d < 256. The character we are
2218        checking (c) can be multibyte. */        checking (c) can be multibyte. */
2219    
2220        case OP_NOT:        case OP_NOT:
2221        if (clen > 0)        if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
         {  
         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;  
         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }  
         }  
2222        break;        break;
2223    
2224        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2225          /* Match a negated single character caselessly. This is only used for
2226          one-byte characters, that is, we know that d < 256. The character we are
2227          checking (c) can be multibyte. */
2228    
2229          case OP_NOTI:
2230          if (clen > 0 && c != d && c != fcc[d])
2231            { ADD_NEW(state_offset + dlen + 1, 0); }
2232          break;
2233    
2234          /*-----------------------------------------------------------------*/
2235          case OP_PLUSI:
2236          case OP_MINPLUSI:
2237          case OP_POSPLUSI:
2238          case OP_NOTPLUSI:
2239          case OP_NOTMINPLUSI:
2240          case OP_NOTPOSPLUSI:
2241          caseless = TRUE;
2242          codevalue -= OP_STARI - OP_STAR;
2243    
2244          /* Fall through */
2245        case OP_PLUS:        case OP_PLUS:
2246        case OP_MINPLUS:        case OP_MINPLUS:
2247        case OP_POSPLUS:        case OP_POSPLUS:
# Line 1902  for (;;) Line 2253  for (;;)
2253        if (clen > 0)        if (clen > 0)
2254          {          {
2255          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2256          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2257            {            {
2258  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2259            if (utf8 && d >= 128)            if (utf && d >= 128)
2260              {              {
2261  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2262              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2263  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2264              }              }
2265            else            else
2266  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2267            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2268            }            }
2269          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2270            {            {
# Line 1930  for (;;) Line 2281  for (;;)
2281        break;        break;
2282    
2283        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2284          case OP_QUERYI:
2285          case OP_MINQUERYI:
2286          case OP_POSQUERYI:
2287          case OP_NOTQUERYI:
2288          case OP_NOTMINQUERYI:
2289          case OP_NOTPOSQUERYI:
2290          caseless = TRUE;
2291          codevalue -= OP_STARI - OP_STAR;
2292          /* Fall through */
2293        case OP_QUERY:        case OP_QUERY:
2294        case OP_MINQUERY:        case OP_MINQUERY:
2295        case OP_POSQUERY:        case OP_POSQUERY:
# Line 1940  for (;;) Line 2300  for (;;)
2300        if (clen > 0)        if (clen > 0)
2301          {          {
2302          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2303          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2304            {            {
2305  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2306            if (utf8 && d >= 128)            if (utf && d >= 128)
2307              {              {
2308  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2309              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2310  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2311              }              }
2312            else            else
2313  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2314            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2315            }            }
2316          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2317            {            {
# Line 1966  for (;;) Line 2326  for (;;)
2326        break;        break;
2327    
2328        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2329          case OP_STARI:
2330          case OP_MINSTARI:
2331          case OP_POSSTARI:
2332          case OP_NOTSTARI:
2333          case OP_NOTMINSTARI:
2334          case OP_NOTPOSSTARI:
2335          caseless = TRUE;
2336          codevalue -= OP_STARI - OP_STAR;
2337          /* Fall through */
2338        case OP_STAR:        case OP_STAR:
2339        case OP_MINSTAR:        case OP_MINSTAR:
2340        case OP_POSSTAR:        case OP_POSSTAR:
# Line 1976  for (;;) Line 2345  for (;;)
2345        if (clen > 0)        if (clen > 0)
2346          {          {
2347          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2348          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2349            {            {
2350  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2351            if (utf8 && d >= 128)            if (utf && d >= 128)
2352              {              {
2353  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2354              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2355  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2356              }              }
2357            else            else
2358  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2359            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2360            }            }
2361          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2362            {            {
# Line 2002  for (;;) Line 2371  for (;;)
2371        break;        break;
2372    
2373        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2374          case OP_EXACTI:
2375          case OP_NOTEXACTI:
2376          caseless = TRUE;
2377          codevalue -= OP_STARI - OP_STAR;
2378          /* Fall through */
2379        case OP_EXACT:        case OP_EXACT:
2380        case OP_NOTEXACT:        case OP_NOTEXACT:
2381        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2382        if (clen > 0)        if (clen > 0)
2383          {          {
2384          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2385          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2386            {            {
2387  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2388            if (utf8 && d >= 128)            if (utf && d >= 128)
2389              {              {
2390  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2391              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2392  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2393              }              }
2394            else            else
2395  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2396            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2397            }            }
2398          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2399            {            {
2400            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2401              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2402            else            else
2403              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2404            }            }
# Line 2032  for (;;) Line 2406  for (;;)
2406        break;        break;
2407    
2408        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2409          case OP_UPTOI:
2410          case OP_MINUPTOI:
2411          case OP_POSUPTOI:
2412          case OP_NOTUPTOI:
2413          case OP_NOTMINUPTOI:
2414          case OP_NOTPOSUPTOI:
2415          caseless = TRUE;
2416          codevalue -= OP_STARI - OP_STAR;
2417          /* Fall through */
2418        case OP_UPTO:        case OP_UPTO:
2419        case OP_MINUPTO:        case OP_MINUPTO:
2420        case OP_POSUPTO:        case OP_POSUPTO:
2421        case OP_NOTUPTO:        case OP_NOTUPTO:
2422        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2423        case OP_NOTPOSUPTO:        case OP_NOTPOSUPTO:
2424        ADD_ACTIVE(state_offset + dlen + 3, 0);        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2425        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2426        if (clen > 0)        if (clen > 0)
2427          {          {
2428          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2429          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2430            {            {
2431  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2432            if (utf8 && d >= 128)            if (utf && d >= 128)
2433              {              {
2434  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2435              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2436  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2437              }              }
2438            else            else
2439  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2440            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2441            }            }
2442          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2443            {            {
# Line 2064  for (;;) Line 2447  for (;;)
2447              next_active_state--;              next_active_state--;
2448              }              }
2449            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2450              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2451            else            else
2452              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2453            }            }
# Line 2081  for (;;) Line 2464  for (;;)
2464          {          {
2465          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2466          int next_state_offset;          int next_state_offset;
2467          const uschar *ecode;          const pcre_uchar *ecode;
2468    
2469          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2470          can set isinclass from it. */          can set isinclass from it. */
2471    
2472          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2473            {            {
2474            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2475            if (clen > 0)            if (clen > 0)
2476              {              {
2477              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2478                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2479              }              }
2480            }            }
2481    
# Line 2103  for (;;) Line 2486  for (;;)
2486          else          else
2487           {           {
2488           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2489           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2490           }           }
2491    
2492          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2493          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2494          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2495    
2496          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2497    
2498          switch (*ecode)          switch (*ecode)
2499            {            {
# Line 2137  for (;;) Line 2520  for (;;)
2520            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2521            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2522            if (count >= GET2(ecode, 1))            if (count >= GET2(ecode, 1))
2523              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2524            if (isinclass)            if (isinclass)
2525              {              {
2526              int max = GET2(ecode, 3);              int max = GET2(ecode, 1 + IMM2_SIZE);
2527              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2528                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2529              else              else
2530                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2531              }              }
# Line 2157  for (;;) Line 2540  for (;;)
2540    
2541  /* ========================================================================== */  /* ========================================================================== */
2542        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2543        to use recursion in order to handle them. The "always failing" assersion        to use recursion in order to handle them. The "always failing" assertion
2544        (?!) is optimised when compiling to OP_FAIL, so we have to support that,        (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2545        though the other "backtracking verbs" are not supported. */        though the other "backtracking verbs" are not supported. */
2546    
2547        case OP_FAIL:        case OP_FAIL:
2548          forced_fail++;    /* Count FAILs for multiple states */
2549        break;        break;
2550    
2551        case OP_ASSERT:        case OP_ASSERT:
# Line 2172  for (;;) Line 2556  for (;;)
2556          int rc;          int rc;
2557          int local_offsets[2];          int local_offsets[2];
2558          int local_workspace[1000];          int local_workspace[1000];
2559          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2560    
2561          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2562    
# Line 2180  for (;;) Line 2564  for (;;)
2564            md,                                   /* static match data */            md,                                   /* static match data */
2565            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2566            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2567            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2568            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2569            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2570            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2571            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2572            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2573    
2574            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2575          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2576              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2577          }          }
2578        break;        break;
2579    
# Line 2210  for (;;) Line 2593  for (;;)
2593          if (code[LINK_SIZE+1] == OP_CALLOUT)          if (code[LINK_SIZE+1] == OP_CALLOUT)
2594            {            {
2595            rrc = 0;            rrc = 0;
2596            if (pcre_callout != NULL)            if (PUBL(callout) != NULL)
2597              {              {
2598              pcre_callout_block cb;              PUBL(callout_block) cb;
2599              cb.version          = 1;   /* Version 1 of the callout block */              cb.version          = 1;   /* Version 1 of the callout block */
2600              cb.callout_number   = code[LINK_SIZE+2];              cb.callout_number   = code[LINK_SIZE+2];
2601              cb.offset_vector    = offsets;              cb.offset_vector    = offsets;
2602    #ifdef COMPILE_PCRE8
2603              cb.subject          = (PCRE_SPTR)start_subject;              cb.subject          = (PCRE_SPTR)start_subject;
2604              cb.subject_length   = end_subject - start_subject;  #else
2605              cb.start_match      = current_subject - start_subject;              cb.subject          = (PCRE_SPTR16)start_subject;
2606              cb.current_position = ptr - start_subject;  #endif
2607                cb.subject_length   = (int)(end_subject - start_subject);
2608                cb.start_match      = (int)(current_subject - start_subject);
2609                cb.current_position = (int)(ptr - start_subject);
2610              cb.pattern_position = GET(code, LINK_SIZE + 3);              cb.pattern_position = GET(code, LINK_SIZE + 3);
2611              cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);              cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2612              cb.capture_top      = 1;              cb.capture_top      = 1;
2613              cb.capture_last     = -1;              cb.capture_last     = -1;
2614              cb.callout_data     = md->callout_data;              cb.callout_data     = md->callout_data;
2615              if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */              cb.mark             = NULL;   /* No (*MARK) support */
2616                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2617              }              }
2618            if (rrc > 0) break;                      /* Fail this thread */            if (rrc > 0) break;                      /* Fail this thread */
2619            code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */            code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2620            }            }
2621    
2622          condcode = code[LINK_SIZE+1];          condcode = code[LINK_SIZE+1];
2623    
2624          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2625    
2626          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2627              return PCRE_ERROR_DFA_UCOND;
2628    
2629          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2630    
# Line 2246  for (;;) Line 2635  for (;;)
2635          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2636          recursed groups. */          recursed groups. */
2637    
2638          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2639            {            {
2640            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE + 2);
2641            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2642            if (recursing > 0)            if (md->recursive != NULL)
2643              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2644            else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }            else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2645            }            }
2646    
# Line 2260  for (;;) Line 2649  for (;;)
2649          else          else
2650            {            {
2651            int rc;            int rc;
2652            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2653            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2654    
2655            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2656    
# Line 2269  for (;;) Line 2658  for (;;)
2658              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2659              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2660              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2661              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2662              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2663              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2664              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2665              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2666              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2667    
2668              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2669            if ((rc >= 0) ==            if ((rc >= 0) ==
2670                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2671              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2672            else            else
2673              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2674            }            }
# Line 2290  for (;;) Line 2678  for (;;)
2678        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2679        case OP_RECURSE:        case OP_RECURSE:
2680          {          {
2681            dfa_recursion_info *ri;
2682          int local_offsets[1000];          int local_offsets[1000];
2683          int local_workspace[1000];          int local_workspace[1000];
2684            const pcre_uchar *callpat = start_code + GET(code, 1);
2685            int recno = (callpat == md->start_code)? 0 :
2686              GET2(callpat, 1 + LINK_SIZE);
2687          int rc;          int rc;
2688    
2689          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2690            recursing + 1));  
2691            /* Check for repeating a recursion without advancing the subject
2692            pointer. This should catch convoluted mutual recursions. (Some simple
2693            cases are caught at compile time.) */
2694    
2695            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2696              if (recno == ri->group_num && ptr == ri->subject_position)
2697                return PCRE_ERROR_RECURSELOOP;
2698    
2699            /* Remember this recursion and where we started it so as to
2700            catch infinite loops. */
2701    
2702            new_recursive.group_num = recno;
2703            new_recursive.subject_position = ptr;
2704            new_recursive.prevrec = md->recursive;
2705            md->recursive = &new_recursive;
2706    
2707          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2708            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2709            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2710            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2711            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2712            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2713            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2714            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2715            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2716            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing + 1);                       /* regex recurse level */  
2717    
2718          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2719            recursing + 1, rc));  
2720            DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2721              rc));
2722    
2723          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2724    
# Line 2325  for (;;) Line 2732  for (;;)
2732            {            {
2733            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2734              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2735              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2736              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #ifdef SUPPORT_UTF
2737                const pcre_uchar *p = start_subject + local_offsets[rc];
2738                const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2739                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2740    #endif
2741              if (charcount > 0)              if (charcount > 0)
2742                {                {
2743                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 2344  for (;;) Line 2753  for (;;)
2753        break;        break;
2754    
2755        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2756          case OP_BRAPOS:
2757          case OP_SBRAPOS:
2758          case OP_CBRAPOS:
2759          case OP_SCBRAPOS:
2760          case OP_BRAPOSZERO:
2761            {
2762            int charcount, matched_count;
2763            const pcre_uchar *local_ptr = ptr;
2764            BOOL allow_zero;
2765    
2766            if (codevalue == OP_BRAPOSZERO)
2767              {
2768              allow_zero = TRUE;
2769              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2770              }
2771            else allow_zero = FALSE;
2772    
2773            /* Loop to match the subpattern as many times as possible as if it were
2774            a complete pattern. */
2775    
2776            for (matched_count = 0;; matched_count++)
2777              {
2778              int local_offsets[2];
2779              int local_workspace[1000];
2780    
2781              int rc = internal_dfa_exec(
2782                md,                                   /* fixed match data */
2783                code,                                 /* this subexpression's code */
2784                local_ptr,                            /* where we currently are */
2785                (int)(ptr - start_subject),           /* start offset */
2786                local_offsets,                        /* offset vector */
2787                sizeof(local_offsets)/sizeof(int),    /* size of same */
2788                local_workspace,                      /* workspace vector */
2789                sizeof(local_workspace)/sizeof(int),  /* size of same */
2790                rlevel);                              /* function recursion level */
2791    
2792              /* Failed to match */
2793    
2794              if (rc < 0)
2795                {
2796                if (rc != PCRE_ERROR_NOMATCH) return rc;
2797                break;
2798                }
2799    
2800              /* Matched: break the loop if zero characters matched. */
2801    
2802              charcount = local_offsets[1] - local_offsets[0];
2803              if (charcount == 0) break;
2804              local_ptr += charcount;    /* Advance temporary position ptr */
2805              }
2806    
2807            /* At this point we have matched the subpattern matched_count
2808            times, and local_ptr is pointing to the character after the end of the
2809            last match. */
2810    
2811            if (matched_count > 0 || allow_zero)
2812              {
2813              const pcre_uchar *end_subpattern = code;
2814              int next_state_offset;
2815    
2816              do { end_subpattern += GET(end_subpattern, 1); }
2817                while (*end_subpattern == OP_ALT);
2818              next_state_offset =
2819                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2820    
2821              /* Optimization: if there are no more active states, and there
2822              are no new states yet set up, then skip over the subject string
2823              right here, to save looping. Otherwise, set up the new state to swing
2824              into action when the end of the matched substring is reached. */
2825    
2826              if (i + 1 >= active_count && new_count == 0)
2827                {
2828                ptr = local_ptr;
2829                clen = 0;
2830                ADD_NEW(next_state_offset, 0);
2831                }
2832              else
2833                {
2834                const pcre_uchar *p = ptr;
2835                const pcre_uchar *pp = local_ptr;
2836                charcount = (int)(pp - p);
2837    #ifdef SUPPORT_UTF
2838                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2839    #endif
2840                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2841                }
2842              }
2843            }
2844          break;
2845    
2846          /*-----------------------------------------------------------------*/
2847        case OP_ONCE:        case OP_ONCE:
2848          case OP_ONCE_NC:
2849          {          {
2850          int local_offsets[2];          int local_offsets[2];
2851          int local_workspace[1000];          int local_workspace[1000];
# Line 2353  for (;;) Line 2854  for (;;)
2854            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2855            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2856            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2857            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2858            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2859            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2860            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2861            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2862            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2863    
2864          if (rc >= 0)          if (rc >= 0)
2865            {            {
2866            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2867            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2868            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2869    
2870            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2871              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2872            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2873                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2874    
2875            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2876            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 2378  for (;;) Line 2878  for (;;)
2878    
2879            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2880                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2881              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2882    
2883            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2884            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 2393  for (;;) Line 2893  for (;;)
2893            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2894            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2895            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2896            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2897    
2898            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2899              {              {
# Line 2416  for (;;) Line 2916  for (;;)
2916              }              }
2917            else            else
2918              {              {
2919              const uschar *p = start_subject + local_offsets[0];  #ifdef SUPPORT_UTF
2920              const uschar *pp = start_subject + local_offsets[1];              const pcre_uchar *p = start_subject + local_offsets[0];
2921              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;              const pcre_uchar *pp = start_subject + local_offsets[1];
2922                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2923    #endif
2924              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2925              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2926                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2927              }              }
   
2928            }            }
2929          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2930          }          }
# Line 2435  for (;;) Line 2936  for (;;)
2936    
2937        case OP_CALLOUT:        case OP_CALLOUT:
2938        rrc = 0;        rrc = 0;
2939        if (pcre_callout != NULL)        if (PUBL(callout) != NULL)
2940          {          {
2941          pcre_callout_block cb;          PUBL(callout_block) cb;
2942          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2943          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2944          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2945    #ifdef COMPILE_PCRE8
2946          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2947          cb.subject_length   = end_subject - start_subject;  #else
2948          cb.start_match      = current_subject - start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
2949          cb.current_position = ptr - start_subject;  #endif
2950            cb.subject_length   = (int)(end_subject - start_subject);
2951            cb.start_match      = (int)(current_subject - start_subject);
2952            cb.current_position = (int)(ptr - start_subject);
2953          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
2954          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
2955          cb.capture_top      = 1;          cb.capture_top      = 1;
2956          cb.capture_last     = -1;          cb.capture_last     = -1;
2957          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2958          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
2959            if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2960          }          }
2961        if (rrc == 0)        if (rrc == 0)
2962          { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }          { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2963        break;        break;
2964    
2965    
# Line 2469  for (;;) Line 2975  for (;;)
2975    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2976    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2977    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2978    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
2979    
2980      The "forced_ fail" variable counts the number of (*F) encountered for the
2981      character. If it is equal to the original active_count (saved in
2982      workspace[1]) it means that (*F) was found on every active state. In this
2983      case we don't want to give a partial match.
2984    
2985      The "could_continue" variable is true if a state could have continued but
2986      for the fact that the end of the subject was reached. */
2987    
2988    if (new_count <= 0)    if (new_count <= 0)
2989      {      {
2990      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2991          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on */
2992          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2993          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
2994          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2995            ||                                           /* or... */
2996            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2997             match_count < 0)                            /* no matches */
2998            ) &&                                         /* And... */
2999            (
3000            ptr >= end_subject ||                  /* Reached end of subject or */
3001            partial_newline                        /* a partial newline */
3002            ) &&
3003            ptr > md->start_used_ptr)              /* Inspected non-empty string */
3004        {        {
3005        if (offsetcount >= 2)        if (offsetcount >= 2)
3006          {          {
3007          offsets[0] = current_subject - start_subject;          offsets[0] = (int)(md->start_used_ptr - start_subject);
3008          offsets[1] = end_subject - start_subject;          offsets[1] = (int)(end_subject - start_subject);
3009          }          }
3010        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
3011        }        }
# Line 2536  Returns:          > 0 => number of match Line 3059  Returns:          > 0 => number of match
3059                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3060  */  */
3061    
3062    #ifdef COMPILE_PCRE8
3063  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3064  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3065    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3066    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3067    #else
3068    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3069    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3070      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3071      int offsetcount, int *workspace, int wscount)
3072    #endif
3073  {  {
3074  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3075  dfa_match_data match_block;  dfa_match_data match_block;
3076  dfa_match_data *md = &match_block;  dfa_match_data *md = &match_block;
3077  BOOL utf8, anchored, startline, firstline;  BOOL utf, anchored, startline, firstline;
3078  const uschar *current_subject, *end_subject, *lcc;  const pcre_uchar *current_subject, *end_subject;
   
 pcre_study_data internal_study;  
3079  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3080    
3081  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3082  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3083  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3084  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3085  int first_byte = -1;  pcre_uchar first_char = 0;
3086  int req_byte = -1;  pcre_uchar first_char2 = 0;
3087  int req_byte2 = -1;  pcre_uchar req_char = 0;
3088    pcre_uchar req_char2 = 0;
3089  int newline;  int newline;
3090    
3091  /* Plausibility checks */  /* Plausibility checks */
# Line 2567  if (re == NULL || subject == NULL || wor Line 3095  if (re == NULL || subject == NULL || wor
3095     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3096  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3097  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3098    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3099    
3100  /* We need to find the pointer to any study data before we test for byte  /* We need to find the pointer to any study data before we test for byte
3101  flipping, so we scan the extra_data block first. This may set two fields in the  flipping, so we scan the extra_data block first. This may set two fields in the
# Line 2591  if (extra_data != NULL) Line 3120  if (extra_data != NULL)
3120    }    }
3121    
3122  /* Check that the first field in the block is the magic number. If it is not,  /* Check that the first field in the block is the magic number. If it is not,
3123  test for a regex that was compiled on a host of opposite endianness. If this is  return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3124  the case, flipped values are put in internal_re and internal_study if there was  REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3125  study data too. */  means that the pattern is likely compiled with different endianness. */
3126    
3127  if (re->magic_number != MAGIC_NUMBER)  if (re->magic_number != MAGIC_NUMBER)
3128    {    return re->magic_number == REVERSED_MAGIC_NUMBER?
3129    re = _pcre_try_flipped(re, &internal_re, study, &internal_study);      PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3130    if (re == NULL) return PCRE_ERROR_BADMAGIC;  if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
   if (study != NULL) study = &internal_study;  
   }  
3131    
3132  /* Set some local values */  /* Set some local values */
3133    
3134  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3135  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3136  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3137    
3138  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3139  utf8 = (re->options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3140    utf = (re->options & PCRE_UTF8) != 0;
3141  #else  #else
3142  utf8 = FALSE;  utf = FALSE;
3143  #endif  #endif
3144    
3145  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
# Line 2619  anchored = (options & (PCRE_ANCHORED|PCR Line 3147  anchored = (options & (PCRE_ANCHORED|PCR
3147    
3148  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3149    
3150  md->start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3151      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3152  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3153  md->end_subject = end_subject;  md->end_subject = end_subject;
3154    md->start_offset = start_offset;
3155  md->moptions = options;  md->moptions = options;
3156  md->poptions = re->options;  md->poptions = re->options;
3157    
# Line 2681  else Line 3210  else
3210  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3211  back the character offset. */  back the character offset. */
3212    
3213  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3214  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3215    {    {
3216    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3217      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3218    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3219      {      {
3220      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3221        {        {
3222        tb &= 0xc0;        offsets[0] = erroroffset;
3223        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3224        }        }
3225        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3226          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3227      }      }
3228      if (start_offset > 0 && start_offset < length &&
3229            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3230        return PCRE_ERROR_BADUTF8_OFFSET;
3231    }    }
3232  #endif  #endif
3233    
# Line 2702  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3235  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3235  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3236  in other programs later. */  in other programs later. */
3237    
3238  if (md->tables == NULL) md->tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3239    
3240  /* The lower casing table and the "must be at the start of a line" flag are  /* The "must be at the start of a line" flags are used in a loop when finding
3241  used in a loop when finding where to start. */  where to start. */
3242    
 lcc = md->tables + lcc_offset;  
3243  startline = (re->flags & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
3244  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3245    
# Line 2721  if (!anchored) Line 3253  if (!anchored)
3253    {    {
3254    if ((re->flags & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3255      {      {
3256      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3257      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = (pcre_uchar)(re->first_char);
3258        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3259          {
3260          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3261    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3262          if (utf && first_char > 127)
3263            first_char2 = UCD_OTHERCASE(first_char);
3264    #endif
3265          }
3266      }      }
3267    else    else
3268      {      {
3269      if (startline && study != NULL &&      if (!startline && study != NULL &&
3270           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3271        start_bits = study->start_bits;        start_bits = study->start_bits;
3272      }      }
3273    }    }
# Line 2738  character" set. */ Line 3277  character" set. */
3277    
3278  if ((re->flags & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3279    {    {
3280    req_byte = re->req_byte & 255;    has_req_char = TRUE;
3281    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_char = req_char2 = (pcre_uchar)(re->req_char);
3282    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3283        {
3284        req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3285    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3286        if (utf && req_char > 127)
3287          req_char2 = UCD_OTHERCASE(req_char);
3288    #endif
3289        }
3290    }    }
3291    
3292  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
# Line 2753  for (;;) Line 3299  for (;;)
3299    
3300    if ((options & PCRE_DFA_RESTART) == 0)    if ((options & PCRE_DFA_RESTART) == 0)
3301      {      {
3302      const uschar *save_end_subject = end_subject;      const pcre_uchar *save_end_subject = end_subject;
3303    
3304      /* If firstline is TRUE, the start of the match is constrained to the first      /* If firstline is TRUE, the start of the match is constrained to the first
3305      line of a multiline string. Implement this by temporarily adjusting      line of a multiline string. Implement this by temporarily adjusting
# Line 2762  for (;;) Line 3308  for (;;)
3308    
3309      if (firstline)      if (firstline)
3310        {        {
3311        USPTR t = current_subject;        PCRE_PUCHAR t = current_subject;
3312  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3313        if (utf8)        if (utf)
3314          {          {
3315          while (t < md->end_subject && !IS_NEWLINE(t))          while (t < md->end_subject && !IS_NEWLINE(t))
3316            {            {
3317            t++;            t++;
3318            while (t < end_subject && (*t & 0xc0) == 0x80) t++;            ACROSSCHAR(t < end_subject, *t, t++);
3319            }            }
3320          }          }
3321        else        else
# Line 2779  for (;;) Line 3325  for (;;)
3325        }        }
3326    
3327      /* There are some optimizations that avoid running the match if a known      /* There are some optimizations that avoid running the match if a known
3328      starting point is not found, or if a known later character is not present.      starting point is not found. However, there is an option that disables
3329      However, there is an option that disables these, for testing and for      these, for testing and for ensuring that all callouts do actually occur.
3330      ensuring that all callouts do actually occur. */      The option can be set in the regex by (*NO_START_OPT) or passed in
3331        match-time options. */
3332    
3333      if ((options & PCRE_NO_START_OPTIMIZE) == 0)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3334        {        {
3335          /* Advance to a known first char. */
3336    
3337        /* Advance to a known first byte. */        if (has_first_char)
   
       if (first_byte >= 0)  
3338          {          {
3339          if (first_byte_caseless)          if (first_char != first_char2)
3340            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3341                   lcc[*current_subject] != first_byte)                *current_subject != first_char && *current_subject != first_char2)
3342              current_subject++;              current_subject++;
3343          else          else
3344            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3345                   *current_subject != first_byte)                   *current_subject != first_char)
3346              current_subject++;              current_subject++;
3347          }          }
3348    
# Line 2806  for (;;) Line 3352  for (;;)
3352          {          {
3353          if (current_subject > md->start_subject + start_offset)          if (current_subject > md->start_subject + start_offset)
3354            {            {
3355  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3356            if (utf8)            if (utf)
3357              {              {
3358              while (current_subject < end_subject &&              while (current_subject < end_subject &&
3359                     !WAS_NEWLINE(current_subject))                     !WAS_NEWLINE(current_subject))
3360                {                {
3361                current_subject++;                current_subject++;
3362                while(current_subject < end_subject &&                ACROSSCHAR(current_subject < end_subject, *current_subject,
3363                      (*current_subject & 0xc0) == 0x80)                  current_subject++);
                 current_subject++;  
3364                }                }
3365              }              }
3366            else            else
# Line 2842  for (;;) Line 3387  for (;;)
3387          while (current_subject < end_subject)          while (current_subject < end_subject)
3388            {            {
3389            register unsigned int c = *current_subject;            register unsigned int c = *current_subject;
3390            if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;  #ifndef COMPILE_PCRE8
3391              else break;            if (c > 255) c = 255;
3392    #endif
3393              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3394                {
3395                current_subject++;
3396    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3397                /* In non 8-bit mode, the iteration will stop for
3398                characters > 255 at the beginning or not stop at all. */
3399                if (utf)
3400                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3401                    current_subject++);
3402    #endif
3403                }
3404              else break;
3405            }            }
3406          }          }
3407        }        }
# Line 2851  for (;;) Line 3409  for (;;)
3409      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3410    
3411      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
3412    
3413    /* If req_byte is set, we know that that character must appear in the subject      /* The following two optimizations are disabled for partial matching or if
3414    for the match to succeed. If the first character is set, req_byte must be      disabling is explicitly requested (and of course, by the test above, this
3415    later in the subject; otherwise the test starts at the match point. This      code is not obeyed when restarting after a partial match). */
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested, and can  
   also be explicitly deactivated. */  
   
   if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&  
       req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
   
     /* We don't need to repeat the search if we haven't yet reached the  
     place we found it at last time. */  
3416    
3417      if (p > req_byte_ptr)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3418            (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3419        {        {
3420        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3421          {        is a lower bound; no actual string of that length may actually match the
3422          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3423            {        bytes to avoid spending too much time in this optimization. */
3424            register int pp = *p++;  
3425            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3426            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3427          }          return PCRE_ERROR_NOMATCH;
3428        else  
3429          /* If req_char is set, we know that that character must appear in the
3430          subject for the match to succeed. If the first character is set, req_char
3431          must be later in the subject; otherwise the test starts at the match
3432          point. This optimization can save a huge amount of work in patterns with
3433          nested unlimited repeats that aren't going to match. Writing separate
3434          code for cased/caseless versions makes it go faster, as does using an
3435          autoincrement and backing off on a match.
3436    
3437          HOWEVER: when the subject string is very, very long, searching to its end
3438          can take a long time, and give bad performance on quite ordinary
3439          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3440          string... so we don't do this when the string is sufficiently long. */
3441    
3442          if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3443          {          {
3444          while (p < end_subject)          register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3445    
3446            /* We don't need to repeat the search if we haven't yet reached the
3447            place we found it at last time. */
3448    
3449            if (p > req_char_ptr)
3450            {            {
3451            if (*p++ == req_byte) { p--; break; }            if (req_char != req_char2)
3452            }              {
3453          }              while (p < end_subject)
3454                  {
3455                  register int pp = *p++;
3456                  if (pp == req_char || pp == req_char2) { p--; break; }
3457                  }
3458                }
3459              else
3460                {
3461                while (p < end_subject)
3462                  {
3463                  if (*p++ == req_char) { p--; break; }
3464                  }
3465                }
3466    
3467        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3468        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3469    
3470        if (p >= end_subject) break;            if (p >= end_subject) break;
3471    
3472        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3473        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3474        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3475    
3476        req_byte_ptr = p;            req_char_ptr = p;
3477              }
3478            }
3479        }        }
3480      }      }   /* End of optimizations that are done when not restarting */
3481    
3482    /* OK, now we can do the business */    /* OK, now we can do the business */
3483    
3484      md->start_used_ptr = current_subject;
3485      md->recursive = NULL;
3486    
3487    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3488      md,                                /* fixed match data */      md,                                /* fixed match data */
3489      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2921  for (;;) Line 3493  for (;;)
3493      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3494      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3495      wscount,                           /* size of same */      wscount,                           /* size of same */
3496      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3497    
3498    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3499    on only if not anchored. */    on only if not anchored. */
# Line 2935  for (;;) Line 3505  for (;;)
3505    
3506    if (firstline && IS_NEWLINE(current_subject)) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3507    current_subject++;    current_subject++;
3508    if (utf8)  #ifdef SUPPORT_UTF
3509      if (utf)
3510      {      {
3511      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      ACROSSCHAR(current_subject < end_subject, *current_subject,
3512        current_subject++;        current_subject++);
3513      }      }
3514    #endif
3515    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3516    
3517    /* If we have just passed a CR and we are now at a LF, and the pattern does    /* If we have just passed a CR and we are now at a LF, and the pattern does

Legend:
Removed from v.406  
changed lines
  Added in v.916

  ViewVC Help
Powered by ViewVC 1.1.5