/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 365 by ph10, Fri Jul 11 17:06:55 2008 UTC revision 779 by ph10, Fri Dec 2 10:39:32 2011 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2011 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl-compatible, but Line 45  FSM). This is NOT Perl-compatible, but
45  applications. */  applications. */
46    
47    
48    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49    the performance of his patterns greatly. I could not use it as it stood, as it
50    was not thread safe, and made assumptions about pattern sizes. Also, it caused
51    test 7 to loop, and test 9 to crash with a segfault.
52    
53    The issue is the check for duplicate states, which is done by a simple linear
54    search up the state list. (Grep for "duplicate" below to find the code.) For
55    many patterns, there will never be many states active at one time, so a simple
56    linear search is fine. In patterns that have many active states, it might be a
57    bottleneck. The suggested code used an indexing scheme to remember which states
58    had previously been used for each character, and avoided the linear search when
59    it knew there was no chance of a duplicate. This was implemented when adding
60    states to the state lists.
61    
62    I wrote some thread-safe, not-limited code to try something similar at the time
63    of checking for duplicates (instead of when adding states), using index vectors
64    on the stack. It did give a 13% improvement with one specially constructed
65    pattern for certain subject strings, but on other strings and on many of the
66    simpler patterns in the test suite it did worse. The major problem, I think,
67    was the extra time to initialize the index. This had to be done for each call
68    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69    only once - I suspect this was the cause of the problems with the tests.)
70    
71    Overall, I concluded that the gains in some cases did not outweigh the losses
72    in others, so I abandoned this code. */
73    
74    
75    
76  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
77  #include "config.h"  #include "config.h"
78  #endif  #endif
# Line 60  applications. */ Line 89  applications. */
89  #define SP "                   "  #define SP "                   "
90    
91    
   
92  /*************************************************  /*************************************************
93  *      Code parameters and static tables         *  *      Code parameters and static tables         *
94  *************************************************/  *************************************************/
# Line 78  never stored, so we push them well clear Line 106  never stored, so we push them well clear
106    
107    
108  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
109  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
110  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
111  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. Non-zero values in the table are the offsets from the opcode where
113  that follow must also be modified. */  the character is to be found. ***NOTE*** If the start of this table is
114    modified, the three tables that follow must also be modified. */
115    
116  static const uschar coptable[] = {  static const uschar coptable[] = {
117    0,                             /* End                                    */    0,                             /* End                                    */
118    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120    0, 0, 0,                       /* Any, AllAny, Anybyte                   */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0,                          /* \P, \p                                 */
122    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0,                             /* \X                                     */
124      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
125    1,                             /* Char                                   */    1,                             /* Char                                   */
126    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
127    1,                             /* not                                    */    1,                             /* not                                    */
128      1,                             /* noti                                   */
129    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
130    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131    3, 3, 3,                       /* upto, minupto, exact                   */    3, 3, 3,                       /* upto, minupto, exact                   */
132    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      3, 3, 3,                       /* upto I, minupto I, exact I             */
135      1, 1, 1, 3,                    /* *+I, ++I, ?+I, upto+I                  */
136    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
137    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
138    3, 3, 3,                       /* NOT upto, minupto, exact               */    3, 3, 3,                       /* NOT upto, minupto, exact               */
139    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
140      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
141      3, 3, 3,                       /* NOT upto I, minupto I, exact I         */
142      1, 1, 1, 3,                    /* NOT *+I, ++I, ?+I, upto+I              */
143    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
144    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
145    3, 3, 3,                       /* Type upto, minupto, exact              */    3, 3, 3,                       /* Type upto, minupto, exact              */
# Line 114  static const uschar coptable[] = { Line 151  static const uschar coptable[] = {
151    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
152    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
153    0,                             /* REF                                    */    0,                             /* REF                                    */
154      0,                             /* REFI                                   */
155    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
156    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
157    0,                             /* Alt                                    */    0,                             /* Alt                                    */
158    0,                             /* Ket                                    */    0,                             /* Ket                                    */
159    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
160    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
161      0,                             /* KetRpos                                */
162      0,                             /* Reverse                                */
163    0,                             /* Assert                                 */    0,                             /* Assert                                 */
164    0,                             /* Assert not                             */    0,                             /* Assert not                             */
165    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
166    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
167      0, 0,                          /* ONCE, ONCE_NC                          */
168      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
169      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
170      0, 0,                          /* CREF, NCREF                            */
171      0, 0,                          /* RREF, NRREF                            */
172      0,                             /* DEF                                    */
173      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
174      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
175      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
176      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
177      0, 0                           /* CLOSE, SKIPZERO                        */
178    };
179    
180    /* This table identifies those opcodes that inspect a character. It is used to
181    remember the fact that a character could have been inspected when the end of
182    the subject is reached. ***NOTE*** If the start of this table is modified, the
183    two tables that follow must also be modified. */
184    
185    static const uschar poptable[] = {
186      0,                             /* End                                    */
187      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
188      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
189      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
190      1, 1,                          /* \P, \p                                 */
191      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
192      1,                             /* \X                                     */
193      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
194      1,                             /* Char                                   */
195      1,                             /* Chari                                  */
196      1,                             /* not                                    */
197      1,                             /* noti                                   */
198      /* Positive single-char repeats                                          */
199      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
200      1, 1, 1,                       /* upto, minupto, exact                   */
201      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
202      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
203      1, 1, 1,                       /* upto I, minupto I, exact I             */
204      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
205      /* Negative single-char repeats - only for chars < 256                   */
206      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
207      1, 1, 1,                       /* NOT upto, minupto, exact               */
208      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
209      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
210      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
211      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
212      /* Positive type repeats                                                 */
213      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
214      1, 1, 1,                       /* Type upto, minupto, exact              */
215      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
216      /* Character class & ref repeats                                         */
217      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
218      1, 1,                          /* CRRANGE, CRMINRANGE                    */
219      1,                             /* CLASS                                  */
220      1,                             /* NCLASS                                 */
221      1,                             /* XCLASS - variable length               */
222      0,                             /* REF                                    */
223      0,                             /* REFI                                   */
224      0,                             /* RECURSE                                */
225      0,                             /* CALLOUT                                */
226      0,                             /* Alt                                    */
227      0,                             /* Ket                                    */
228      0,                             /* KetRmax                                */
229      0,                             /* KetRmin                                */
230      0,                             /* KetRpos                                */
231    0,                             /* Reverse                                */    0,                             /* Reverse                                */
232    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0,                             /* Assert                                 */
233    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0,                             /* Assert not                             */
234    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
235    0,                             /* RREF                                   */    0,                             /* Assert behind not                      */
236      0, 0,                          /* ONCE, ONCE_NC                          */
237      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
238      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
239      0, 0,                          /* CREF, NCREF                            */
240      0, 0,                          /* RREF, NRREF                            */
241    0,                             /* DEF                                    */    0,                             /* DEF                                    */
242    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
243    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
244    0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */    0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
245      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
246      0, 0                           /* CLOSE, SKIPZERO                        */
247  };  };
248    
249  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
# Line 163  these structures in, is a vector of ints Line 274  these structures in, is a vector of ints
274  typedef struct stateblock {  typedef struct stateblock {
275    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
276    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
277    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
278  } stateblock;  } stateblock;
279    
280  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
281    
282    
283  #ifdef DEBUG  #ifdef PCRE_DEBUG
284  /*************************************************  /*************************************************
285  *             Print character string             *  *             Print character string             *
286  *************************************************/  *************************************************/
# Line 219  Arguments: Line 329  Arguments:
329    offsetcount       size of same    offsetcount       size of same
330    workspace         vector of workspace    workspace         vector of workspace
331    wscount           size of same    wscount           size of same
   ims               the current ims flags  
332    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
333    
334  Returns:            > 0 => number of match offset pairs placed in offsets  Returns:            > 0 => number of match offset pairs placed in offsets
335                      = 0 => offsets overflowed; longest matches are present                      = 0 => offsets overflowed; longest matches are present
# Line 236  for the current character, one for the f Line 344  for the current character, one for the f
344      { \      { \
345      next_active_state->offset = (x); \      next_active_state->offset = (x); \
346      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
347      next_active_state++; \      next_active_state++; \
348      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
349      } \      } \
# Line 247  for the current character, one for the f Line 354  for the current character, one for the f
354      { \      { \
355      next_active_state->offset = (x); \      next_active_state->offset = (x); \
356      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
357      next_active_state->data   = (z); \      next_active_state->data   = (z); \
358      next_active_state++; \      next_active_state++; \
359      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 259  for the current character, one for the f Line 365  for the current character, one for the f
365      { \      { \
366      next_new_state->offset = (x); \      next_new_state->offset = (x); \
367      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
368      next_new_state++; \      next_new_state++; \
369      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
370      } \      } \
# Line 270  for the current character, one for the f Line 375  for the current character, one for the f
375      { \      { \
376      next_new_state->offset = (x); \      next_new_state->offset = (x); \
377      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
378      next_new_state->data   = (z); \      next_new_state->data   = (z); \
379      next_new_state++; \      next_new_state++; \
380      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 289  internal_dfa_exec( Line 393  internal_dfa_exec(
393    int offsetcount,    int offsetcount,
394    int *workspace,    int *workspace,
395    int wscount,    int wscount,
396    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
397  {  {
398  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
399  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
# Line 300  const uschar *ctypes, *lcc, *fcc; Line 402  const uschar *ctypes, *lcc, *fcc;
402  const uschar *ptr;  const uschar *ptr;
403  const uschar *end_code, *first_op;  const uschar *end_code, *first_op;
404    
405    dfa_recursion_info new_recursive;
406    
407  int active_count, new_count, match_count;  int active_count, new_count, match_count;
408    
409  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
# Line 323  wscount = (wscount - (wscount % (INTS_PE Line 427  wscount = (wscount - (wscount % (INTS_PE
427            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
428    
429  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
430    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
431    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
432    
433  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
434  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 337  next_new_state = new_states = active_sta Line 441  next_new_state = new_states = active_sta
441  new_count = 0;  new_count = 0;
442    
443  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
444    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
445        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
446    
447  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
448  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 386  if (*first_op == OP_REVERSE) Line 491  if (*first_op == OP_REVERSE)
491    
492      {      {
493      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
494        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
495      current_subject -= gone_back;      current_subject -= gone_back;
496      }      }
497    
498      /* Save the earliest consulted character */
499    
500      if (current_subject < md->start_used_ptr)
501        md->start_used_ptr = current_subject;
502    
503    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
504    
505    end_code = this_start_code;    end_code = this_start_code;
# Line 398  if (*first_op == OP_REVERSE) Line 508  if (*first_op == OP_REVERSE)
508      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
509      if (back <= gone_back)      if (back <= gone_back)
510        {        {
511        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
512        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
513        }        }
514      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 431  else Line 541  else
541    else    else
542      {      {
543      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
544        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
545            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
546            2:0);
547      do      do
548        {        {
549        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
550        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
551        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
552        }        }
# Line 454  for (;;) Line 566  for (;;)
566    int i, j;    int i, j;
567    int clen, dlen;    int clen, dlen;
568    unsigned int c, d;    unsigned int c, d;
569      int forced_fail = 0;
570      BOOL could_continue = FALSE;
571    
572    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
573    new state list. */    new state list. */
# Line 467  for (;;) Line 581  for (;;)
581    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
582    workspace[1] = active_count;    workspace[1] = active_count;
583    
584  #ifdef DEBUG  #ifdef PCRE_DEBUG
585    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
586    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars((uschar *)ptr, strlen((char *)ptr), stdout);
587    printf("\"\n");    printf("\"\n");
# Line 509  for (;;) Line 623  for (;;)
623    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
624      {      {
625      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
626        BOOL caseless = FALSE;
627      const uschar *code;      const uschar *code;
628      int state_offset = current_state->offset;      int state_offset = current_state->offset;
629      int count, codevalue;      int count, codevalue, rrc;
630    
631  #ifdef DEBUG  #ifdef PCRE_DEBUG
632      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
633      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
634        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
635          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
636  #endif  #endif
637    
     /* This variable is referred to implicity in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
638      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
639      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
640      been skipped". */      been skipped". */
# Line 543  for (;;) Line 654  for (;;)
654          }          }
655        }        }
656    
657      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
658        See the note at the head of this module about the possibility of improving
659        performance here. */
660    
661      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
662        {        {
# Line 560  for (;;) Line 673  for (;;)
673      code = start_code + state_offset;      code = start_code + state_offset;
674      codevalue = *code;      codevalue = *code;
675    
676        /* If this opcode inspects a character, but we are at the end of the
677        subject, remember the fact for use when testing for a partial match. */
678    
679        if (clen == 0 && poptable[codevalue] != 0)
680          could_continue = TRUE;
681    
682      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
683      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
684      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
# Line 606  for (;;) Line 725  for (;;)
725    
726      switch (codevalue)      switch (codevalue)
727        {        {
728    /* ========================================================================== */
729          /* These cases are never obeyed. This is a fudge that causes a compile-
730          time error if the vectors coptable or poptable, which are indexed by
731          opcode, are not the correct length. It seems to be the only way to do
732          such a check at compile time, as the sizeof() operator does not work
733          in the C preprocessor. */
734    
735          case OP_TABLE_LENGTH:
736          case OP_TABLE_LENGTH +
737            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
738             (sizeof(poptable) == OP_TABLE_LENGTH)):
739          break;
740    
741  /* ========================================================================== */  /* ========================================================================== */
742        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
743        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
744        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
745          subpattern, because the possessive subpattern repeats are always handled
746          using recursive calls. Thus, it never adds any new states.
747    
748          At the end of the (sub)pattern, unless we have an empty string and
749          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
750          start of the subject, save the match data, shifting up all previous
751        matches so we always have the longest first. */        matches so we always have the longest first. */
752    
753        case OP_KET:        case OP_KET:
754        case OP_KETRMIN:        case OP_KETRMIN:
755        case OP_KETRMAX:        case OP_KETRMAX:
756          case OP_KETRPOS:
757        if (code != end_code)        if (code != end_code)
758          {          {
759          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 624  for (;;) Line 762  for (;;)
762            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
763            }            }
764          }          }
765        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
766          {          {
767          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
768            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
769              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
770          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
771          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
772          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
773            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
774            offsets[0] = current_subject - start_subject;                match_count = 0;
775            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
776            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
777              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
778            }              {
779          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
780            {              offsets[1] = (int)(ptr - start_subject);
781            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
782              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], current_subject));
783              match_count, rlevel*2-2, SP));              }
784            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
785                {
786                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
787                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
788                  match_count, rlevel*2-2, SP));
789                return match_count;
790                }
791            }            }
792          }          }
793        break;        break;
# Line 655  for (;;) Line 799  for (;;)
799        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
800        case OP_ALT:        case OP_ALT:
801        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
802        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
803        break;        break;
804    
805        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 663  for (;;) Line 807  for (;;)
807        case OP_SBRA:        case OP_SBRA:
808        do        do
809          {          {
810          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
811          code += GET(code, 1);          code += GET(code, 1);
812          }          }
813        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 672  for (;;) Line 816  for (;;)
816        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
817        case OP_CBRA:        case OP_CBRA:
818        case OP_SCBRA:        case OP_SCBRA:
819        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE),  0);
820        code += GET(code, 1);        code += GET(code, 1);
821        while (*code == OP_ALT)        while (*code == OP_ALT)
822          {          {
823          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
824          code += GET(code, 1);          code += GET(code, 1);
825          }          }
826        break;        break;
# Line 687  for (;;) Line 831  for (;;)
831        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
832        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
833        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
834        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
835        break;        break;
836    
837        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
838        case OP_SKIPZERO:        case OP_SKIPZERO:
839        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
840        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
841        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
842        break;        break;
843    
844        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
845        case OP_CIRC:        case OP_CIRC:
846        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
847          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
848        break;        break;
849    
850        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
851        case OP_EOD:        case OP_CIRCM:
852        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
853              (ptr != end_subject && WAS_NEWLINE(ptr)))
854            { ADD_ACTIVE(state_offset + 1, 0); }
855        break;        break;
856    
857        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
858        case OP_OPT:        case OP_EOD:
859        ims = code[1];        if (ptr >= end_subject)
860        ADD_ACTIVE(state_offset + 2, 0);          {
861            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
862              could_continue = TRUE;
863            else { ADD_ACTIVE(state_offset + 1, 0); }
864            }
865        break;        break;
866    
867        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 748  for (;;) Line 895  for (;;)
895    
896        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
897        case OP_EODN:        case OP_EODN:
898        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
899            could_continue = TRUE;
900          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
901          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
902        break;        break;
903    
# Line 756  for (;;) Line 905  for (;;)
905        case OP_DOLL:        case OP_DOLL:
906        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
907          {          {
908          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
909              (IS_NEWLINE(ptr) &&            could_continue = TRUE;
910                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)          else if (clen == 0 ||
911                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
912                   (ptr == end_subject - md->nllen)
913              ))              ))
914            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
915          }          }
916        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        break;
917    
918          /*-----------------------------------------------------------------*/
919          case OP_DOLLM:
920          if ((md->moptions & PCRE_NOTEOL) == 0)
921            {
922            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
923              could_continue = TRUE;
924            else if (clen == 0 ||
925                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
926              { ADD_ACTIVE(state_offset + 1, 0); }
927            }
928          else if (IS_NEWLINE(ptr))
929          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
930        break;        break;
931    
# Line 794  for (;;) Line 957  for (;;)
957          if (ptr > start_subject)          if (ptr > start_subject)
958            {            {
959            const uschar *temp = ptr - 1;            const uschar *temp = ptr - 1;
960              if (temp < md->start_used_ptr) md->start_used_ptr = temp;
961  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
962            if (utf8) BACKCHAR(temp);            if (utf8) BACKCHAR(temp);
963  #endif  #endif
964            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
965    #ifdef SUPPORT_UCP
966              if ((md->poptions & PCRE_UCP) != 0)
967                {
968                if (d == '_') left_word = TRUE; else
969                  {
970                  int cat = UCD_CATEGORY(d);
971                  left_word = (cat == ucp_L || cat == ucp_N);
972                  }
973                }
974              else
975    #endif
976            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
977            }            }
978          else left_word = 0;          else left_word = FALSE;
979    
980          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
981            else right_word = 0;            {
982    #ifdef SUPPORT_UCP
983              if ((md->poptions & PCRE_UCP) != 0)
984                {
985                if (c == '_') right_word = TRUE; else
986                  {
987                  int cat = UCD_CATEGORY(c);
988                  right_word = (cat == ucp_L || cat == ucp_N);
989                  }
990                }
991              else
992    #endif
993              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
994              }
995            else right_word = FALSE;
996    
997          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
998            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 830  for (;;) Line 1019  for (;;)
1019            break;            break;
1020    
1021            case PT_LAMP:            case PT_LAMP:
1022            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1023                   prop->chartype == ucp_Lt;
1024            break;            break;
1025    
1026            case PT_GC:            case PT_GC:
# Line 845  for (;;) Line 1035  for (;;)
1035            OK = prop->script == code[2];            OK = prop->script == code[2];
1036            break;            break;
1037    
1038              /* These are specials for combination cases. */
1039    
1040              case PT_ALNUM:
1041              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1042                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1043              break;
1044    
1045              case PT_SPACE:    /* Perl space */
1046              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1047                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1048              break;
1049    
1050              case PT_PXSPACE:  /* POSIX space */
1051              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1052                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1053                   c == CHAR_FF || c == CHAR_CR;
1054              break;
1055    
1056              case PT_WORD:
1057              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1058                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1059                   c == CHAR_UNDERSCORE;
1060              break;
1061    
1062            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1063    
1064            default:            default:
# Line 999  for (;;) Line 1213  for (;;)
1213            break;            break;
1214    
1215            case PT_LAMP:            case PT_LAMP:
1216            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1217                prop->chartype == ucp_Lt;
1218            break;            break;
1219    
1220            case PT_GC:            case PT_GC:
# Line 1014  for (;;) Line 1229  for (;;)
1229            OK = prop->script == code[3];            OK = prop->script == code[3];
1230            break;            break;
1231    
1232              /* These are specials for combination cases. */
1233    
1234              case PT_ALNUM:
1235              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1236                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1237              break;
1238    
1239              case PT_SPACE:    /* Perl space */
1240              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1241                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1242              break;
1243    
1244              case PT_PXSPACE:  /* POSIX space */
1245              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1246                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1247                   c == CHAR_FF || c == CHAR_CR;
1248              break;
1249    
1250              case PT_WORD:
1251              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1252                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1253                   c == CHAR_UNDERSCORE;
1254              break;
1255    
1256            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1257    
1258            default:            default:
# Line 1221  for (;;) Line 1460  for (;;)
1460            break;            break;
1461    
1462            case PT_LAMP:            case PT_LAMP:
1463            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1464                prop->chartype == ucp_Lt;
1465            break;            break;
1466    
1467            case PT_GC:            case PT_GC:
# Line 1236  for (;;) Line 1476  for (;;)
1476            OK = prop->script == code[3];            OK = prop->script == code[3];
1477            break;            break;
1478    
1479              /* These are specials for combination cases. */
1480    
1481              case PT_ALNUM:
1482              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1483                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1484              break;
1485    
1486              case PT_SPACE:    /* Perl space */
1487              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1488                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1489              break;
1490    
1491              case PT_PXSPACE:  /* POSIX space */
1492              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1493                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1494                   c == CHAR_FF || c == CHAR_CR;
1495              break;
1496    
1497              case PT_WORD:
1498              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1499                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1500                   c == CHAR_UNDERSCORE;
1501              break;
1502    
1503            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1504    
1505            default:            default:
# Line 1468  for (;;) Line 1732  for (;;)
1732            break;            break;
1733    
1734            case PT_LAMP:            case PT_LAMP:
1735            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1736                prop->chartype == ucp_Lt;
1737            break;            break;
1738    
1739            case PT_GC:            case PT_GC:
# Line 1483  for (;;) Line 1748  for (;;)
1748            OK = prop->script == code[5];            OK = prop->script == code[5];
1749            break;            break;
1750    
1751              /* These are specials for combination cases. */
1752    
1753              case PT_ALNUM:
1754              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1755                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1756              break;
1757    
1758              case PT_SPACE:    /* Perl space */
1759              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1760                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1761              break;
1762    
1763              case PT_PXSPACE:  /* POSIX space */
1764              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1765                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1766                   c == CHAR_FF || c == CHAR_CR;
1767              break;
1768    
1769              case PT_WORD:
1770              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1771                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1772                   c == CHAR_UNDERSCORE;
1773              break;
1774    
1775            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1776    
1777            default:            default:
# Line 1692  for (;;) Line 1981  for (;;)
1981        break;        break;
1982    
1983        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1984        case OP_CHARNC:        case OP_CHARI:
1985        if (clen == 0) break;        if (clen == 0) break;
1986    
1987  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1878  for (;;) Line 2167  for (;;)
2167        break;        break;
2168    
2169        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2170        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character casefully. This is only used for
2171        characters, that is, we know that d < 256. The character we are        one-byte characters, that is, we know that d < 256. The character we are
2172        checking (c) can be multibyte. */        checking (c) can be multibyte. */
2173    
2174        case OP_NOT:        case OP_NOT:
2175        if (clen > 0)        if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2176          {        break;
2177          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;  
2178          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }        /*-----------------------------------------------------------------*/
2179          }        /* Match a negated single character caselessly. This is only used for
2180          one-byte characters, that is, we know that d < 256. The character we are
2181          checking (c) can be multibyte. */
2182    
2183          case OP_NOTI:
2184          if (clen > 0 && c != d && c != fcc[d])
2185            { ADD_NEW(state_offset + dlen + 1, 0); }
2186        break;        break;
2187    
2188        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2189          case OP_PLUSI:
2190          case OP_MINPLUSI:
2191          case OP_POSPLUSI:
2192          case OP_NOTPLUSI:
2193          case OP_NOTMINPLUSI:
2194          case OP_NOTPOSPLUSI:
2195          caseless = TRUE;
2196          codevalue -= OP_STARI - OP_STAR;
2197    
2198          /* Fall through */
2199        case OP_PLUS:        case OP_PLUS:
2200        case OP_MINPLUS:        case OP_MINPLUS:
2201        case OP_POSPLUS:        case OP_POSPLUS:
# Line 1902  for (;;) Line 2207  for (;;)
2207        if (clen > 0)        if (clen > 0)
2208          {          {
2209          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2210          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2211            {            {
2212  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2213            if (utf8 && d >= 128)            if (utf8 && d >= 128)
# Line 1930  for (;;) Line 2235  for (;;)
2235        break;        break;
2236    
2237        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2238          case OP_QUERYI:
2239          case OP_MINQUERYI:
2240          case OP_POSQUERYI:
2241          case OP_NOTQUERYI:
2242          case OP_NOTMINQUERYI:
2243          case OP_NOTPOSQUERYI:
2244          caseless = TRUE;
2245          codevalue -= OP_STARI - OP_STAR;
2246          /* Fall through */
2247        case OP_QUERY:        case OP_QUERY:
2248        case OP_MINQUERY:        case OP_MINQUERY:
2249        case OP_POSQUERY:        case OP_POSQUERY:
# Line 1940  for (;;) Line 2254  for (;;)
2254        if (clen > 0)        if (clen > 0)
2255          {          {
2256          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2257          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2258            {            {
2259  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2260            if (utf8 && d >= 128)            if (utf8 && d >= 128)
# Line 1966  for (;;) Line 2280  for (;;)
2280        break;        break;
2281    
2282        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2283          case OP_STARI:
2284          case OP_MINSTARI:
2285          case OP_POSSTARI:
2286          case OP_NOTSTARI:
2287          case OP_NOTMINSTARI:
2288          case OP_NOTPOSSTARI:
2289          caseless = TRUE;
2290          codevalue -= OP_STARI - OP_STAR;
2291          /* Fall through */
2292        case OP_STAR:        case OP_STAR:
2293        case OP_MINSTAR:        case OP_MINSTAR:
2294        case OP_POSSTAR:        case OP_POSSTAR:
# Line 1976  for (;;) Line 2299  for (;;)
2299        if (clen > 0)        if (clen > 0)
2300          {          {
2301          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2302          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2303            {            {
2304  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2305            if (utf8 && d >= 128)            if (utf8 && d >= 128)
# Line 2002  for (;;) Line 2325  for (;;)
2325        break;        break;
2326    
2327        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2328          case OP_EXACTI:
2329          case OP_NOTEXACTI:
2330          caseless = TRUE;
2331          codevalue -= OP_STARI - OP_STAR;
2332          /* Fall through */
2333        case OP_EXACT:        case OP_EXACT:
2334        case OP_NOTEXACT:        case OP_NOTEXACT:
2335        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2336        if (clen > 0)        if (clen > 0)
2337          {          {
2338          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2339          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2340            {            {
2341  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2342            if (utf8 && d >= 128)            if (utf8 && d >= 128)
# Line 2032  for (;;) Line 2360  for (;;)
2360        break;        break;
2361    
2362        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2363          case OP_UPTOI:
2364          case OP_MINUPTOI:
2365          case OP_POSUPTOI:
2366          case OP_NOTUPTOI:
2367          case OP_NOTMINUPTOI:
2368          case OP_NOTPOSUPTOI:
2369          caseless = TRUE;
2370          codevalue -= OP_STARI - OP_STAR;
2371          /* Fall through */
2372        case OP_UPTO:        case OP_UPTO:
2373        case OP_MINUPTO:        case OP_MINUPTO:
2374        case OP_POSUPTO:        case OP_POSUPTO:
# Line 2043  for (;;) Line 2380  for (;;)
2380        if (clen > 0)        if (clen > 0)
2381          {          {
2382          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2383          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2384            {            {
2385  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2386            if (utf8 && d >= 128)            if (utf8 && d >= 128)
# Line 2110  for (;;) Line 2447  for (;;)
2447          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2448          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2449    
2450          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2451    
2452          switch (*ecode)          switch (*ecode)
2453            {            {
# Line 2157  for (;;) Line 2494  for (;;)
2494    
2495  /* ========================================================================== */  /* ========================================================================== */
2496        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2497        to use recursion in order to handle them. The "always failing" assersion        to use recursion in order to handle them. The "always failing" assertion
2498        (?!) is optimised when compiling to OP_FAIL, so we have to support that,        (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2499        though the other "backtracking verbs" are not supported. */        though the other "backtracking verbs" are not supported. */
2500    
2501        case OP_FAIL:        case OP_FAIL:
2502          forced_fail++;    /* Count FAILs for multiple states */
2503        break;        break;
2504    
2505        case OP_ASSERT:        case OP_ASSERT:
# Line 2180  for (;;) Line 2518  for (;;)
2518            md,                                   /* static match data */            md,                                   /* static match data */
2519            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2520            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2521            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2522            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2523            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2524            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2525            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2526            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2527    
2528            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2529          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2530              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2531          }          }
2532        break;        break;
2533    
# Line 2200  for (;;) Line 2537  for (;;)
2537          {          {
2538          int local_offsets[1000];          int local_offsets[1000];
2539          int local_workspace[1000];          int local_workspace[1000];
2540          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2541            int condcode;
2542    
2543            /* Because of the way auto-callout works during compile, a callout item
2544            is inserted between OP_COND and an assertion condition. This does not
2545            happen for the other conditions. */
2546    
2547            if (code[LINK_SIZE+1] == OP_CALLOUT)
2548              {
2549              rrc = 0;
2550              if (pcre_callout != NULL)
2551                {
2552                pcre_callout_block cb;
2553                cb.version          = 1;   /* Version 1 of the callout block */
2554                cb.callout_number   = code[LINK_SIZE+2];
2555                cb.offset_vector    = offsets;
2556                cb.subject          = (PCRE_SPTR)start_subject;
2557                cb.subject_length   = (int)(end_subject - start_subject);
2558                cb.start_match      = (int)(current_subject - start_subject);
2559                cb.current_position = (int)(ptr - start_subject);
2560                cb.pattern_position = GET(code, LINK_SIZE + 3);
2561                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2562                cb.capture_top      = 1;
2563                cb.capture_last     = -1;
2564                cb.callout_data     = md->callout_data;
2565                cb.mark             = NULL;   /* No (*MARK) support */
2566                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2567                }
2568              if (rrc > 0) break;                      /* Fail this thread */
2569              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2570              }
2571    
2572            condcode = code[LINK_SIZE+1];
2573    
2574          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2575    
2576          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2577              return PCRE_ERROR_DFA_UCOND;
2578    
2579          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2580    
2581          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2582            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2583    
2584          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2585          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2586          recursed groups. */          recursed groups. */
2587    
2588          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2589            {            {
2590            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2591            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2592            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (md->recursive != NULL)
2593              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2594              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2595            }            }
2596    
2597          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2239  for (;;) Line 2608  for (;;)
2608              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2609              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2610              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2611              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2612              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2613              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2614              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2615              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2616              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2617    
2618              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2619            if ((rc >= 0) ==            if ((rc >= 0) ==
2620                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2621              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2622            else            else
2623              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2624            }            }
2625          }          }
2626        break;        break;
# Line 2260  for (;;) Line 2628  for (;;)
2628        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2629        case OP_RECURSE:        case OP_RECURSE:
2630          {          {
2631            dfa_recursion_info *ri;
2632          int local_offsets[1000];          int local_offsets[1000];
2633          int local_workspace[1000];          int local_workspace[1000];
2634            const uschar *callpat = start_code + GET(code, 1);
2635            int recno = (callpat == md->start_code)? 0 :
2636              GET2(callpat, 1 + LINK_SIZE);
2637          int rc;          int rc;
2638    
2639          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2640            recursing + 1));  
2641            /* Check for repeating a recursion without advancing the subject
2642            pointer. This should catch convoluted mutual recursions. (Some simple
2643            cases are caught at compile time.) */
2644    
2645            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2646              if (recno == ri->group_num && ptr == ri->subject_position)
2647                return PCRE_ERROR_RECURSELOOP;
2648    
2649            /* Remember this recursion and where we started it so as to
2650            catch infinite loops. */
2651    
2652            new_recursive.group_num = recno;
2653            new_recursive.subject_position = ptr;
2654            new_recursive.prevrec = md->recursive;
2655            md->recursive = &new_recursive;
2656    
2657          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2658            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2659            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2660            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2661            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2662            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2663            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2664            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2665            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2666            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing + 1);                       /* regex recurse level */  
2667    
2668          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2669            recursing + 1, rc));  
2670            DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2671              rc));
2672    
2673          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2674    
# Line 2314  for (;;) Line 2701  for (;;)
2701        break;        break;
2702    
2703        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2704          case OP_BRAPOS:
2705          case OP_SBRAPOS:
2706          case OP_CBRAPOS:
2707          case OP_SCBRAPOS:
2708          case OP_BRAPOSZERO:
2709            {
2710            int charcount, matched_count;
2711            const uschar *local_ptr = ptr;
2712            BOOL allow_zero;
2713    
2714            if (codevalue == OP_BRAPOSZERO)
2715              {
2716              allow_zero = TRUE;
2717              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2718              }
2719            else allow_zero = FALSE;
2720    
2721            /* Loop to match the subpattern as many times as possible as if it were
2722            a complete pattern. */
2723    
2724            for (matched_count = 0;; matched_count++)
2725              {
2726              int local_offsets[2];
2727              int local_workspace[1000];
2728    
2729              int rc = internal_dfa_exec(
2730                md,                                   /* fixed match data */
2731                code,                                 /* this subexpression's code */
2732                local_ptr,                            /* where we currently are */
2733                (int)(ptr - start_subject),           /* start offset */
2734                local_offsets,                        /* offset vector */
2735                sizeof(local_offsets)/sizeof(int),    /* size of same */
2736                local_workspace,                      /* workspace vector */
2737                sizeof(local_workspace)/sizeof(int),  /* size of same */
2738                rlevel);                              /* function recursion level */
2739    
2740              /* Failed to match */
2741    
2742              if (rc < 0)
2743                {
2744                if (rc != PCRE_ERROR_NOMATCH) return rc;
2745                break;
2746                }
2747    
2748              /* Matched: break the loop if zero characters matched. */
2749    
2750              charcount = local_offsets[1] - local_offsets[0];
2751              if (charcount == 0) break;
2752              local_ptr += charcount;    /* Advance temporary position ptr */
2753              }
2754    
2755            /* At this point we have matched the subpattern matched_count
2756            times, and local_ptr is pointing to the character after the end of the
2757            last match. */
2758    
2759            if (matched_count > 0 || allow_zero)
2760              {
2761              const uschar *end_subpattern = code;
2762              int next_state_offset;
2763    
2764              do { end_subpattern += GET(end_subpattern, 1); }
2765                while (*end_subpattern == OP_ALT);
2766              next_state_offset =
2767                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2768    
2769              /* Optimization: if there are no more active states, and there
2770              are no new states yet set up, then skip over the subject string
2771              right here, to save looping. Otherwise, set up the new state to swing
2772              into action when the end of the matched substring is reached. */
2773    
2774              if (i + 1 >= active_count && new_count == 0)
2775                {
2776                ptr = local_ptr;
2777                clen = 0;
2778                ADD_NEW(next_state_offset, 0);
2779                }
2780              else
2781                {
2782                const uschar *p = ptr;
2783                const uschar *pp = local_ptr;
2784                charcount = (int)(pp - p);
2785                while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2786                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2787                }
2788              }
2789            }
2790          break;
2791    
2792          /*-----------------------------------------------------------------*/
2793        case OP_ONCE:        case OP_ONCE:
2794          case OP_ONCE_NC:
2795          {          {
2796          int local_offsets[2];          int local_offsets[2];
2797          int local_workspace[1000];          int local_workspace[1000];
# Line 2323  for (;;) Line 2800  for (;;)
2800            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2801            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2802            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2803            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2804            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2805            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2806            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2807            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2808            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2809    
2810          if (rc >= 0)          if (rc >= 0)
2811            {            {
# Line 2340  for (;;) Line 2815  for (;;)
2815    
2816            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2817              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2818            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2819                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2820    
2821            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2822            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 2348  for (;;) Line 2824  for (;;)
2824    
2825            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2826                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2827              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2828    
2829            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2830            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 2363  for (;;) Line 2839  for (;;)
2839            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2840            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2841            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2842            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2843    
2844            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2845              {              {
# Line 2393  for (;;) Line 2869  for (;;)
2869              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2870                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2871              }              }
   
2872            }            }
2873          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2874          }          }
# Line 2404  for (;;) Line 2879  for (;;)
2879        /* Handle callouts */        /* Handle callouts */
2880    
2881        case OP_CALLOUT:        case OP_CALLOUT:
2882          rrc = 0;
2883        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2884          {          {
         int rrc;  
2885          pcre_callout_block cb;          pcre_callout_block cb;
2886          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2887          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2888          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2889          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2890          cb.subject_length   = end_subject - start_subject;          cb.subject_length   = (int)(end_subject - start_subject);
2891          cb.start_match      = current_subject - start_subject;          cb.start_match      = (int)(current_subject - start_subject);
2892          cb.current_position = ptr - start_subject;          cb.current_position = (int)(ptr - start_subject);
2893          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
2894          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
2895          cb.capture_top      = 1;          cb.capture_top      = 1;
2896          cb.capture_last     = -1;          cb.capture_last     = -1;
2897          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2898            cb.mark             = NULL;   /* No (*MARK) support */
2899          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2900          }          }
2901          if (rrc == 0)
2902            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2903        break;        break;
2904    
2905    
# Line 2438  for (;;) Line 2915  for (;;)
2915    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2916    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2917    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2918    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
2919    
2920      The "forced_fail" variable counts the number of (*F) encountered for the
2921      character. If it is equal to the original active_count (saved in
2922      workspace[1]) it means that (*F) was found on every active state. In this
2923      case we don't want to give a partial match.
2924    
2925      The "could_continue" variable is true if a state could have continued but
2926      for the fact that the end of the subject was reached. */
2927    
2928    if (new_count <= 0)    if (new_count <= 0)
2929      {      {
2930      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2931          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on */
2932          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2933            (                                            /* either... */
2934            (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2935            ||                                           /* or... */
2936            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2937             match_count < 0)                            /* no matches */
2938            ) &&                                         /* And... */
2939          ptr >= end_subject &&                  /* Reached end of subject */          ptr >= end_subject &&                  /* Reached end of subject */
2940          ptr > current_subject)                 /* Matched non-empty string */          ptr > md->start_used_ptr)              /* Inspected non-empty string */
2941        {        {
2942        if (offsetcount >= 2)        if (offsetcount >= 2)
2943          {          {
2944          offsets[0] = current_subject - start_subject;          offsets[0] = (int)(md->start_used_ptr - start_subject);
2945          offsets[1] = end_subject - start_subject;          offsets[1] = (int)(end_subject - start_subject);
2946          }          }
2947        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
2948        }        }
# Line 2536  if (re == NULL || subject == NULL || wor Line 3027  if (re == NULL || subject == NULL || wor
3027     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3028  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3029  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3030    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3031    
3032  /* We need to find the pointer to any study data before we test for byte  /* We need to find the pointer to any study data before we test for byte
3033  flipping, so we scan the extra_data block first. This may set two fields in the  flipping, so we scan the extra_data block first. This may set two fields in the
# Line 2592  md->start_code = (const uschar *)argumen Line 3084  md->start_code = (const uschar *)argumen
3084      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3085  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const unsigned char *)subject;
3086  md->end_subject = end_subject;  md->end_subject = end_subject;
3087    md->start_offset = start_offset;
3088  md->moptions = options;  md->moptions = options;
3089  md->poptions = re->options;  md->poptions = re->options;
3090    
# Line 2614  switch ((((options & PCRE_NEWLINE_BITS) Line 3107  switch ((((options & PCRE_NEWLINE_BITS)
3107           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
3108    {    {
3109    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
3110    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3111    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3112    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
3113         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3114    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
3115    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3116    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2653  back the character offset. */ Line 3146  back the character offset. */
3146  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3147  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3148    {    {
3149    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3150      return PCRE_ERROR_BADUTF8;    int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
3151    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3152      {      {
3153      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3154        {        {
3155        tb &= 0xc0;        offsets[0] = erroroffset;
3156        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3157        }        }
3158        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3159          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3160      }      }
3161      if (start_offset > 0 && start_offset < length &&
3162            (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
3163        return PCRE_ERROR_BADUTF8_OFFSET;
3164    }    }
3165  #endif  #endif
3166    
# Line 2696  if (!anchored) Line 3193  if (!anchored)
3193      }      }
3194    else    else
3195      {      {
3196      if (startline && study != NULL &&      if (!startline && study != NULL &&
3197           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3198        start_bits = study->start_bits;        start_bits = study->start_bits;
3199      }      }
3200    }    }
# Line 2713  if ((re->flags & PCRE_REQCHSET) != 0) Line 3210  if ((re->flags & PCRE_REQCHSET) != 0)
3210    }    }
3211    
3212  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
3213  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
3214  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
3215    
3216  for (;;)  for (;;)
3217    {    {
# Line 2725  for (;;) Line 3221  for (;;)
3221      {      {
3222      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
3223    
3224      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
3225      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
3226      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
3227      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
3228    
3229      if (firstline)      if (firstline)
3230        {        {
3231        USPTR t = current_subject;        USPTR t = current_subject;
3232  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3233        if (utf8)        if (utf8)
3234          {          {
3235          while (t < md->end_subject && !IS_NEWLINE(t))          while (t < md->end_subject && !IS_NEWLINE(t))
3236            {            {
3237            t++;            t++;
3238            while (t < end_subject && (*t & 0xc0) == 0x80) t++;            while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3239            }            }
3240          }          }
3241        else        else
3242  #endif  #endif
3243        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3244        end_subject = t;        end_subject = t;
3245        }        }
3246    
3247      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
3248        starting point is not found. However, there is an option that disables
3249        these, for testing and for ensuring that all callouts do actually occur.
3250        The option can be set in the regex by (*NO_START_OPT) or passed in
3251        match-time options. */
3252    
3253        if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3254        {        {
3255        if (first_byte_caseless)        /* Advance to a known first byte. */
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
3256    
3257      /* Or to just after a linebreak for a multiline match if possible */        if (first_byte >= 0)
3258            {
3259            if (first_byte_caseless)
3260              while (current_subject < end_subject &&
3261                     lcc[*current_subject] != first_byte)
3262                current_subject++;
3263            else
3264              while (current_subject < end_subject &&
3265                     *current_subject != first_byte)
3266                current_subject++;
3267            }
3268    
3269      else if (startline)        /* Or to just after a linebreak for a multiline match if possible */
3270        {  
3271        if (current_subject > md->start_subject + start_offset)        else if (startline)
3272          {          {
3273  #ifdef SUPPORT_UTF8          if (current_subject > md->start_subject + start_offset)
         if (utf8)  
3274            {            {
3275            while (current_subject < end_subject && !WAS_NEWLINE(current_subject))  #ifdef SUPPORT_UTF8
3276              if (utf8)
3277              {              {
3278              current_subject++;              while (current_subject < end_subject &&
3279              while(current_subject < end_subject &&                     !WAS_NEWLINE(current_subject))
3280                    (*current_subject & 0xc0) == 0x80)                {
3281                current_subject++;                current_subject++;
3282              }                while(current_subject < end_subject &&
3283                        (*current_subject & 0xc0) == 0x80)
3284                    current_subject++;
3285                  }
3286                }
3287              else
3288    #endif
3289              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3290                current_subject++;
3291    
3292              /* If we have just passed a CR and the newline option is ANY or
3293              ANYCRLF, and we are now at a LF, advance the match position by one
3294              more character. */
3295    
3296              if (current_subject[-1] == CHAR_CR &&
3297                   (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3298                   current_subject < end_subject &&
3299                   *current_subject == CHAR_NL)
3300                current_subject++;
3301            }            }
         else  
 #endif  
         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))  
           current_subject++;  
   
         /* If we have just passed a CR and the newline option is ANY or  
         ANYCRLF, and we are now at a LF, advance the match position by one more  
         character. */  
   
         if (current_subject[-1] == '\r' &&  
              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&  
              current_subject < end_subject &&  
              *current_subject == '\n')  
           current_subject++;  
3302          }          }
       }  
3303    
3304      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
3305    
3306      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
3307          {          {
3308          register unsigned int c = *current_subject;          while (current_subject < end_subject)
3309          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
3310              register unsigned int c = *current_subject;
3311              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3312                {
3313                current_subject++;
3314    #ifdef SUPPORT_UTF8
3315                if (utf8)
3316                  while(current_subject < end_subject &&
3317                        (*current_subject & 0xc0) == 0x80) current_subject++;
3318    #endif
3319                }
3320            else break;            else break;
3321              }
3322          }          }
3323        }        }
3324    
3325      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3326    
3327      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
   
   /* If req_byte is set, we know that that character must appear in the subject  
   for the match to succeed. If the first character is set, req_byte must be  
   later in the subject; otherwise the test starts at the match point. This  
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested.  
   */  
   
   if (req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
3328    
3329      /* We don't need to repeat the search if we haven't yet reached the      /* The following two optimizations are disabled for partial matching or if
3330      place we found it at last time. */      disabling is explicitly requested (and of course, by the test above, this
3331        code is not obeyed when restarting after a partial match). */
3332    
3333      if (p > req_byte_ptr)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3334            (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3335        {        {
3336        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3337          {        is a lower bound; no actual string of that length may actually match the
3338          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3339            {        bytes to avoid spending too much time in this optimization. */
3340            register int pp = *p++;  
3341            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3342            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3343          }          return PCRE_ERROR_NOMATCH;
3344        else  
3345          /* If req_byte is set, we know that that character must appear in the
3346          subject for the match to succeed. If the first character is set, req_byte
3347          must be later in the subject; otherwise the test starts at the match
3348          point. This optimization can save a huge amount of work in patterns with
3349          nested unlimited repeats that aren't going to match. Writing separate
3350          code for cased/caseless versions makes it go faster, as does using an
3351          autoincrement and backing off on a match.
3352    
3353          HOWEVER: when the subject string is very, very long, searching to its end
3354          can take a long time, and give bad performance on quite ordinary
3355          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3356          string... so we don't do this when the string is sufficiently long. */
3357    
3358          if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3359          {          {
3360          while (p < end_subject)          register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3361    
3362            /* We don't need to repeat the search if we haven't yet reached the
3363            place we found it at last time. */
3364    
3365            if (p > req_byte_ptr)
3366            {            {
3367            if (*p++ == req_byte) { p--; break; }            if (req_byte_caseless)
3368            }              {
3369          }              while (p < end_subject)
3370                  {
3371                  register int pp = *p++;
3372                  if (pp == req_byte || pp == req_byte2) { p--; break; }
3373                  }
3374                }
3375              else
3376                {
3377                while (p < end_subject)
3378                  {
3379                  if (*p++ == req_byte) { p--; break; }
3380                  }
3381                }
3382    
3383        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3384        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3385    
3386        if (p >= end_subject) break;            if (p >= end_subject) break;
3387    
3388        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3389        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3390        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3391    
3392        req_byte_ptr = p;            req_byte_ptr = p;
3393              }
3394            }
3395        }        }
3396      }      }   /* End of optimizations that are done when not restarting */
3397    
3398    /* OK, now we can do the business */    /* OK, now we can do the business */
3399    
3400      md->start_used_ptr = current_subject;
3401      md->recursive = NULL;
3402    
3403    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3404      md,                                /* fixed match data */      md,                                /* fixed match data */
3405      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2878  for (;;) Line 3409  for (;;)
3409      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3410      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3411      wscount,                           /* size of same */      wscount,                           /* size of same */
3412      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3413    
3414    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3415    on only if not anchored. */    on only if not anchored. */
# Line 2903  for (;;) Line 3432  for (;;)
3432    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
3433    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
3434    
3435    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
3436        current_subject < end_subject &&        current_subject < end_subject &&
3437        *current_subject == '\n' &&        *current_subject == CHAR_NL &&
3438        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
3439          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
3440           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.365  
changed lines
  Added in v.779

  ViewVC Help
Powered by ViewVC 1.1.5