/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 85 by nigel, Sat Feb 24 21:41:13 2007 UTC revision 926 by ph10, Wed Feb 22 15:01:32 2012 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  pattern matching using an NFA algorithm, trying to mimic Perl as closely as  pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43  possible. There are also some static supporting functions. */  possible. There are also some static supporting functions. */
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49    #define NLBLOCK md             /* Block containing newline information */
50    #define PSSTART start_subject  /* Field containing processed string start */
51    #define PSEND   end_subject    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    /* Undefine some potentially clashing cpp symbols */
56    
57  /* Structure for building a chain of data that actually lives on the  #undef min
58  stack, for holding the values of the subject pointer at the start of each  #undef max
 subpattern, so as to detect when an empty string has been matched by a  
 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks  
 are on the heap, not on the stack. */  
   
 typedef struct eptrblock {  
   struct eptrblock *epb_prev;  
   const uschar *epb_saved_eptr;  
 } eptrblock;  
59    
60  /* Flag bits for the match() function */  /* Values for setting in md->match_function_type to indicate two special types
61    of call to match(). We do it this way to save on using another stack variable,
62    as stack usage is to be discouraged. */
63    
64  #define match_condassert   0x01    /* Called to check a condition assertion */  #define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
65  #define match_isgroup      0x02    /* Set if start of bracketed group */  #define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */
66    
67  /* Non-error returns from the match() function. Error returns are externally  /* Non-error returns from the match() function. Error returns are externally
68  defined PCRE_ERROR_xxx codes, which are all negative. */  defined PCRE_ERROR_xxx codes, which are all negative. */
# Line 68  defined PCRE_ERROR_xxx codes, which are Line 70  defined PCRE_ERROR_xxx codes, which are
70  #define MATCH_MATCH        1  #define MATCH_MATCH        1
71  #define MATCH_NOMATCH      0  #define MATCH_NOMATCH      0
72    
73    /* Special internal returns from the match() function. Make them sufficiently
74    negative to avoid the external error codes. */
75    
76    #define MATCH_ACCEPT       (-999)
77    #define MATCH_COMMIT       (-998)
78    #define MATCH_KETRPOS      (-997)
79    #define MATCH_ONCE         (-996)
80    #define MATCH_PRUNE        (-995)
81    #define MATCH_SKIP         (-994)
82    #define MATCH_SKIP_ARG     (-993)
83    #define MATCH_THEN         (-992)
84    
85  /* Maximum number of ints of offset to save on the stack for recursive calls.  /* Maximum number of ints of offset to save on the stack for recursive calls.
86  If the offset vector is bigger, malloc is used. This should be a multiple of 3,  If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87  because the offset vector is always a multiple of 3 long. */  because the offset vector is always a multiple of 3 long. */
# Line 81  static const char rep_max[] = { 0, 0, 0, Line 95  static const char rep_max[] = { 0, 0, 0,
95    
96    
97    
98  #ifdef DEBUG  #ifdef PCRE_DEBUG
99  /*************************************************  /*************************************************
100  *        Debugging function to print chars       *  *        Debugging function to print chars       *
101  *************************************************/  *************************************************/
# Line 99  Returns:     nothing Line 113  Returns:     nothing
113  */  */
114    
115  static void  static void
116  pchars(const uschar *p, int length, BOOL is_subject, match_data *md)  pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117  {  {
118  int c;  unsigned int c;
119  if (is_subject && length > md->end_subject - p) length = md->end_subject - p;  if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120  while (length-- > 0)  while (length-- > 0)
121    if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);    if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
# Line 114  while (length-- > 0) Line 128  while (length-- > 0)
128  *          Match a back-reference                *  *          Match a back-reference                *
129  *************************************************/  *************************************************/
130    
131  /* If a back reference hasn't been set, the length that is passed is greater  /* Normally, if a back reference hasn't been set, the length that is passed is
132  than the number of characters left in the string, so the match fails.  negative, so the match always fails. However, in JavaScript compatibility mode,
133    the length passed is zero. Note that in caseless UTF-8 mode, the number of
134    subject bytes matched may be different to the number of reference bytes.
135    
136  Arguments:  Arguments:
137    offset      index into the offset vector    offset      index into the offset vector
138    eptr        points into the subject    eptr        pointer into the subject
139    length      length to be matched    length      length of reference to be matched (number of bytes)
140    md          points to match data block    md          points to match data block
141    ims         the ims flags    caseless    TRUE if caseless
142    
143  Returns:      TRUE if matched  Returns:      >= 0 the number of subject bytes matched
144                  -1 no match
145                  -2 partial match; always given if at end subject
146  */  */
147    
148  static BOOL  static int
149  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
150    unsigned long int ims)    BOOL caseless)
151  {  {
152  const uschar *p = md->start_subject + md->offset_vector[offset];  PCRE_PUCHAR eptr_start = eptr;
153    register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
154    
155  #ifdef DEBUG  #ifdef PCRE_DEBUG
156  if (eptr >= md->end_subject)  if (eptr >= md->end_subject)
157    printf("matching subject <null>");    printf("matching subject <null>");
158  else  else
# Line 146  pchars(p, length, FALSE, md); Line 165  pchars(p, length, FALSE, md);
165  printf("\n");  printf("\n");
166  #endif  #endif
167    
168  /* Always fail if not enough characters left */  /* Always fail if reference not set (and not JavaScript compatible - in that
169    case the length is passed as zero). */
170    
171  if (length > md->end_subject - eptr) return FALSE;  if (length < 0) return -1;
172    
173  /* Separate the caselesss case for speed */  /* Separate the caseless case for speed. In UTF-8 mode we can only do this
174    properly if Unicode properties are supported. Otherwise, we can check only
175    ASCII characters. */
176    
177  if ((ims & PCRE_CASELESS) != 0)  if (caseless)
178    {    {
179    while (length-- > 0)  #ifdef SUPPORT_UTF
180      if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;  #ifdef SUPPORT_UCP
181      if (md->utf)
182        {
183        /* Match characters up to the end of the reference. NOTE: the number of
184        bytes matched may differ, because there are some characters whose upper and
185        lower case versions code as different numbers of bytes. For example, U+023A
186        (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
187        a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
188        the latter. It is important, therefore, to check the length along the
189        reference, not along the subject (earlier code did this wrong). */
190    
191        PCRE_PUCHAR endptr = p + length;
192        while (p < endptr)
193          {
194          int c, d;
195          if (eptr >= md->end_subject) return -2;   /* Partial match */
196          GETCHARINC(c, eptr);
197          GETCHARINC(d, p);
198          if (c != d && c != UCD_OTHERCASE(d)) return -1;
199          }
200        }
201      else
202    #endif
203    #endif
204    
205      /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
206      is no UCP support. */
207        {
208        while (length-- > 0)
209          {
210          if (eptr >= md->end_subject) return -2;   /* Partial match */
211          if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
212          p++;
213          eptr++;
214          }
215        }
216    }    }
217    
218    /* In the caseful case, we can just compare the bytes, whether or not we
219    are in UTF-8 mode. */
220    
221  else  else
222    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    {
223      while (length-- > 0)
224        {
225        if (eptr >= md->end_subject) return -2;   /* Partial match */
226        if (*p++ != *eptr++) return -1;
227        }
228      }
229    
230  return TRUE;  return (int)(eptr - eptr_start);
231  }  }
232    
233    
# Line 169  return TRUE; Line 236  return TRUE;
236  ****************************************************************************  ****************************************************************************
237                     RECURSION IN THE match() FUNCTION                     RECURSION IN THE match() FUNCTION
238    
239  The match() function is highly recursive. Some regular expressions can cause  The match() function is highly recursive, though not every recursive call
240  it to recurse thousands of times. I was writing for Unix, so I just let it  increases the recursive depth. Nevertheless, some regular expressions can cause
241  call itself recursively. This uses the stack for saving everything that has  it to recurse to a great depth. I was writing for Unix, so I just let it call
242  to be saved for a recursive call. On Unix, the stack can be large, and this  itself recursively. This uses the stack for saving everything that has to be
243  works fine.  saved for a recursive call. On Unix, the stack can be large, and this works
244    fine.
245  It turns out that on non-Unix systems there are problems with programs that  
246  use a lot of stack. (This despite the fact that every last chip has oodles  It turns out that on some non-Unix-like systems there are problems with
247  of memory these days, and techniques for extending the stack have been known  programs that use a lot of stack. (This despite the fact that every last chip
248  for decades.) So....  has oodles of memory these days, and techniques for extending the stack have
249    been known for decades.) So....
250    
251  There is a fudge, triggered by defining NO_RECURSE, which avoids recursive  There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
252  calls by keeping local variables that need to be preserved in blocks of memory  calls by keeping local variables that need to be preserved in blocks of memory
253  obtained from malloc instead of on the stack. Macros are used to  obtained from malloc() instead of on the stack. Macros are used to
254  achieve this so that the actual code doesn't look very different to what it  achieve this so that the actual code doesn't look very different to what it
255  always used to.  always used to.
256    
257    The original heap-recursive code used longjmp(). However, it seems that this
258    can be very slow on some operating systems. Following a suggestion from Stan
259    Switzer, the use of longjmp() has been abolished, at the cost of having to
260    provide a unique number for each call to RMATCH. There is no way of generating
261    a sequence of numbers at compile time in C. I have given them names, to make
262    them stand out more clearly.
263    
264    Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
265    FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
266    tests. Furthermore, not using longjmp() means that local dynamic variables
267    don't have indeterminate values; this has meant that the frame size can be
268    reduced because the result can be "passed back" by straight setting of the
269    variable instead of being passed in the frame.
270  ****************************************************************************  ****************************************************************************
271  ***************************************************************************/  ***************************************************************************/
272    
273    /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
274    below must be updated in sync.  */
275    
276  /* These versions of the macros use the stack, as normal */  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
277           RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
278           RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
279           RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
280           RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
281           RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
282           RM61,  RM62, RM63, RM64, RM65, RM66 };
283    
284    /* These versions of the macros use the stack, as normal. There are debugging
285    versions and production versions. Note that the "rw" argument of RMATCH isn't
286    actually used in this definition. */
287    
288  #ifndef NO_RECURSE  #ifndef NO_RECURSE
289  #define REGISTER register  #define REGISTER register
290  #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)  
291    #ifdef PCRE_DEBUG
292    #define RMATCH(ra,rb,rc,rd,re,rw) \
293      { \
294      printf("match() called in line %d\n", __LINE__); \
295      rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
296      printf("to line %d\n", __LINE__); \
297      }
298    #define RRETURN(ra) \
299      { \
300      printf("match() returned %d from line %d ", ra, __LINE__); \
301      return ra; \
302      }
303    #else
304    #define RMATCH(ra,rb,rc,rd,re,rw) \
305      rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
306  #define RRETURN(ra) return ra  #define RRETURN(ra) return ra
307    #endif
308    
309  #else  #else
310    
311    
312  /* These versions of the macros manage a private stack on the heap. Note  /* These versions of the macros manage a private stack on the heap. Note that
313  that the rd argument of RMATCH isn't actually used. It's the md argument of  the "rd" argument of RMATCH isn't actually used in this definition. It's the md
314  match(), which never changes. */  argument of match(), which never changes. */
315    
316  #define REGISTER  #define REGISTER
317    
318  #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\  #define RMATCH(ra,rb,rc,rd,re,rw)\
319    {\    {\
320    heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\    heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
321    if (setjmp(frame->Xwhere) == 0)\    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
322      {\    frame->Xwhere = rw; \
323      newframe->Xeptr = ra;\    newframe->Xeptr = ra;\
324      newframe->Xecode = rb;\    newframe->Xecode = rb;\
325      newframe->Xoffset_top = rc;\    newframe->Xmstart = mstart;\
326      newframe->Xims = re;\    newframe->Xoffset_top = rc;\
327      newframe->Xeptrb = rf;\    newframe->Xeptrb = re;\
328      newframe->Xflags = rg;\    newframe->Xrdepth = frame->Xrdepth + 1;\
329      newframe->Xprevframe = frame;\    newframe->Xprevframe = frame;\
330      frame = newframe;\    frame = newframe;\
331      DPRINTF(("restarting from line %d\n", __LINE__));\    DPRINTF(("restarting from line %d\n", __LINE__));\
332      goto HEAP_RECURSE;\    goto HEAP_RECURSE;\
333      }\    L_##rw:\
334    else\    DPRINTF(("jumped back to line %d\n", __LINE__));\
     {\  
     DPRINTF(("longjumped back to line %d\n", __LINE__));\  
     frame = md->thisframe;\  
     rx = frame->Xresult;\  
     }\  
335    }    }
336    
337  #define RRETURN(ra)\  #define RRETURN(ra)\
338    {\    {\
339    heapframe *newframe = frame;\    heapframe *oldframe = frame;\
340    frame = newframe->Xprevframe;\    frame = oldframe->Xprevframe;\
341    (pcre_stack_free)(newframe);\    if (oldframe != &frame_zero) (PUBL(stack_free))(oldframe);\
342    if (frame != NULL)\    if (frame != NULL)\
343      {\      {\
344      frame->Xresult = ra;\      rrc = ra;\
345      md->thisframe = frame;\      goto HEAP_RETURN;\
     longjmp(frame->Xwhere, 1);\  
346      }\      }\
347    return ra;\    return ra;\
348    }    }
# Line 250  typedef struct heapframe { Line 355  typedef struct heapframe {
355    
356    /* Function arguments that may change */    /* Function arguments that may change */
357    
358    const uschar *Xeptr;    PCRE_PUCHAR Xeptr;
359    const uschar *Xecode;    const pcre_uchar *Xecode;
360      PCRE_PUCHAR Xmstart;
361    int Xoffset_top;    int Xoffset_top;
   long int Xims;  
362    eptrblock *Xeptrb;    eptrblock *Xeptrb;
363    int Xflags;    unsigned int Xrdepth;
364    
365    /* Function local variables */    /* Function local variables */
366    
367    const uschar *Xcallpat;    PCRE_PUCHAR Xcallpat;
368    const uschar *Xcharptr;  #ifdef SUPPORT_UTF
369    const uschar *Xdata;    PCRE_PUCHAR Xcharptr;
370    const uschar *Xnext;  #endif
371    const uschar *Xpp;    PCRE_PUCHAR Xdata;
372    const uschar *Xprev;    PCRE_PUCHAR Xnext;
373    const uschar *Xsaved_eptr;    PCRE_PUCHAR Xpp;
374      PCRE_PUCHAR Xprev;
375      PCRE_PUCHAR Xsaved_eptr;
376    
377    recursion_info Xnew_recursive;    recursion_info Xnew_recursive;
378    
379    BOOL Xcur_is_word;    BOOL Xcur_is_word;
380    BOOL Xcondition;    BOOL Xcondition;
   BOOL Xminimize;  
381    BOOL Xprev_is_word;    BOOL Xprev_is_word;
382    
   unsigned long int Xoriginal_ims;  
   
383  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
384    int Xprop_type;    int Xprop_type;
385      int Xprop_value;
386    int Xprop_fail_result;    int Xprop_fail_result;
387    int Xprop_category;    int Xoclength;
388    int Xprop_chartype;    pcre_uchar Xocchars[6];
   int Xprop_othercase;  
   int Xprop_test_against;  
   int *Xprop_test_variable;  
389  #endif  #endif
390    
391      int Xcodelink;
392    int Xctype;    int Xctype;
393    int Xfc;    unsigned int Xfc;
394    int Xfi;    int Xfi;
395    int Xlength;    int Xlength;
396    int Xmax;    int Xmax;
# Line 301  typedef struct heapframe { Line 404  typedef struct heapframe {
404    
405    eptrblock Xnewptrb;    eptrblock Xnewptrb;
406    
407    /* Place to pass back result, and where to jump back to */    /* Where to jump back to */
408    
409    int  Xresult;    int Xwhere;
   jmp_buf Xwhere;  
410    
411  } heapframe;  } heapframe;
412    
# Line 320  typedef struct heapframe { Line 422  typedef struct heapframe {
422  *         Match from current position            *  *         Match from current position            *
423  *************************************************/  *************************************************/
424    
425  /* On entry ecode points to the first opcode, and eptr to the first character  /* This function is called recursively in many circumstances. Whenever it
 in the subject string, while eptrb holds the value of eptr at the start of the  
 last bracketed group - used for breaking infinite loops matching zero-length  
 strings. This function is called recursively in many circumstances. Whenever it  
426  returns a negative (error) response, the outer incarnation must also return the  returns a negative (error) response, the outer incarnation must also return the
427  same response.  same response. */
428    
429    /* These macros pack up tests that are used for partial matching, and which
430    appear several times in the code. We set the "hit end" flag if the pointer is
431    at the end of the subject and also past the start of the subject (i.e.
432    something has been matched). For hard partial matching, we then return
433    immediately. The second one is used when we already know we are past the end of
434    the subject. */
435    
436    #define CHECK_PARTIAL()\
437      if (md->partial != 0 && eptr >= md->end_subject && \
438          eptr > md->start_used_ptr) \
439        { \
440        md->hitend = TRUE; \
441        if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
442        }
443    
444    #define SCHECK_PARTIAL()\
445      if (md->partial != 0 && eptr > md->start_used_ptr) \
446        { \
447        md->hitend = TRUE; \
448        if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
449        }
450    
451    
452  Performance note: It might be tempting to extract commonly used fields from the  /* Performance note: It might be tempting to extract commonly used fields from
453  md structure (e.g. utf8, end_subject) into individual variables to improve  the md structure (e.g. utf, end_subject) into individual variables to improve
454  performance. Tests using gcc on a SPARC disproved this; in the first case, it  performance. Tests using gcc on a SPARC disproved this; in the first case, it
455  made performance worse.  made performance worse.
456    
457  Arguments:  Arguments:
458     eptr        pointer in subject     eptr        pointer to current character in subject
459     ecode       position in code     ecode       pointer to current position in compiled code
460       mstart      pointer to the current match start position (can be modified
461                     by encountering \K)
462     offset_top  current top pointer     offset_top  current top pointer
463     md          pointer to "static" info for the match     md          pointer to "static" info for the match
    ims         current /i, /m, and /s options  
464     eptrb       pointer to chain of blocks containing eptr at start of     eptrb       pointer to chain of blocks containing eptr at start of
465                   brackets - for testing for empty matches                   brackets - for testing for empty matches
466     flags       can contain     rdepth      the recursion depth
                  match_condassert - this is an assertion condition  
                  match_isgroup - this is the start of a bracketed group  
467    
468  Returns:       MATCH_MATCH if matched            )  these values are >= 0  Returns:       MATCH_MATCH if matched            )  these values are >= 0
469                 MATCH_NOMATCH if failed to match  )                 MATCH_NOMATCH if failed to match  )
470                   a negative MATCH_xxx value for PRUNE, SKIP, etc
471                 a negative PCRE_ERROR_xxx value if aborted by an error condition                 a negative PCRE_ERROR_xxx value if aborted by an error condition
472                   (e.g. stopped by recursion limit)                   (e.g. stopped by repeated call or recursion limit)
473  */  */
474    
475  static int  static int
476  match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,  match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
477    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,    PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
478    int flags)    unsigned int rdepth)
479  {  {
480  /* These variables do not need to be preserved over recursion in this function,  /* These variables do not need to be preserved over recursion in this function,
481  so they can be ordinary variables in all cases. Mark them with "register"  so they can be ordinary variables in all cases. Mark some of them with
482  because they are used a lot in loops. */  "register" because they are used a lot in loops. */
483    
484  register int  rrc;    /* Returns from recursive calls */  register int  rrc;         /* Returns from recursive calls */
485  register int  i;      /* Used for loops not involving calls to RMATCH() */  register int  i;           /* Used for loops not involving calls to RMATCH() */
486  register int  c;      /* Character values not kept over RMATCH() calls */  register unsigned int c;   /* Character values not kept over RMATCH() calls */
487  register BOOL utf8;   /* Local copy of UTF-8 flag for speed */  register BOOL utf;         /* Local copy of UTF flag for speed */
488    
489    BOOL minimize, possessive; /* Quantifier options */
490    BOOL caseless;
491    int condcode;
492    
493  /* When recursion is not being used, all "local" variables that have to be  /* When recursion is not being used, all "local" variables that have to be
494  preserved over calls to RMATCH() are part of a "frame" which is obtained from  preserved over calls to RMATCH() are part of a "frame". We set up the top-level
495  heap storage. Set up the top-level frame here; others are obtained from the  frame on the stack here; subsequent instantiations are obtained from the heap
496  heap whenever RMATCH() does a "recursion". See the macro definitions above. */  whenever RMATCH() does a "recursion". See the macro definitions above. Putting
497    the top-level on the stack rather than malloc-ing them all gives a performance
498    boost in many cases where there is not much "recursion". */
499    
500  #ifdef NO_RECURSE  #ifdef NO_RECURSE
501  heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));  heapframe frame_zero;
502    heapframe *frame = &frame_zero;
503  frame->Xprevframe = NULL;            /* Marks the top level */  frame->Xprevframe = NULL;            /* Marks the top level */
504    
505  /* Copy in the original argument variables */  /* Copy in the original argument variables */
506    
507  frame->Xeptr = eptr;  frame->Xeptr = eptr;
508  frame->Xecode = ecode;  frame->Xecode = ecode;
509    frame->Xmstart = mstart;
510  frame->Xoffset_top = offset_top;  frame->Xoffset_top = offset_top;
 frame->Xims = ims;  
511  frame->Xeptrb = eptrb;  frame->Xeptrb = eptrb;
512  frame->Xflags = flags;  frame->Xrdepth = rdepth;
513    
514  /* This is where control jumps back to in order to effect "recursion" */  /* This is where control jumps back to in order to effect "recursion" */
515    
# Line 390  HEAP_RECURSE: Line 519  HEAP_RECURSE:
519    
520  #define eptr               frame->Xeptr  #define eptr               frame->Xeptr
521  #define ecode              frame->Xecode  #define ecode              frame->Xecode
522    #define mstart             frame->Xmstart
523  #define offset_top         frame->Xoffset_top  #define offset_top         frame->Xoffset_top
 #define ims                frame->Xims  
524  #define eptrb              frame->Xeptrb  #define eptrb              frame->Xeptrb
525  #define flags              frame->Xflags  #define rdepth             frame->Xrdepth
526    
527  /* Ditto for the local variables */  /* Ditto for the local variables */
528    
529  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
530  #define charptr            frame->Xcharptr  #define charptr            frame->Xcharptr
531  #endif  #endif
532  #define callpat            frame->Xcallpat  #define callpat            frame->Xcallpat
533    #define codelink           frame->Xcodelink
534  #define data               frame->Xdata  #define data               frame->Xdata
535  #define next               frame->Xnext  #define next               frame->Xnext
536  #define pp                 frame->Xpp  #define pp                 frame->Xpp
# Line 411  HEAP_RECURSE: Line 541  HEAP_RECURSE:
541    
542  #define cur_is_word        frame->Xcur_is_word  #define cur_is_word        frame->Xcur_is_word
543  #define condition          frame->Xcondition  #define condition          frame->Xcondition
 #define minimize           frame->Xminimize  
544  #define prev_is_word       frame->Xprev_is_word  #define prev_is_word       frame->Xprev_is_word
545    
 #define original_ims       frame->Xoriginal_ims  
   
546  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
547  #define prop_type          frame->Xprop_type  #define prop_type          frame->Xprop_type
548    #define prop_value         frame->Xprop_value
549  #define prop_fail_result   frame->Xprop_fail_result  #define prop_fail_result   frame->Xprop_fail_result
550  #define prop_category      frame->Xprop_category  #define oclength           frame->Xoclength
551  #define prop_chartype      frame->Xprop_chartype  #define occhars            frame->Xocchars
 #define prop_othercase     frame->Xprop_othercase  
 #define prop_test_against  frame->Xprop_test_against  
 #define prop_test_variable frame->Xprop_test_variable  
552  #endif  #endif
553    
554  #define ctype              frame->Xctype  #define ctype              frame->Xctype
# Line 447  HEAP_RECURSE: Line 572  HEAP_RECURSE:
572  get preserved during recursion in the normal way. In this environment, fi and  get preserved during recursion in the normal way. In this environment, fi and
573  i, and fc and c, can be the same variables. */  i, and fc and c, can be the same variables. */
574    
575  #else  #else         /* NO_RECURSE not defined */
576  #define fi i  #define fi i
577  #define fc c  #define fc c
578    
579    /* Many of the following variables are used only in small blocks of the code.
580    My normal style of coding would have declared them within each of those blocks.
581    However, in order to accommodate the version of this code that uses an external
582    "stack" implemented on the heap, it is easier to declare them all here, so the
583    declarations can be cut out in a block. The only declarations within blocks
584    below are for variables that do not have to be preserved over a recursive call
585    to RMATCH(). */
586    
587  #ifdef SUPPORT_UTF8                /* Many of these variables are used ony */  #ifdef SUPPORT_UTF
588  const uschar *charptr;             /* small blocks of the code. My normal  */  const pcre_uchar *charptr;
589  #endif                             /* style of coding would have declared  */  #endif
590  const uschar *callpat;             /* them within each of those blocks.    */  const pcre_uchar *callpat;
591  const uschar *data;                /* However, in order to accommodate the */  const pcre_uchar *data;
592  const uschar *next;                /* version of this code that uses an    */  const pcre_uchar *next;
593  const uschar *pp;                  /* external "stack" implemented on the  */  PCRE_PUCHAR       pp;
594  const uschar *prev;                /* heap, it is easier to declare them   */  const pcre_uchar *prev;
595  const uschar *saved_eptr;          /* all here, so the declarations can    */  PCRE_PUCHAR       saved_eptr;
                                    /* be cut out in a block. The only      */  
 recursion_info new_recursive;      /* declarations within blocks below are */  
                                    /* for variables that do not have to    */  
 BOOL cur_is_word;                  /* be preserved over a recursive call   */  
 BOOL condition;                    /* to RMATCH().                         */  
 BOOL minimize;  
 BOOL prev_is_word;  
596    
597  unsigned long int original_ims;  recursion_info new_recursive;
598    
599    BOOL cur_is_word;
600    BOOL condition;
601    BOOL prev_is_word;
602    
603  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
604  int prop_type;  int prop_type;
605    int prop_value;
606  int prop_fail_result;  int prop_fail_result;
607  int prop_category;  int oclength;
608  int prop_chartype;  pcre_uchar occhars[6];
 int prop_othercase;  
 int prop_test_against;  
 int *prop_test_variable;  
609  #endif  #endif
610    
611    int codelink;
612  int ctype;  int ctype;
613  int length;  int length;
614  int max;  int max;
# Line 493  int save_offset1, save_offset2, save_off Line 621  int save_offset1, save_offset2, save_off
621  int stacksave[REC_STACK_SAVE_MAX];  int stacksave[REC_STACK_SAVE_MAX];
622    
623  eptrblock newptrb;  eptrblock newptrb;
624  #endif  
625    /* There is a special fudge for calling match() in a way that causes it to
626    measure the size of its basic stack frame when the stack is being used for
627    recursion. The second argument (ecode) being NULL triggers this behaviour. It
628    cannot normally ever be NULL. The return is the negated value of the frame
629    size. */
630    
631    if (ecode == NULL)
632      {
633      if (rdepth == 0)
634        return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
635      else
636        {
637        int len = (char *)&rdepth - (char *)eptr;
638        return (len > 0)? -len : len;
639        }
640      }
641    #endif     /* NO_RECURSE */
642    
643    /* To save space on the stack and in the heap frame, I have doubled up on some
644    of the local variables that are used only in localised parts of the code, but
645    still need to be preserved over recursive calls of match(). These macros define
646    the alternative names that are used. */
647    
648    #define allow_zero    cur_is_word
649    #define cbegroup      condition
650    #define code_offset   codelink
651    #define condassert    condition
652    #define matched_once  prev_is_word
653    #define foc           number
654    #define save_mark     data
655    
656  /* These statements are here to stop the compiler complaining about uninitialized  /* These statements are here to stop the compiler complaining about uninitialized
657  variables. */  variables. */
658    
659  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
660    prop_value = 0;
661  prop_fail_result = 0;  prop_fail_result = 0;
 prop_test_against = 0;  
 prop_test_variable = NULL;  
662  #endif  #endif
663    
 /* OK, now we can get on with the real code of the function. Recursion is  
 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,  
 these just turn into a recursive call to match() and a "return", respectively.  
 However, RMATCH isn't like a function call because it's quite a complicated  
 macro. It has to be used in one particular way. This shouldn't, however, impact  
 performance when true recursion is being used. */  
664    
665  if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);  /* This label is used for tail recursion, which is used in a few cases even
666    when NO_RECURSE is not defined, in order to reduce the amount of stack that is
667    used. Thanks to Ian Taylor for noticing this possibility and sending the
668    original patch. */
669    
670    TAIL_RECURSE:
671    
672    /* OK, now we can get on with the real code of the function. Recursive calls
673    are specified by the macro RMATCH and RRETURN is used to return. When
674    NO_RECURSE is *not* defined, these just turn into a recursive call to match()
675    and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
676    defined). However, RMATCH isn't like a function call because it's quite a
677    complicated macro. It has to be used in one particular way. This shouldn't,
678    however, impact performance when true recursion is being used. */
679    
680    #ifdef SUPPORT_UTF
681    utf = md->utf;       /* Local copy of the flag */
682    #else
683    utf = FALSE;
684    #endif
685    
686    /* First check that we haven't called match() too many times, or that we
687    haven't exceeded the recursive call limit. */
688    
689  original_ims = ims;    /* Save for resetting on ')' */  if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
690  utf8 = md->utf8;       /* Local copy of the flag */  if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
691    
692  /* At the start of a bracketed group, add the current subject pointer to the  /* At the start of a group with an unlimited repeat that may match an empty
693  stack of such pointers, to be re-instated at the end of the group when we hit  string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
694  the closing ket. When match() is called in other circumstances, we don't add to  done this way to save having to use another function argument, which would take
695  this stack. */  up space on the stack. See also MATCH_CONDASSERT below.
696    
697    When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
698    such remembered pointers, to be checked when we hit the closing ket, in order
699    to break infinite loops that match no characters. When match() is called in
700    other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
701    NOT be used with tail recursion, because the memory block that is used is on
702    the stack, so a new one may be required for each match(). */
703    
704  if ((flags & match_isgroup) != 0)  if (md->match_function_type == MATCH_CBEGROUP)
705    {    {
   newptrb.epb_prev = eptrb;  
706    newptrb.epb_saved_eptr = eptr;    newptrb.epb_saved_eptr = eptr;
707      newptrb.epb_prev = eptrb;
708    eptrb = &newptrb;    eptrb = &newptrb;
709      md->match_function_type = 0;
710    }    }
711    
712  /* Now start processing the operations. */  /* Now start processing the opcodes. */
713    
714  for (;;)  for (;;)
715    {    {
716      minimize = possessive = FALSE;
717    op = *ecode;    op = *ecode;
   minimize = FALSE;  
718    
719    /* For partial matching, remember if we ever hit the end of the subject after    switch(op)
720    matching at least one subject character. */      {
721        case OP_MARK:
722        md->nomatch_mark = ecode + 2;
723        md->mark = NULL;    /* In case previously set by assertion */
724        RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
725          eptrb, RM55);
726        if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
727             md->mark == NULL) md->mark = ecode + 2;
728    
729        /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
730        argument, and we must check whether that argument matches this MARK's
731        argument. It is passed back in md->start_match_ptr (an overloading of that
732        variable). If it does match, we reset that variable to the current subject
733        position and return MATCH_SKIP. Otherwise, pass back the return code
734        unaltered. */
735    
736        else if (rrc == MATCH_SKIP_ARG &&
737            STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
738          {
739          md->start_match_ptr = eptr;
740          RRETURN(MATCH_SKIP);
741          }
742        RRETURN(rrc);
743    
744        case OP_FAIL:
745        RRETURN(MATCH_NOMATCH);
746    
747    if (md->partial &&      /* COMMIT overrides PRUNE, SKIP, and THEN */
       eptr >= md->end_subject &&  
       eptr > md->start_match)  
     md->hitend = TRUE;  
   
   /* Opening capturing bracket. If there is space in the offset vector, save  
   the current subject position in the working slot at the top of the vector. We  
   mustn't change the current values of the data slot, because they may be set  
   from a previous iteration of this group, and be referred to by a reference  
   inside the group.  
   
   If the bracket fails to match, we need to restore this value and also the  
   values of the final offsets, in case they were set by a previous iteration of  
   the same bracket.  
   
   If there isn't enough space in the offset vector, treat this as if it were a  
   non-capturing bracket. Don't worry about setting the flag for the error case  
   here; that is handled in the code for KET. */  
748    
749    if (op > OP_BRA)      case OP_COMMIT:
750      {      RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
751      number = op - OP_BRA;        eptrb, RM52);
752        if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
753            rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
754            rrc != MATCH_THEN)
755          RRETURN(rrc);
756        RRETURN(MATCH_COMMIT);
757    
758        /* PRUNE overrides THEN */
759    
760        case OP_PRUNE:
761        RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
762          eptrb, RM51);
763        if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
764        RRETURN(MATCH_PRUNE);
765    
766        case OP_PRUNE_ARG:
767        md->nomatch_mark = ecode + 2;
768        md->mark = NULL;    /* In case previously set by assertion */
769        RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
770          eptrb, RM56);
771        if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
772             md->mark == NULL) md->mark = ecode + 2;
773        if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
774        RRETURN(MATCH_PRUNE);
775    
776        /* SKIP overrides PRUNE and THEN */
777    
778        case OP_SKIP:
779        RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
780          eptrb, RM53);
781        if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
782          RRETURN(rrc);
783        md->start_match_ptr = eptr;   /* Pass back current position */
784        RRETURN(MATCH_SKIP);
785    
786        /* Note that, for Perl compatibility, SKIP with an argument does NOT set
787        nomatch_mark. There is a flag that disables this opcode when re-matching a
788        pattern that ended with a SKIP for which there was not a matching MARK. */
789    
790        case OP_SKIP_ARG:
791        if (md->ignore_skip_arg)
792          {
793          ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
794          break;
795          }
796        RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
797          eptrb, RM57);
798        if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
799          RRETURN(rrc);
800    
801        /* Pass back the current skip name by overloading md->start_match_ptr and
802        returning the special MATCH_SKIP_ARG return code. This will either be
803        caught by a matching MARK, or get to the top, where it causes a rematch
804        with the md->ignore_skip_arg flag set. */
805    
806        md->start_match_ptr = ecode + 2;
807        RRETURN(MATCH_SKIP_ARG);
808    
809        /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
810        the branch in which it occurs can be determined. Overload the start of
811        match pointer to do this. */
812    
813        case OP_THEN:
814        RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
815          eptrb, RM54);
816        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
817        md->start_match_ptr = ecode;
818        RRETURN(MATCH_THEN);
819    
820        case OP_THEN_ARG:
821        md->nomatch_mark = ecode + 2;
822        md->mark = NULL;    /* In case previously set by assertion */
823        RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
824          md, eptrb, RM58);
825        if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
826             md->mark == NULL) md->mark = ecode + 2;
827        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
828        md->start_match_ptr = ecode;
829        RRETURN(MATCH_THEN);
830    
831        /* Handle an atomic group that does not contain any capturing parentheses.
832        This can be handled like an assertion. Prior to 8.13, all atomic groups
833        were handled this way. In 8.13, the code was changed as below for ONCE, so
834        that backups pass through the group and thereby reset captured values.
835        However, this uses a lot more stack, so in 8.20, atomic groups that do not
836        contain any captures generate OP_ONCE_NC, which can be handled in the old,
837        less stack intensive way.
838    
839        Check the alternative branches in turn - the matching won't pass the KET
840        for this kind of subpattern. If any one branch matches, we carry on as at
841        the end of a normal bracket, leaving the subject pointer, but resetting
842        the start-of-match value in case it was changed by \K. */
843    
844        case OP_ONCE_NC:
845        prev = ecode;
846        saved_eptr = eptr;
847        save_mark = md->mark;
848        do
849          {
850          RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
851          if (rrc == MATCH_MATCH)  /* Note: _not_ MATCH_ACCEPT */
852            {
853            mstart = md->start_match_ptr;
854            break;
855            }
856          if (rrc == MATCH_THEN)
857            {
858            next = ecode + GET(ecode,1);
859            if (md->start_match_ptr < next &&
860                (*ecode == OP_ALT || *next == OP_ALT))
861              rrc = MATCH_NOMATCH;
862            }
863    
864          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
865          ecode += GET(ecode,1);
866          md->mark = save_mark;
867          }
868        while (*ecode == OP_ALT);
869    
870        /* If hit the end of the group (which could be repeated), fail */
871    
872        if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
873    
874        /* Continue as from after the group, updating the offsets high water
875        mark, since extracts may have been taken. */
876    
877        do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
878    
879        offset_top = md->end_offset_top;
880        eptr = md->end_match_ptr;
881    
882        /* For a non-repeating ket, just continue at this level. This also
883        happens for a repeating ket if no characters were matched in the group.
884        This is the forcible breaking of infinite loops as implemented in Perl
885        5.005. */
886    
887        if (*ecode == OP_KET || eptr == saved_eptr)
888          {
889          ecode += 1+LINK_SIZE;
890          break;
891          }
892    
893      /* For extended extraction brackets (large number), we have to fish out the      /* The repeating kets try the rest of the pattern or restart from the
894      number from a dummy opcode at the start. */      preceding bracket, in the appropriate order. The second "call" of match()
895        uses tail recursion, to avoid using another stack frame. */
896    
897        if (*ecode == OP_KETRMIN)
898          {
899          RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
900          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
901          ecode = prev;
902          goto TAIL_RECURSE;
903          }
904        else  /* OP_KETRMAX */
905          {
906          md->match_function_type = MATCH_CBEGROUP;
907          RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
908          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
909          ecode += 1 + LINK_SIZE;
910          goto TAIL_RECURSE;
911          }
912        /* Control never gets here */
913    
914      if (number > EXTRACT_BASIC_MAX)      /* Handle a capturing bracket, other than those that are possessive with an
915        number = GET2(ecode, 2+LINK_SIZE);      unlimited repeat. If there is space in the offset vector, save the current
916        subject position in the working slot at the top of the vector. We mustn't
917        change the current values of the data slot, because they may be set from a
918        previous iteration of this group, and be referred to by a reference inside
919        the group. A failure to match might occur after the group has succeeded,
920        if something later on doesn't match. For this reason, we need to restore
921        the working value and also the values of the final offsets, in case they
922        were set by a previous iteration of the same bracket.
923    
924        If there isn't enough space in the offset vector, treat this as if it were
925        a non-capturing bracket. Don't worry about setting the flag for the error
926        case here; that is handled in the code for KET. */
927    
928        case OP_CBRA:
929        case OP_SCBRA:
930        number = GET2(ecode, 1+LINK_SIZE);
931      offset = number << 1;      offset = number << 1;
932    
933  #ifdef DEBUG  #ifdef PCRE_DEBUG
934      printf("start bracket %d subject=", number);      printf("start bracket %d\n", number);
935        printf("subject=");
936      pchars(eptr, 16, TRUE, md);      pchars(eptr, 16, TRUE, md);
937      printf("\n");      printf("\n");
938  #endif  #endif
# Line 580  for (;;) Line 943  for (;;)
943        save_offset2 = md->offset_vector[offset+1];        save_offset2 = md->offset_vector[offset+1];
944        save_offset3 = md->offset_vector[md->offset_end - number];        save_offset3 = md->offset_vector[md->offset_end - number];
945        save_capture_last = md->capture_last;        save_capture_last = md->capture_last;
946          save_mark = md->mark;
947    
948        DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));        DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
949        md->offset_vector[md->offset_end - number] = eptr - md->start_subject;        md->offset_vector[md->offset_end - number] =
950            (int)(eptr - md->start_subject);
951    
952        do        for (;;)
953          {          {
954          RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,          if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
955            match_isgroup);          RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
956              eptrb, RM1);
957            if (rrc == MATCH_ONCE) break;  /* Backing up through an atomic group */
958    
959            /* If we backed up to a THEN, check whether it is within the current
960            branch by comparing the address of the THEN that is passed back with
961            the end of the branch. If it is within the current branch, and the
962            branch is one of two or more alternatives (it either starts or ends
963            with OP_ALT), we have reached the limit of THEN's action, so convert
964            the return code to NOMATCH, which will cause normal backtracking to
965            happen from now on. Otherwise, THEN is passed back to an outer
966            alternative. This implements Perl's treatment of parenthesized groups,
967            where a group not containing | does not affect the current alternative,
968            that is, (X) is NOT the same as (X|(*F)). */
969    
970            if (rrc == MATCH_THEN)
971              {
972              next = ecode + GET(ecode,1);
973              if (md->start_match_ptr < next &&
974                  (*ecode == OP_ALT || *next == OP_ALT))
975                rrc = MATCH_NOMATCH;
976              }
977    
978            /* Anything other than NOMATCH is passed back. */
979    
980          if (rrc != MATCH_NOMATCH) RRETURN(rrc);          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
981          md->capture_last = save_capture_last;          md->capture_last = save_capture_last;
982          ecode += GET(ecode, 1);          ecode += GET(ecode, 1);
983            md->mark = save_mark;
984            if (*ecode != OP_ALT) break;
985          }          }
       while (*ecode == OP_ALT);  
986    
987        DPRINTF(("bracket %d failed\n", number));        DPRINTF(("bracket %d failed\n", number));
   
988        md->offset_vector[offset] = save_offset1;        md->offset_vector[offset] = save_offset1;
989        md->offset_vector[offset+1] = save_offset2;        md->offset_vector[offset+1] = save_offset2;
990        md->offset_vector[md->offset_end - number] = save_offset3;        md->offset_vector[md->offset_end - number] = save_offset3;
991    
992        RRETURN(MATCH_NOMATCH);        /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
993    
994          RRETURN(rrc);
995        }        }
996    
997      /* Insufficient room for saving captured contents */      /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
998        as a non-capturing bracket. */
999    
1000      else op = OP_BRA;      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1001      }      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1002    
1003    /* Other types of node can be handled by a switch */      DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1004    
1005    switch(op)      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006      {      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007      case OP_BRA:     /* Non-capturing bracket: optimized */  
1008      DPRINTF(("start bracket 0\n"));      /* Non-capturing or atomic group, except for possessive with unlimited
1009      do      repeat and ONCE group with no captures. Loop for all the alternatives.
1010    
1011        When we get to the final alternative within the brackets, we used to return
1012        the result of a recursive call to match() whatever happened so it was
1013        possible to reduce stack usage by turning this into a tail recursion,
1014        except in the case of a possibly empty group. However, now that there is
1015      the possibility of (*THEN) occurring in the final alternative, this
1016        optimization is no longer always possible.
1017    
1018        We can optimize if we know there are no (*THEN)s in the pattern; at present
1019        this is the best that can be done.
1020    
1021        MATCH_ONCE is returned when the end of an atomic group is successfully
1022        reached, but subsequent matching fails. It passes back up the tree (causing
1023        captured values to be reset) until the original atomic group level is
1024        reached. This is tested by comparing md->once_target with the start of the
1025        group. At this point, the return is converted into MATCH_NOMATCH so that
1026        previous backup points can be taken. */
1027    
1028        case OP_ONCE:
1029        case OP_BRA:
1030        case OP_SBRA:
1031        DPRINTF(("start non-capturing bracket\n"));
1032    
1033        for (;;)
1034        {        {
1035        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,        if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1036          match_isgroup);  
1037        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        /* If this is not a possibly empty group, and there are no (*THEN)s in
1038          the pattern, and this is the final alternative, optimize as described
1039          above. */
1040    
1041          else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1042            {
1043            ecode += PRIV(OP_lengths)[*ecode];
1044            goto TAIL_RECURSE;
1045            }
1046    
1047          /* In all other cases, we have to make another call to match(). */
1048    
1049          save_mark = md->mark;
1050          RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1051            RM2);
1052    
1053          /* See comment in the code for capturing groups above about handling
1054          THEN. */
1055    
1056          if (rrc == MATCH_THEN)
1057            {
1058            next = ecode + GET(ecode,1);
1059            if (md->start_match_ptr < next &&
1060                (*ecode == OP_ALT || *next == OP_ALT))
1061              rrc = MATCH_NOMATCH;
1062            }
1063    
1064          if (rrc != MATCH_NOMATCH)
1065            {
1066            if (rrc == MATCH_ONCE)
1067              {
1068              const pcre_uchar *scode = ecode;
1069              if (*scode != OP_ONCE)           /* If not at start, find it */
1070                {
1071                while (*scode == OP_ALT) scode += GET(scode, 1);
1072                scode -= GET(scode, 1);
1073                }
1074              if (md->once_target == scode) rrc = MATCH_NOMATCH;
1075              }
1076            RRETURN(rrc);
1077            }
1078        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
1079          md->mark = save_mark;
1080          if (*ecode != OP_ALT) break;
1081        }        }
1082      while (*ecode == OP_ALT);  
     DPRINTF(("bracket 0 failed\n"));  
1083      RRETURN(MATCH_NOMATCH);      RRETURN(MATCH_NOMATCH);
1084    
1085      /* Conditional group: compilation checked that there are no more than      /* Handle possessive capturing brackets with an unlimited repeat. We come
1086      two branches. If the condition is false, skipping the first branch takes us      here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1087      past the end if there is only one branch, but that's OK because that is      handled similarly to the normal case above. However, the matching is
1088      exactly what going to the ket would do. */      different. The end of these brackets will always be OP_KETRPOS, which
1089        returns MATCH_KETRPOS without going further in the pattern. By this means
1090        we can handle the group by iteration rather than recursion, thereby
1091        reducing the amount of stack needed. */
1092    
1093        case OP_CBRAPOS:
1094        case OP_SCBRAPOS:
1095        allow_zero = FALSE;
1096    
1097      case OP_COND:      POSSESSIVE_CAPTURE:
1098      if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */      number = GET2(ecode, 1+LINK_SIZE);
1099        {      offset = number << 1;
       offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */  
       condition = (offset == CREF_RECURSE * 2)?  
         (md->recursive != NULL) :  
         (offset < offset_top && md->offset_vector[offset] >= 0);  
       RMATCH(rrc, eptr, ecode + (condition?  
         (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),  
         offset_top, md, ims, eptrb, match_isgroup);  
       RRETURN(rrc);  
       }  
1100    
1101      /* The condition is an assertion. Call match() to evaluate it - setting  #ifdef PCRE_DEBUG
1102      the final argument TRUE causes it to stop at the end of an assertion. */      printf("start possessive bracket %d\n", number);
1103        printf("subject=");
1104        pchars(eptr, 16, TRUE, md);
1105        printf("\n");
1106    #endif
1107    
1108      else      if (offset < md->offset_max)
1109        {        {
1110        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,        matched_once = FALSE;
1111            match_condassert | match_isgroup);        code_offset = (int)(ecode - md->start_code);
1112        if (rrc == MATCH_MATCH)  
1113          save_offset1 = md->offset_vector[offset];
1114          save_offset2 = md->offset_vector[offset+1];
1115          save_offset3 = md->offset_vector[md->offset_end - number];
1116          save_capture_last = md->capture_last;
1117    
1118          DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1119    
1120          /* Each time round the loop, save the current subject position for use
1121          when the group matches. For MATCH_MATCH, the group has matched, so we
1122          restart it with a new subject starting position, remembering that we had
1123          at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1124          usual. If we haven't matched any alternatives in any iteration, check to
1125          see if a previous iteration matched. If so, the group has matched;
1126          continue from afterwards. Otherwise it has failed; restore the previous
1127          capture values before returning NOMATCH. */
1128    
1129          for (;;)
1130            {
1131            md->offset_vector[md->offset_end - number] =
1132              (int)(eptr - md->start_subject);
1133            if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1134            RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1135              eptrb, RM63);
1136            if (rrc == MATCH_KETRPOS)
1137              {
1138              offset_top = md->end_offset_top;
1139              eptr = md->end_match_ptr;
1140              ecode = md->start_code + code_offset;
1141              save_capture_last = md->capture_last;
1142              matched_once = TRUE;
1143              continue;
1144              }
1145    
1146            /* See comment in the code for capturing groups above about handling
1147            THEN. */
1148    
1149            if (rrc == MATCH_THEN)
1150              {
1151              next = ecode + GET(ecode,1);
1152              if (md->start_match_ptr < next &&
1153                  (*ecode == OP_ALT || *next == OP_ALT))
1154                rrc = MATCH_NOMATCH;
1155              }
1156    
1157            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1158            md->capture_last = save_capture_last;
1159            ecode += GET(ecode, 1);
1160            if (*ecode != OP_ALT) break;
1161            }
1162    
1163          if (!matched_once)
1164          {          {
1165          ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);          md->offset_vector[offset] = save_offset1;
1166          while (*ecode == OP_ALT) ecode += GET(ecode, 1);          md->offset_vector[offset+1] = save_offset2;
1167            md->offset_vector[md->offset_end - number] = save_offset3;
1168          }          }
1169        else if (rrc != MATCH_NOMATCH)  
1170          if (allow_zero || matched_once)
1171          {          {
1172          RRETURN(rrc);         /* Need braces because of following else */          ecode += 1 + LINK_SIZE;
1173            break;
1174          }          }
1175        else ecode += GET(ecode, 1);  
1176        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,        RRETURN(MATCH_NOMATCH);
         match_isgroup);  
       RRETURN(rrc);  
1177        }        }
     /* Control never reaches here */  
1178    
1179      /* Skip over conditional reference or large extraction number data if      /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1180      encountered. */      as a non-capturing bracket. */
1181    
1182      case OP_CREF:      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1183      case OP_BRANUMBER:      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
     ecode += 3;  
     break;  
1184    
1185      /* End of the pattern. If we are in a recursion, we should restore the      DPRINTF(("insufficient capture room: treat as non-capturing\n"));
     offsets appropriately and continue from after the call. */  
1186    
1187      case OP_END:      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1188      if (md->recursive != NULL && md->recursive->group_num == 0)      /* VVVVVVVVVVVVVVVVVVVVVVVVV */
       {  
       recursion_info *rec = md->recursive;  
       DPRINTF(("Hit the end in a (?0) recursion\n"));  
       md->recursive = rec->prevrec;  
       memmove(md->offset_vector, rec->offset_save,  
         rec->saved_max * sizeof(int));  
       md->start_match = rec->save_start;  
       ims = original_ims;  
       ecode = rec->after_call;  
       break;  
       }  
1189    
1190      /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty      /* Non-capturing possessive bracket with unlimited repeat. We come here
1191      string - backtracking will then try other alternatives, if any. */      from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1192        without the capturing complication. It is written out separately for speed
1193        and cleanliness. */
1194    
1195      if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);      case OP_BRAPOS:
1196      md->end_match_ptr = eptr;          /* Record where we ended */      case OP_SBRAPOS:
1197      md->end_offset_top = offset_top;   /* and how many extracts were taken */      allow_zero = FALSE;
     RRETURN(MATCH_MATCH);  
   
     /* Change option settings */  
   
     case OP_OPT:  
     ims = ecode[1];  
     ecode += 2;  
     DPRINTF(("ims set to %02lx\n", ims));  
     break;  
1198    
1199      /* Assertion brackets. Check the alternative branches in turn - the      POSSESSIVE_NON_CAPTURE:
1200      matching won't pass the KET for an assertion. If any one branch matches,      matched_once = FALSE;
1201      the assertion is true. Lookbehind assertions have an OP_REVERSE item at the      code_offset = (int)(ecode - md->start_code);
     start of each branch to move the current point backwards, so the code at  
     this level is identical to the lookahead case. */  
1202    
1203      case OP_ASSERT:      for (;;)
     case OP_ASSERTBACK:  
     do  
1204        {        {
1205        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,        if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1206          match_isgroup);        RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1207        if (rrc == MATCH_MATCH) break;          eptrb, RM48);
1208          if (rrc == MATCH_KETRPOS)
1209            {
1210            offset_top = md->end_offset_top;
1211            eptr = md->end_match_ptr;
1212            ecode = md->start_code + code_offset;
1213            matched_once = TRUE;
1214            continue;
1215            }
1216    
1217          /* See comment in the code for capturing groups above about handling
1218          THEN. */
1219    
1220          if (rrc == MATCH_THEN)
1221            {
1222            next = ecode + GET(ecode,1);
1223            if (md->start_match_ptr < next &&
1224                (*ecode == OP_ALT || *next == OP_ALT))
1225              rrc = MATCH_NOMATCH;
1226            }
1227    
1228        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1229        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
1230          if (*ecode != OP_ALT) break;
1231        }        }
     while (*ecode == OP_ALT);  
     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);  
1232    
1233      /* If checking an assertion for a condition, return MATCH_MATCH. */      if (matched_once || allow_zero)
1234          {
1235          ecode += 1 + LINK_SIZE;
1236          break;
1237          }
1238        RRETURN(MATCH_NOMATCH);
1239    
1240      if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);      /* Control never reaches here. */
1241    
1242      /* Continue from after the assertion, updating the offsets high water      /* Conditional group: compilation checked that there are no more than
1243      mark, since extracts may have been taken during the assertion. */      two branches. If the condition is false, skipping the first branch takes us
1244        past the end if there is only one branch, but that's OK because that is
1245        exactly what going to the ket would do. */
1246    
1247      do ecode += GET(ecode,1); while (*ecode == OP_ALT);      case OP_COND:
1248      ecode += 1 + LINK_SIZE;      case OP_SCOND:
1249      offset_top = md->end_offset_top;      codelink = GET(ecode, 1);
     continue;  
1250    
1251      /* Negative assertion: all branches must fail to match */      /* Because of the way auto-callout works during compile, a callout item is
1252        inserted between OP_COND and an assertion condition. */
1253    
1254      case OP_ASSERT_NOT:      if (ecode[LINK_SIZE+1] == OP_CALLOUT)
     case OP_ASSERTBACK_NOT:  
     do  
1255        {        {
1256        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,        if (PUBL(callout) != NULL)
1257          match_isgroup);          {
1258        if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);          PUBL(callout_block) cb;
1259        ecode += GET(ecode, 1);          cb.version          = 2;   /* Version 2 of the callout block */
1260        ecode += GET(ecode,1);          cb.callout_number   = ecode[LINK_SIZE+2];
1261            cb.offset_vector    = md->offset_vector;
1262    #ifdef COMPILE_PCRE8
1263            cb.subject          = (PCRE_SPTR)md->start_subject;
1264    #else
1265            cb.subject          = (PCRE_SPTR16)md->start_subject;
1266    #endif
1267            cb.subject_length   = (int)(md->end_subject - md->start_subject);
1268            cb.start_match      = (int)(mstart - md->start_subject);
1269            cb.current_position = (int)(eptr - md->start_subject);
1270            cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1271            cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1272            cb.capture_top      = offset_top/2;
1273            cb.capture_last     = md->capture_last;
1274            cb.callout_data     = md->callout_data;
1275            cb.mark             = md->nomatch_mark;
1276            if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1277            if (rrc < 0) RRETURN(rrc);
1278            }
1279          ecode += PRIV(OP_lengths)[OP_CALLOUT];
1280        }        }
     while (*ecode == OP_ALT);  
1281    
1282      if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);      condcode = ecode[LINK_SIZE+1];
   
     ecode += 1 + LINK_SIZE;  
     continue;  
1283    
1284      /* Move the subject pointer back. This occurs only at the start of      /* Now see what the actual condition is */
     each branch of a lookbehind assertion. If we are too close to the start to  
     move back, this match function fails. When working with UTF-8 we move  
     back a number of characters, not bytes. */  
1285    
1286      case OP_REVERSE:      if (condcode == OP_RREF || condcode == OP_NRREF)    /* Recursion test */
 #ifdef SUPPORT_UTF8  
     if (utf8)  
1287        {        {
1288        c = GET(ecode,1);        if (md->recursive == NULL)                /* Not recursing => FALSE */
       for (i = 0; i < c; i++)  
1289          {          {
1290          eptr--;          condition = FALSE;
1291          if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);          ecode += GET(ecode, 1);
         BACKCHAR(eptr)  
1292          }          }
1293        }        else
1294      else          {
1295  #endif          int recno = GET2(ecode, LINK_SIZE + 2);   /* Recursion group number */
1296            condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1297    
1298            /* If the test is for recursion into a specific subpattern, and it is
1299            false, but the test was set up by name, scan the table to see if the
1300            name refers to any other numbers, and test them. The condition is true
1301            if any one is set. */
1302    
1303            if (!condition && condcode == OP_NRREF)
1304              {
1305              pcre_uchar *slotA = md->name_table;
1306              for (i = 0; i < md->name_count; i++)
1307                {
1308                if (GET2(slotA, 0) == recno) break;
1309                slotA += md->name_entry_size;
1310                }
1311    
1312              /* Found a name for the number - there can be only one; duplicate
1313              names for different numbers are allowed, but not vice versa. First
1314              scan down for duplicates. */
1315    
1316              if (i < md->name_count)
1317                {
1318                pcre_uchar *slotB = slotA;
1319                while (slotB > md->name_table)
1320                  {
1321                  slotB -= md->name_entry_size;
1322                  if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1323                    {
1324                    condition = GET2(slotB, 0) == md->recursive->group_num;
1325                    if (condition) break;
1326                    }
1327                  else break;
1328                  }
1329    
1330                /* Scan up for duplicates */
1331    
1332                if (!condition)
1333                  {
1334                  slotB = slotA;
1335                  for (i++; i < md->name_count; i++)
1336                    {
1337                    slotB += md->name_entry_size;
1338                    if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1339                      {
1340                      condition = GET2(slotB, 0) == md->recursive->group_num;
1341                      if (condition) break;
1342                      }
1343                    else break;
1344                    }
1345                  }
1346                }
1347              }
1348    
1349            /* Choose branch according to the condition */
1350    
1351            ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1352            }
1353          }
1354    
1355        else if (condcode == OP_CREF || condcode == OP_NCREF)  /* Group used test */
1356          {
1357          offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
1358          condition = offset < offset_top && md->offset_vector[offset] >= 0;
1359    
1360          /* If the numbered capture is unset, but the reference was by name,
1361          scan the table to see if the name refers to any other numbers, and test
1362          them. The condition is true if any one is set. This is tediously similar
1363          to the code above, but not close enough to try to amalgamate. */
1364    
1365          if (!condition && condcode == OP_NCREF)
1366            {
1367            int refno = offset >> 1;
1368            pcre_uchar *slotA = md->name_table;
1369    
1370            for (i = 0; i < md->name_count; i++)
1371              {
1372              if (GET2(slotA, 0) == refno) break;
1373              slotA += md->name_entry_size;
1374              }
1375    
1376            /* Found a name for the number - there can be only one; duplicate names
1377            for different numbers are allowed, but not vice versa. First scan down
1378            for duplicates. */
1379    
1380            if (i < md->name_count)
1381              {
1382              pcre_uchar *slotB = slotA;
1383              while (slotB > md->name_table)
1384                {
1385                slotB -= md->name_entry_size;
1386                if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1387                  {
1388                  offset = GET2(slotB, 0) << 1;
1389                  condition = offset < offset_top &&
1390                    md->offset_vector[offset] >= 0;
1391                  if (condition) break;
1392                  }
1393                else break;
1394                }
1395    
1396              /* Scan up for duplicates */
1397    
1398              if (!condition)
1399                {
1400                slotB = slotA;
1401                for (i++; i < md->name_count; i++)
1402                  {
1403                  slotB += md->name_entry_size;
1404                  if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1405                    {
1406                    offset = GET2(slotB, 0) << 1;
1407                    condition = offset < offset_top &&
1408                      md->offset_vector[offset] >= 0;
1409                    if (condition) break;
1410                    }
1411                  else break;
1412                  }
1413                }
1414              }
1415            }
1416    
1417          /* Choose branch according to the condition */
1418    
1419          ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1420          }
1421    
1422        else if (condcode == OP_DEF)     /* DEFINE - always false */
1423          {
1424          condition = FALSE;
1425          ecode += GET(ecode, 1);
1426          }
1427    
1428        /* The condition is an assertion. Call match() to evaluate it - setting
1429        md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1430        an assertion. */
1431    
1432        else
1433          {
1434          md->match_function_type = MATCH_CONDASSERT;
1435          RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1436          if (rrc == MATCH_MATCH)
1437            {
1438            if (md->end_offset_top > offset_top)
1439              offset_top = md->end_offset_top;  /* Captures may have happened */
1440            condition = TRUE;
1441            ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1442            while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1443            }
1444    
1445          /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1446          assertion; it is therefore treated as NOMATCH. */
1447    
1448          else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1449            {
1450            RRETURN(rrc);         /* Need braces because of following else */
1451            }
1452          else
1453            {
1454            condition = FALSE;
1455            ecode += codelink;
1456            }
1457          }
1458    
1459        /* We are now at the branch that is to be obeyed. As there is only one, can
1460        use tail recursion to avoid using another stack frame, except when there is
1461        unlimited repeat of a possibly empty group. In the latter case, a recursive
1462        call to match() is always required, unless the second alternative doesn't
1463        exist, in which case we can just plough on. Note that, for compatibility
1464        with Perl, the | in a conditional group is NOT treated as creating two
1465        alternatives. If a THEN is encountered in the branch, it propagates out to
1466        the enclosing alternative (unless nested in a deeper set of alternatives,
1467        of course). */
1468    
1469        if (condition || *ecode == OP_ALT)
1470          {
1471          if (op != OP_SCOND)
1472            {
1473            ecode += 1 + LINK_SIZE;
1474            goto TAIL_RECURSE;
1475            }
1476    
1477          md->match_function_type = MATCH_CBEGROUP;
1478          RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1479          RRETURN(rrc);
1480          }
1481    
1482         /* Condition false & no alternative; continue after the group. */
1483    
1484        else
1485          {
1486          ecode += 1 + LINK_SIZE;
1487          }
1488        break;
1489    
1490    
1491        /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1492        to close any currently open capturing brackets. */
1493    
1494        case OP_CLOSE:
1495        number = GET2(ecode, 1);
1496        offset = number << 1;
1497    
1498    #ifdef PCRE_DEBUG
1499          printf("end bracket %d at *ACCEPT", number);
1500          printf("\n");
1501    #endif
1502    
1503        md->capture_last = number;
1504        if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1505          {
1506          md->offset_vector[offset] =
1507            md->offset_vector[md->offset_end - number];
1508          md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1509          if (offset_top <= offset) offset_top = offset + 2;
1510          }
1511        ecode += 1 + IMM2_SIZE;
1512        break;
1513    
1514    
1515        /* End of the pattern, either real or forced. */
1516    
1517        case OP_END:
1518        case OP_ACCEPT:
1519        case OP_ASSERT_ACCEPT:
1520    
1521        /* If we have matched an empty string, fail if not in an assertion and not
1522        in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1523        is set and we have matched at the start of the subject. In both cases,
1524        backtracking will then try other alternatives, if any. */
1525    
1526        if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1527             md->recursive == NULL &&
1528             (md->notempty ||
1529               (md->notempty_atstart &&
1530                 mstart == md->start_subject + md->start_offset)))
1531          RRETURN(MATCH_NOMATCH);
1532    
1533        /* Otherwise, we have a match. */
1534    
1535        md->end_match_ptr = eptr;           /* Record where we ended */
1536        md->end_offset_top = offset_top;    /* and how many extracts were taken */
1537        md->start_match_ptr = mstart;       /* and the start (\K can modify) */
1538    
1539        /* For some reason, the macros don't work properly if an expression is
1540        given as the argument to RRETURN when the heap is in use. */
1541    
1542        rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1543        RRETURN(rrc);
1544    
1545        /* Assertion brackets. Check the alternative branches in turn - the
1546        matching won't pass the KET for an assertion. If any one branch matches,
1547        the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1548        start of each branch to move the current point backwards, so the code at
1549        this level is identical to the lookahead case. When the assertion is part
1550        of a condition, we want to return immediately afterwards. The caller of
1551        this incarnation of the match() function will have set MATCH_CONDASSERT in
1552        md->match_function_type, and one of these opcodes will be the first opcode
1553        that is processed. We use a local variable that is preserved over calls to
1554        match() to remember this case. */
1555    
1556        case OP_ASSERT:
1557        case OP_ASSERTBACK:
1558        save_mark = md->mark;
1559        if (md->match_function_type == MATCH_CONDASSERT)
1560          {
1561          condassert = TRUE;
1562          md->match_function_type = 0;
1563          }
1564        else condassert = FALSE;
1565    
1566        do
1567          {
1568          RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1569          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1570            {
1571            mstart = md->start_match_ptr;   /* In case \K reset it */
1572            break;
1573            }
1574    
1575          /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1576          as NOMATCH. */
1577    
1578          if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1579          ecode += GET(ecode, 1);
1580          md->mark = save_mark;
1581          }
1582        while (*ecode == OP_ALT);
1583    
1584        if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1585    
1586        /* If checking an assertion for a condition, return MATCH_MATCH. */
1587    
1588        if (condassert) RRETURN(MATCH_MATCH);
1589    
1590        /* Continue from after the assertion, updating the offsets high water
1591        mark, since extracts may have been taken during the assertion. */
1592    
1593        do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1594        ecode += 1 + LINK_SIZE;
1595        offset_top = md->end_offset_top;
1596        continue;
1597    
1598        /* Negative assertion: all branches must fail to match. Encountering SKIP,
1599        PRUNE, or COMMIT means we must assume failure without checking subsequent
1600        branches. */
1601    
1602        case OP_ASSERT_NOT:
1603        case OP_ASSERTBACK_NOT:
1604        save_mark = md->mark;
1605        if (md->match_function_type == MATCH_CONDASSERT)
1606          {
1607          condassert = TRUE;
1608          md->match_function_type = 0;
1609          }
1610        else condassert = FALSE;
1611    
1612        do
1613          {
1614          RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1615          md->mark = save_mark;
1616          if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1617          if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1618            {
1619            do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1620            break;
1621            }
1622    
1623          /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1624          as NOMATCH. */
1625    
1626          if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1627          ecode += GET(ecode,1);
1628          }
1629        while (*ecode == OP_ALT);
1630    
1631        if (condassert) RRETURN(MATCH_MATCH);  /* Condition assertion */
1632    
1633        ecode += 1 + LINK_SIZE;
1634        continue;
1635    
1636        /* Move the subject pointer back. This occurs only at the start of
1637        each branch of a lookbehind assertion. If we are too close to the start to
1638        move back, this match function fails. When working with UTF-8 we move
1639        back a number of characters, not bytes. */
1640    
1641        case OP_REVERSE:
1642    #ifdef SUPPORT_UTF
1643        if (utf)
1644          {
1645          i = GET(ecode, 1);
1646          while (i-- > 0)
1647            {
1648            eptr--;
1649            if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1650            BACKCHAR(eptr);
1651            }
1652          }
1653        else
1654    #endif
1655    
1656      /* No UTF-8 support, or not in UTF-8 mode: count is byte count */      /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1657    
1658        {        {
1659        eptr -= GET(ecode,1);        eptr -= GET(ecode, 1);
1660        if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);        if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1661        }        }
1662    
1663      /* Skip to next op code */      /* Save the earliest consulted character, then skip to next op code */
1664    
1665        if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1666      ecode += 1 + LINK_SIZE;      ecode += 1 + LINK_SIZE;
1667      break;      break;
1668    
# Line 794  for (;;) Line 1671  for (;;)
1671      function is able to force a failure. */      function is able to force a failure. */
1672    
1673      case OP_CALLOUT:      case OP_CALLOUT:
1674      if (pcre_callout != NULL)      if (PUBL(callout) != NULL)
1675        {        {
1676        pcre_callout_block cb;        PUBL(callout_block) cb;
1677        cb.version          = 1;   /* Version 1 of the callout block */        cb.version          = 2;   /* Version 2 of the callout block */
1678        cb.callout_number   = ecode[1];        cb.callout_number   = ecode[1];
1679        cb.offset_vector    = md->offset_vector;        cb.offset_vector    = md->offset_vector;
1680        cb.subject          = (const char *)md->start_subject;  #ifdef COMPILE_PCRE8
1681        cb.subject_length   = md->end_subject - md->start_subject;        cb.subject          = (PCRE_SPTR)md->start_subject;
1682        cb.start_match      = md->start_match - md->start_subject;  #else
1683        cb.current_position = eptr - md->start_subject;        cb.subject          = (PCRE_SPTR16)md->start_subject;
1684    #endif
1685          cb.subject_length   = (int)(md->end_subject - md->start_subject);
1686          cb.start_match      = (int)(mstart - md->start_subject);
1687          cb.current_position = (int)(eptr - md->start_subject);
1688        cb.pattern_position = GET(ecode, 2);        cb.pattern_position = GET(ecode, 2);
1689        cb.next_item_length = GET(ecode, 2 + LINK_SIZE);        cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1690        cb.capture_top      = offset_top/2;        cb.capture_top      = offset_top/2;
1691        cb.capture_last     = md->capture_last;        cb.capture_last     = md->capture_last;
1692        cb.callout_data     = md->callout_data;        cb.callout_data     = md->callout_data;
1693        if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);        cb.mark             = md->nomatch_mark;
1694          if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1695        if (rrc < 0) RRETURN(rrc);        if (rrc < 0) RRETURN(rrc);
1696        }        }
1697      ecode += 2 + 2*LINK_SIZE;      ecode += 2 + 2*LINK_SIZE;
# Line 819  for (;;) Line 1701  for (;;)
1701      offset data is the offset to the starting bracket from the start of the      offset data is the offset to the starting bracket from the start of the
1702      whole pattern. (This is so that it works from duplicated subpatterns.)      whole pattern. (This is so that it works from duplicated subpatterns.)
1703    
1704      If there are any capturing brackets started but not finished, we have to      The state of the capturing groups is preserved over recursion, and
1705      save their starting points and reinstate them after the recursion. However,      re-instated afterwards. We don't know how many are started and not yet
1706      we don't know how many such there are (offset_top records the completed      finished (offset_top records the completed total) so we just have to save
1707      total) so we just have to save all the potential data. There may be up to      all the potential data. There may be up to 65535 such values, which is too
1708      65535 such values, which is too large to put on the stack, but using malloc      large to put on the stack, but using malloc for small numbers seems
1709      for small numbers seems expensive. As a compromise, the stack is used when      expensive. As a compromise, the stack is used when there are no more than
1710      there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc      REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
     is used. A problem is what to do if the malloc fails ... there is no way of  
     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX  
     values on the stack, and accept that the rest may be wrong.  
1711    
1712      There are also other values that have to be saved. We use a chained      There are also other values that have to be saved. We use a chained
1713      sequence of blocks that actually live on the stack. Thanks to Robin Houston      sequence of blocks that actually live on the stack. Thanks to Robin Houston
1714      for the original version of this logic. */      for the original version of this logic. It has, however, been hacked around
1715        a lot, so he is not to blame for the current way it works. */
1716    
1717      case OP_RECURSE:      case OP_RECURSE:
1718        {        {
1719        callpat = md->start_code + GET(ecode, 1);        recursion_info *ri;
1720        new_recursive.group_num = *callpat - OP_BRA;        int recno;
1721    
1722        /* For extended extraction brackets (large number), we have to fish out        callpat = md->start_code + GET(ecode, 1);
1723        the number from a dummy opcode at the start. */        recno = (callpat == md->start_code)? 0 :
1724            GET2(callpat, 1 + LINK_SIZE);
1725    
1726        if (new_recursive.group_num > EXTRACT_BASIC_MAX)        /* Check for repeating a recursion without advancing the subject pointer.
1727          new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);        This should catch convoluted mutual recursions. (Some simple cases are
1728          caught at compile time.) */
1729    
1730          for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1731            if (recno == ri->group_num && eptr == ri->subject_position)
1732              RRETURN(PCRE_ERROR_RECURSELOOP);
1733    
1734        /* Add to "recursing stack" */        /* Add to "recursing stack" */
1735    
1736          new_recursive.group_num = recno;
1737          new_recursive.subject_position = eptr;
1738        new_recursive.prevrec = md->recursive;        new_recursive.prevrec = md->recursive;
1739        md->recursive = &new_recursive;        md->recursive = &new_recursive;
1740    
1741        /* Find where to continue from afterwards */        /* Where to continue from afterwards */
1742    
1743        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
       new_recursive.after_call = ecode;  
1744    
1745        /* Now save the offset data. */        /* Now save the offset data */
1746    
1747        new_recursive.saved_max = md->offset_end;        new_recursive.saved_max = md->offset_end;
1748        if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)        if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
# Line 863  for (;;) Line 1750  for (;;)
1750        else        else
1751          {          {
1752          new_recursive.offset_save =          new_recursive.offset_save =
1753            (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));            (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1754          if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);          if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1755          }          }
   
1756        memcpy(new_recursive.offset_save, md->offset_vector,        memcpy(new_recursive.offset_save, md->offset_vector,
1757              new_recursive.saved_max * sizeof(int));              new_recursive.saved_max * sizeof(int));
       new_recursive.save_start = md->start_match;  
       md->start_match = eptr;  
1758    
1759        /* OK, now we can do the recursion. For each top-level alternative we        /* OK, now we can do the recursion. After processing each alternative,
1760        restore the offset and recursion data. */        restore the offset data. If there were nested recursions, md->recursive
1761          might be changed, so reset it before looping. */
1762    
1763        DPRINTF(("Recursing into group %d\n", new_recursive.group_num));        DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1764          cbegroup = (*callpat >= OP_SBRA);
1765        do        do
1766          {          {
1767          RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,          if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1768              eptrb, match_isgroup);          RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1769          if (rrc == MATCH_MATCH)            md, eptrb, RM6);
1770            memcpy(md->offset_vector, new_recursive.offset_save,
1771                new_recursive.saved_max * sizeof(int));
1772            md->recursive = new_recursive.prevrec;
1773            if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1774              {
1775              DPRINTF(("Recursion matched\n"));
1776              if (new_recursive.offset_save != stacksave)
1777                (PUBL(free))(new_recursive.offset_save);
1778    
1779              /* Set where we got to in the subject, and reset the start in case
1780              it was changed by \K. This *is* propagated back out of a recursion,
1781              for Perl compatibility. */
1782    
1783              eptr = md->end_match_ptr;
1784              mstart = md->start_match_ptr;
1785              goto RECURSION_MATCHED;        /* Exit loop; end processing */
1786              }
1787    
1788            /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1789            as NOMATCH. */
1790    
1791            else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1792            {            {
1793            md->recursive = new_recursive.prevrec;            DPRINTF(("Recursion gave error %d\n", rrc));
1794            if (new_recursive.offset_save != stacksave)            if (new_recursive.offset_save != stacksave)
1795              (pcre_free)(new_recursive.offset_save);              (PUBL(free))(new_recursive.offset_save);
1796            RRETURN(MATCH_MATCH);            RRETURN(rrc);
1797            }            }
         else if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
1798    
1799          md->recursive = &new_recursive;          md->recursive = &new_recursive;
         memcpy(md->offset_vector, new_recursive.offset_save,  
             new_recursive.saved_max * sizeof(int));  
1800          callpat += GET(callpat, 1);          callpat += GET(callpat, 1);
1801          }          }
1802        while (*callpat == OP_ALT);        while (*callpat == OP_ALT);
# Line 899  for (;;) Line 1804  for (;;)
1804        DPRINTF(("Recursion didn't match\n"));        DPRINTF(("Recursion didn't match\n"));
1805        md->recursive = new_recursive.prevrec;        md->recursive = new_recursive.prevrec;
1806        if (new_recursive.offset_save != stacksave)        if (new_recursive.offset_save != stacksave)
1807          (pcre_free)(new_recursive.offset_save);          (PUBL(free))(new_recursive.offset_save);
1808        RRETURN(MATCH_NOMATCH);        RRETURN(MATCH_NOMATCH);
1809        }        }
     /* Control never reaches here */  
   
     /* "Once" brackets are like assertion brackets except that after a match,  
     the point in the subject string is not moved back. Thus there can never be  
     a move back into the brackets. Friedl calls these "atomic" subpatterns.  
     Check the alternative branches in turn - the matching won't pass the KET  
     for this kind of subpattern. If any one branch matches, we carry on as at  
     the end of a normal bracket, leaving the subject pointer. */  
   
     case OP_ONCE:  
       {  
       prev = ecode;  
       saved_eptr = eptr;  
   
       do  
         {  
         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,  
           eptrb, match_isgroup);  
         if (rrc == MATCH_MATCH) break;  
         if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
         ecode += GET(ecode,1);  
         }  
       while (*ecode == OP_ALT);  
   
       /* If hit the end of the group (which could be repeated), fail */  
   
       if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);  
   
       /* Continue as from after the assertion, updating the offsets high water  
       mark, since extracts may have been taken. */  
   
       do ecode += GET(ecode,1); while (*ecode == OP_ALT);  
   
       offset_top = md->end_offset_top;  
       eptr = md->end_match_ptr;  
   
       /* For a non-repeating ket, just continue at this level. This also  
       happens for a repeating ket if no characters were matched in the group.  
       This is the forcible breaking of infinite loops as implemented in Perl  
       5.005. If there is an options reset, it will get obeyed in the normal  
       course of events. */  
   
       if (*ecode == OP_KET || eptr == saved_eptr)  
         {  
         ecode += 1+LINK_SIZE;  
         break;  
         }  
   
       /* The repeating kets try the rest of the pattern or restart from the  
       preceding bracket, in the appropriate order. We need to reset any options  
       that changed within the bracket before re-running it, so check the next  
       opcode. */  
   
       if (ecode[1+LINK_SIZE] == OP_OPT)  
         {  
         ims = (ims & ~PCRE_IMS) | ecode[4];  
         DPRINTF(("ims set to %02lx at group repeat\n", ims));  
         }  
1810    
1811        if (*ecode == OP_KETRMIN)      RECURSION_MATCHED:
1812          {      break;
         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);  
         if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);  
         if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
         }  
       else  /* OP_KETRMAX */  
         {  
         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);  
         if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);  
         if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
         }  
       }  
     RRETURN(MATCH_NOMATCH);  
1813    
1814      /* An alternation is the end of a branch; scan along to find the end of the      /* An alternation is the end of a branch; scan along to find the end of the
1815      bracketed group and go to there. */      bracketed group and go to there. */
# Line 985  for (;;) Line 1818  for (;;)
1818      do ecode += GET(ecode,1); while (*ecode == OP_ALT);      do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1819      break;      break;
1820    
1821      /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating      /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1822      that it may occur zero times. It may repeat infinitely, or not at all -      indicating that it may occur zero times. It may repeat infinitely, or not
1823      i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper      at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1824      repeat limits are compiled as a number of copies, with the optional ones      with fixed upper repeat limits are compiled as a number of copies, with the
1825      preceded by BRAZERO or BRAMINZERO. */      optional ones preceded by BRAZERO or BRAMINZERO. */
1826    
1827      case OP_BRAZERO:      case OP_BRAZERO:
1828        {      next = ecode + 1;
1829        next = ecode+1;      RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1830        RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1831        if (rrc != MATCH_NOMATCH) RRETURN(rrc);      do next += GET(next, 1); while (*next == OP_ALT);
1832        do next += GET(next,1); while (*next == OP_ALT);      ecode = next + 1 + LINK_SIZE;
       ecode = next + 1+LINK_SIZE;  
       }  
1833      break;      break;
1834    
1835      case OP_BRAMINZERO:      case OP_BRAMINZERO:
1836        {      next = ecode + 1;
1837        next = ecode+1;      do next += GET(next, 1); while (*next == OP_ALT);
1838        do next += GET(next,1); while (*next == OP_ALT);      RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1839        RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1840          match_isgroup);      ecode++;
1841        if (rrc != MATCH_NOMATCH) RRETURN(rrc);      break;
1842        ecode++;  
1843        }      case OP_SKIPZERO:
1844        next = ecode+1;
1845        do next += GET(next,1); while (*next == OP_ALT);
1846        ecode = next + 1 + LINK_SIZE;
1847      break;      break;
1848    
1849      /* End of a group, repeated or non-repeating. If we are at the end of      /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1850      an assertion "group", stop matching and return MATCH_MATCH, but record the      here; just jump to the group, with allow_zero set TRUE. */
1851      current high water mark for use by positive assertions. Do this also  
1852      for the "once" (not-backup up) groups. */      case OP_BRAPOSZERO:
1853        op = *(++ecode);
1854        allow_zero = TRUE;
1855        if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1856          goto POSSESSIVE_NON_CAPTURE;
1857    
1858        /* End of a group, repeated or non-repeating. */
1859    
1860      case OP_KET:      case OP_KET:
1861      case OP_KETRMIN:      case OP_KETRMIN:
1862      case OP_KETRMAX:      case OP_KETRMAX:
1863        {      case OP_KETRPOS:
1864        prev = ecode - GET(ecode, 1);      prev = ecode - GET(ecode, 1);
       saved_eptr = eptrb->epb_saved_eptr;  
1865    
1866        /* Back up the stack of bracket start pointers. */      /* If this was a group that remembered the subject start, in order to break
1867        infinite repeats of empty string matches, retrieve the subject start from
1868        the chain. Otherwise, set it NULL. */
1869    
1870        eptrb = eptrb->epb_prev;      if (*prev >= OP_SBRA || *prev == OP_ONCE)
1871          {
1872          saved_eptr = eptrb->epb_saved_eptr;   /* Value at start of group */
1873          eptrb = eptrb->epb_prev;              /* Backup to previous group */
1874          }
1875        else saved_eptr = NULL;
1876    
1877        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||      /* If we are at the end of an assertion group or a non-capturing atomic
1878            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||      group, stop matching and return MATCH_MATCH, but record the current high
1879            *prev == OP_ONCE)      water mark for use by positive assertions. We also need to record the match
1880          {      start in case it was changed by \K. */
         md->end_match_ptr = eptr;      /* For ONCE */  
         md->end_offset_top = offset_top;  
         RRETURN(MATCH_MATCH);  
         }  
1881    
1882        /* In all other cases except a conditional group we have to check the      if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1883        group number back at the start and if necessary complete handling an           *prev == OP_ONCE_NC)
1884        extraction by setting the offsets and bumping the high water mark. */        {
1885          md->end_match_ptr = eptr;      /* For ONCE_NC */
1886          md->end_offset_top = offset_top;
1887          md->start_match_ptr = mstart;
1888          RRETURN(MATCH_MATCH);         /* Sets md->mark */
1889          }
1890    
1891        if (*prev != OP_COND)      /* For capturing groups we have to check the group number back at the start
1892          {      and if necessary complete handling an extraction by setting the offsets and
1893          number = *prev - OP_BRA;      bumping the high water mark. Whole-pattern recursion is coded as a recurse
1894        into group 0, so it won't be picked up here. Instead, we catch it when the
1895        OP_END is reached. Other recursion is handled here. We just have to record
1896        the current subject position and start match pointer and give a MATCH
1897        return. */
1898    
1899          /* For extended extraction brackets (large number), we have to fish out      if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1900          the number from a dummy opcode at the start. */          *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1901          {
1902          if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);        number = GET2(prev, 1+LINK_SIZE);
1903          offset = number << 1;        offset = number << 1;
1904    
1905  #ifdef DEBUG  #ifdef PCRE_DEBUG
1906          printf("end bracket %d", number);        printf("end bracket %d", number);
1907          printf("\n");        printf("\n");
1908  #endif  #endif
1909    
1910          /* Test for a numbered group. This includes groups called as a result        /* Handle a recursively called group. */
         of recursion. Note that whole-pattern recursion is coded as a recurse  
         into group 0, so it won't be picked up here. Instead, we catch it when  
         the OP_END is reached. */  
1911    
1912          if (number > 0)        if (md->recursive != NULL && md->recursive->group_num == number)
1913            {          {
1914            md->capture_last = number;          md->end_match_ptr = eptr;
1915            if (offset >= md->offset_max) md->offset_overflow = TRUE; else          md->start_match_ptr = mstart;
1916              {          RRETURN(MATCH_MATCH);
1917              md->offset_vector[offset] =          }
               md->offset_vector[md->offset_end - number];  
             md->offset_vector[offset+1] = eptr - md->start_subject;  
             if (offset_top <= offset) offset_top = offset + 2;  
             }  
1918    
1919            /* Handle a recursively called group. Restore the offsets        /* Deal with capturing */
           appropriately and continue from after the call. */  
1920    
1921            if (md->recursive != NULL && md->recursive->group_num == number)        md->capture_last = number;
1922              {        if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1923              recursion_info *rec = md->recursive;          {
1924              DPRINTF(("Recursion (%d) succeeded - continuing\n", number));          /* If offset is greater than offset_top, it means that we are
1925              md->recursive = rec->prevrec;          "skipping" a capturing group, and that group's offsets must be marked
1926              md->start_match = rec->save_start;          unset. In earlier versions of PCRE, all the offsets were unset at the
1927              memcpy(md->offset_vector, rec->offset_save,          start of matching, but this doesn't work because atomic groups and
1928                rec->saved_max * sizeof(int));          assertions can cause a value to be set that should later be unset.
1929              ecode = rec->after_call;          Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1930              ims = original_ims;          part of the atomic group, but this is not on the final matching path,
1931              break;          so must be unset when 2 is set. (If there is no group 2, there is no
1932              }          problem, because offset_top will then be 2, indicating no capture.) */
1933    
1934            if (offset > offset_top)
1935              {
1936              register int *iptr = md->offset_vector + offset_top;
1937              register int *iend = md->offset_vector + offset;
1938              while (iptr < iend) *iptr++ = -1;
1939            }            }
         }  
1940    
1941        /* Reset the value of the ims flags, in case they got changed during          /* Now make the extraction */
       the group. */  
1942    
1943        ims = original_ims;          md->offset_vector[offset] =
1944        DPRINTF(("ims reset to %02lx\n", ims));            md->offset_vector[md->offset_end - number];
1945            md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1946            if (offset_top <= offset) offset_top = offset + 2;
1947            }
1948          }
1949    
1950        /* For a non-repeating ket, just continue at this level. This also      /* For an ordinary non-repeating ket, just continue at this level. This
1951        happens for a repeating ket if no characters were matched in the group.      also happens for a repeating ket if no characters were matched in the
1952        This is the forcible breaking of infinite loops as implemented in Perl      group. This is the forcible breaking of infinite loops as implemented in
1953        5.005. If there is an options reset, it will get obeyed in the normal      Perl 5.005. For a non-repeating atomic group that includes captures,
1954        course of events. */      establish a backup point by processing the rest of the pattern at a lower
1955        level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1956        original OP_ONCE level, thereby bypassing intermediate backup points, but
1957        resetting any captures that happened along the way. */
1958    
1959        if (*ecode == OP_KET || eptr == saved_eptr)      if (*ecode == OP_KET || eptr == saved_eptr)
1960          {
1961          if (*prev == OP_ONCE)
1962          {          {
1963          ecode += 1 + LINK_SIZE;          RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1964          break;          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1965            md->once_target = prev;  /* Level at which to change to MATCH_NOMATCH */
1966            RRETURN(MATCH_ONCE);
1967          }          }
1968          ecode += 1 + LINK_SIZE;    /* Carry on at this level */
1969          break;
1970          }
1971    
1972        /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1973        and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1974        at a time from the outer level, thus saving stack. */
1975    
1976        /* The repeating kets try the rest of the pattern or restart from the      if (*ecode == OP_KETRPOS)
1977        preceding bracket, in the appropriate order. */        {
1978          md->end_match_ptr = eptr;
1979          md->end_offset_top = offset_top;
1980          RRETURN(MATCH_KETRPOS);
1981          }
1982    
1983        if (*ecode == OP_KETRMIN)      /* The normal repeating kets try the rest of the pattern or restart from
1984        the preceding bracket, in the appropriate order. In the second case, we can
1985        use tail recursion to avoid using another stack frame, unless we have an
1986        atomic group or an unlimited repeat of a group that can match an empty
1987        string. */
1988    
1989        if (*ecode == OP_KETRMIN)
1990          {
1991          RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1992          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1993          if (*prev == OP_ONCE)
1994          {          {
1995          RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);          RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
         if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);  
1996          if (rrc != MATCH_NOMATCH) RRETURN(rrc);          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997            md->once_target = prev;  /* Level at which to change to MATCH_NOMATCH */
1998            RRETURN(MATCH_ONCE);
1999          }          }
2000        else  /* OP_KETRMAX */        if (*prev >= OP_SBRA)    /* Could match an empty string */
2001          {          {
2002          RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);          md->match_function_type = MATCH_CBEGROUP;
2003          if (rrc != MATCH_NOMATCH) RRETURN(rrc);          RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2004          RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);          RRETURN(rrc);
2005            }
2006          ecode = prev;
2007          goto TAIL_RECURSE;
2008          }
2009        else  /* OP_KETRMAX */
2010          {
2011          if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
2012          RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2013          if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2014          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2015          if (*prev == OP_ONCE)
2016            {
2017            RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2018          if (rrc != MATCH_NOMATCH) RRETURN(rrc);          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019            md->once_target = prev;
2020            RRETURN(MATCH_ONCE);
2021          }          }
2022          ecode += 1 + LINK_SIZE;
2023          goto TAIL_RECURSE;
2024        }        }
2025        /* Control never gets here */
2026    
2027      RRETURN(MATCH_NOMATCH);      /* Not multiline mode: start of subject assertion, unless notbol. */
   
     /* Start of subject unless notbol, or after internal newline if multiline */  
2028    
2029      case OP_CIRC:      case OP_CIRC:
2030      if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);      if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
     if ((ims & PCRE_MULTILINE) != 0)  
       {  
       if (eptr != md->start_subject && eptr[-1] != NEWLINE)  
         RRETURN(MATCH_NOMATCH);  
       ecode++;  
       break;  
       }  
     /* ... else fall through */  
2031    
2032      /* Start of subject assertion */      /* Start of subject assertion */
2033    
# Line 1149  for (;;) Line 2036  for (;;)
2036      ecode++;      ecode++;
2037      break;      break;
2038    
2039        /* Multiline mode: start of subject unless notbol, or after any newline. */
2040    
2041        case OP_CIRCM:
2042        if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2043        if (eptr != md->start_subject &&
2044            (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2045          RRETURN(MATCH_NOMATCH);
2046        ecode++;
2047        break;
2048    
2049      /* Start of match assertion */      /* Start of match assertion */
2050    
2051      case OP_SOM:      case OP_SOM:
# Line 1156  for (;;) Line 2053  for (;;)
2053      ecode++;      ecode++;
2054      break;      break;
2055    
2056      /* Assert before internal newline if multiline, or before a terminating      /* Reset the start of match point */
     newline unless endonly is set, else end of subject unless noteol is set. */  
2057    
2058      case OP_DOLL:      case OP_SET_SOM:
2059      if ((ims & PCRE_MULTILINE) != 0)      mstart = eptr;
2060        {      ecode++;
2061        if (eptr < md->end_subject)      break;
2062          { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }  
2063        else      /* Multiline mode: assert before any newline, or before end of subject
2064          { if (md->noteol) RRETURN(MATCH_NOMATCH); }      unless noteol is set. */
2065        ecode++;  
2066        break;      case OP_DOLLM:
2067        if (eptr < md->end_subject)
2068          {
2069          if (!IS_NEWLINE(eptr))
2070            {
2071            if (md->partial != 0 &&
2072                eptr + 1 >= md->end_subject &&
2073                NLBLOCK->nltype == NLTYPE_FIXED &&
2074                NLBLOCK->nllen == 2 &&
2075                *eptr == NLBLOCK->nl[0])
2076              {
2077              md->hitend = TRUE;
2078              if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2079              }
2080            RRETURN(MATCH_NOMATCH);
2081            }
2082        }        }
2083      else      else
2084        {        {
2085        if (md->noteol) RRETURN(MATCH_NOMATCH);        if (md->noteol) RRETURN(MATCH_NOMATCH);
2086        if (!md->endonly)        SCHECK_PARTIAL();
         {  
         if (eptr < md->end_subject - 1 ||  
            (eptr == md->end_subject - 1 && *eptr != NEWLINE))  
           RRETURN(MATCH_NOMATCH);  
         ecode++;  
         break;  
         }  
2087        }        }
2088      /* ... else fall through */      ecode++;
2089        break;
2090    
2091        /* Not multiline mode: assert before a terminating newline or before end of
2092        subject unless noteol is set. */
2093    
2094        case OP_DOLL:
2095        if (md->noteol) RRETURN(MATCH_NOMATCH);
2096        if (!md->endonly) goto ASSERT_NL_OR_EOS;
2097    
2098        /* ... else fall through for endonly */
2099    
2100      /* End of subject assertion (\z) */      /* End of subject assertion (\z) */
2101    
2102      case OP_EOD:      case OP_EOD:
2103      if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2104        SCHECK_PARTIAL();
2105      ecode++;      ecode++;
2106      break;      break;
2107    
2108      /* End of subject or ending \n assertion (\Z) */      /* End of subject or ending \n assertion (\Z) */
2109    
2110      case OP_EODN:      case OP_EODN:
2111      if (eptr < md->end_subject - 1 ||      ASSERT_NL_OR_EOS:
2112         (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);      if (eptr < md->end_subject &&
2113            (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2114          {
2115          if (md->partial != 0 &&
2116              eptr + 1 >= md->end_subject &&
2117              NLBLOCK->nltype == NLTYPE_FIXED &&
2118              NLBLOCK->nllen == 2 &&
2119              *eptr == NLBLOCK->nl[0])
2120            {
2121            md->hitend = TRUE;
2122            if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2123            }
2124          RRETURN(MATCH_NOMATCH);
2125          }
2126    
2127        /* Either at end of string or \n before end. */
2128    
2129        SCHECK_PARTIAL();
2130      ecode++;      ecode++;
2131      break;      break;
2132    
# Line 1206  for (;;) Line 2138  for (;;)
2138    
2139        /* Find out if the previous and current characters are "word" characters.        /* Find out if the previous and current characters are "word" characters.
2140        It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to        It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2141        be "non-word" characters. */        be "non-word" characters. Remember the earliest consulted character for
2142          partial matching. */
2143    
2144  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2145        if (utf8)        if (utf)
2146          {          {
2147            /* Get status of previous character */
2148    
2149          if (eptr == md->start_subject) prev_is_word = FALSE; else          if (eptr == md->start_subject) prev_is_word = FALSE; else
2150            {            {
2151            const uschar *lastptr = eptr - 1;            PCRE_PUCHAR lastptr = eptr - 1;
2152            while((*lastptr & 0xc0) == 0x80) lastptr--;            BACKCHAR(lastptr);
2153              if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2154            GETCHAR(c, lastptr);            GETCHAR(c, lastptr);
2155    #ifdef SUPPORT_UCP
2156              if (md->use_ucp)
2157                {
2158                if (c == '_') prev_is_word = TRUE; else
2159                  {
2160                  int cat = UCD_CATEGORY(c);
2161                  prev_is_word = (cat == ucp_L || cat == ucp_N);
2162                  }
2163                }
2164              else
2165    #endif
2166            prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;            prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2167            }            }
2168          if (eptr >= md->end_subject) cur_is_word = FALSE; else  
2169            /* Get status of next character */
2170    
2171            if (eptr >= md->end_subject)
2172              {
2173              SCHECK_PARTIAL();
2174              cur_is_word = FALSE;
2175              }
2176            else
2177            {            {
2178            GETCHAR(c, eptr);            GETCHAR(c, eptr);
2179    #ifdef SUPPORT_UCP
2180              if (md->use_ucp)
2181                {
2182                if (c == '_') cur_is_word = TRUE; else
2183                  {
2184                  int cat = UCD_CATEGORY(c);
2185                  cur_is_word = (cat == ucp_L || cat == ucp_N);
2186                  }
2187                }
2188              else
2189    #endif
2190            cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;            cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2191            }            }
2192          }          }
2193        else        else
2194  #endif  #endif
2195    
2196        /* More streamlined when not in UTF-8 mode */        /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2197          consistency with the behaviour of \w we do use it in this case. */
2198    
2199          {          {
2200          prev_is_word = (eptr != md->start_subject) &&          /* Get status of previous character */
2201            ((md->ctypes[eptr[-1]] & ctype_word) != 0);  
2202          cur_is_word = (eptr < md->end_subject) &&          if (eptr == md->start_subject) prev_is_word = FALSE; else
2203            ((md->ctypes[*eptr] & ctype_word) != 0);            {
2204              if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2205    #ifdef SUPPORT_UCP
2206              if (md->use_ucp)
2207                {
2208                c = eptr[-1];
2209                if (c == '_') prev_is_word = TRUE; else
2210                  {
2211                  int cat = UCD_CATEGORY(c);
2212                  prev_is_word = (cat == ucp_L || cat == ucp_N);
2213                  }
2214                }
2215              else
2216    #endif
2217              prev_is_word = MAX_255(eptr[-1])
2218                && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2219              }
2220    
2221            /* Get status of next character */
2222    
2223            if (eptr >= md->end_subject)
2224              {
2225              SCHECK_PARTIAL();
2226              cur_is_word = FALSE;
2227              }
2228            else
2229    #ifdef SUPPORT_UCP
2230            if (md->use_ucp)
2231              {
2232              c = *eptr;
2233              if (c == '_') cur_is_word = TRUE; else
2234                {
2235                int cat = UCD_CATEGORY(c);
2236                cur_is_word = (cat == ucp_L || cat == ucp_N);
2237                }
2238              }
2239            else
2240    #endif
2241            cur_is_word = MAX_255(*eptr)
2242              && ((md->ctypes[*eptr] & ctype_word) != 0);
2243          }          }
2244    
2245        /* Now see if the situation is what we want */        /* Now see if the situation is what we want */
# Line 1244  for (;;) Line 2250  for (;;)
2250        }        }
2251      break;      break;
2252    
2253      /* Match a single character type; inline for speed */      /* Match any single character type except newline; have to take care with
2254        CRLF newlines and partial matching. */
2255    
2256      case OP_ANY:      case OP_ANY:
2257      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)      if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2258        if (md->partial != 0 &&
2259            eptr + 1 >= md->end_subject &&
2260            NLBLOCK->nltype == NLTYPE_FIXED &&
2261            NLBLOCK->nllen == 2 &&
2262            *eptr == NLBLOCK->nl[0])
2263          {
2264          md->hitend = TRUE;
2265          if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2266          }
2267    
2268        /* Fall through */
2269    
2270        /* Match any single character whatsoever. */
2271    
2272        case OP_ALLANY:
2273        if (eptr >= md->end_subject)   /* DO NOT merge the eptr++ here; it must */
2274          {                            /* not be updated before SCHECK_PARTIAL. */
2275          SCHECK_PARTIAL();
2276        RRETURN(MATCH_NOMATCH);        RRETURN(MATCH_NOMATCH);
2277      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);        }
2278  #ifdef SUPPORT_UTF8      eptr++;
2279      if (utf8)  #ifdef SUPPORT_UTF
2280        while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;      if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2281  #endif  #endif
2282      ecode++;      ecode++;
2283      break;      break;
# Line 1261  for (;;) Line 2286  for (;;)
2286      any byte, even newline, independent of the setting of PCRE_DOTALL. */      any byte, even newline, independent of the setting of PCRE_DOTALL. */
2287    
2288      case OP_ANYBYTE:      case OP_ANYBYTE:
2289      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject)   /* DO NOT merge the eptr++ here; it must */
2290          {                            /* not be updated before SCHECK_PARTIAL. */
2291          SCHECK_PARTIAL();
2292          RRETURN(MATCH_NOMATCH);
2293          }
2294        eptr++;
2295      ecode++;      ecode++;
2296      break;      break;
2297    
2298      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
2299      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject)
2300          {
2301          SCHECK_PARTIAL();
2302          RRETURN(MATCH_NOMATCH);
2303          }
2304      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
2305      if (      if (
2306  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2307         c < 256 &&         c < 256 &&
2308  #endif  #endif
2309         (md->ctypes[c] & ctype_digit) != 0         (md->ctypes[c] & ctype_digit) != 0
# Line 1279  for (;;) Line 2313  for (;;)
2313      break;      break;
2314    
2315      case OP_DIGIT:      case OP_DIGIT:
2316      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject)
2317          {
2318          SCHECK_PARTIAL();
2319          RRETURN(MATCH_NOMATCH);
2320          }
2321      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
2322      if (      if (
2323  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2324         c >= 256 ||         c > 255 ||
2325  #endif  #endif
2326         (md->ctypes[c] & ctype_digit) == 0         (md->ctypes[c] & ctype_digit) == 0
2327         )         )
# Line 1292  for (;;) Line 2330  for (;;)
2330      break;      break;
2331    
2332      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
2333      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject)
2334          {
2335          SCHECK_PARTIAL();
2336          RRETURN(MATCH_NOMATCH);
2337          }
2338      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
2339      if (      if (
2340  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2341         c < 256 &&         c < 256 &&
2342  #endif  #endif
2343         (md->ctypes[c] & ctype_space) != 0         (md->ctypes[c] & ctype_space) != 0
# Line 1305  for (;;) Line 2347  for (;;)
2347      break;      break;
2348    
2349      case OP_WHITESPACE:      case OP_WHITESPACE:
2350      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject)
2351          {
2352          SCHECK_PARTIAL();
2353          RRETURN(MATCH_NOMATCH);
2354          }
2355      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
2356      if (      if (
2357  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2358         c >= 256 ||         c > 255 ||
2359  #endif  #endif
2360         (md->ctypes[c] & ctype_space) == 0         (md->ctypes[c] & ctype_space) == 0
2361         )         )
# Line 1318  for (;;) Line 2364  for (;;)
2364      break;      break;
2365    
2366      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2367      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject)
2368          {
2369          SCHECK_PARTIAL();
2370          RRETURN(MATCH_NOMATCH);
2371          }
2372      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
2373      if (      if (
2374  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2375         c < 256 &&         c < 256 &&
2376  #endif  #endif
2377         (md->ctypes[c] & ctype_word) != 0         (md->ctypes[c] & ctype_word) != 0
# Line 1331  for (;;) Line 2381  for (;;)
2381      break;      break;
2382    
2383      case OP_WORDCHAR:      case OP_WORDCHAR:
2384      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject)
2385          {
2386          SCHECK_PARTIAL();
2387          RRETURN(MATCH_NOMATCH);
2388          }
2389      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
2390      if (      if (
2391  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2392         c >= 256 ||         c > 255 ||
2393  #endif  #endif
2394         (md->ctypes[c] & ctype_word) == 0         (md->ctypes[c] & ctype_word) == 0
2395         )         )
# Line 1343  for (;;) Line 2397  for (;;)
2397      ecode++;      ecode++;
2398      break;      break;
2399    
2400        case OP_ANYNL:
2401        if (eptr >= md->end_subject)
2402          {
2403          SCHECK_PARTIAL();
2404          RRETURN(MATCH_NOMATCH);
2405          }
2406        GETCHARINCTEST(c, eptr);
2407        switch(c)
2408          {
2409          default: RRETURN(MATCH_NOMATCH);
2410    
2411          case 0x000d:
2412          if (eptr >= md->end_subject)
2413            {
2414            SCHECK_PARTIAL();
2415            }
2416          else if (*eptr == 0x0a) eptr++;
2417          break;
2418    
2419          case 0x000a:
2420          break;
2421    
2422          case 0x000b:
2423          case 0x000c:
2424          case 0x0085:
2425          case 0x2028:
2426          case 0x2029:
2427          if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2428          break;
2429          }
2430        ecode++;
2431        break;
2432    
2433        case OP_NOT_HSPACE:
2434        if (eptr >= md->end_subject)
2435          {
2436          SCHECK_PARTIAL();
2437          RRETURN(MATCH_NOMATCH);
2438          }
2439        GETCHARINCTEST(c, eptr);
2440        switch(c)
2441          {
2442          default: break;
2443          case 0x09:      /* HT */
2444          case 0x20:      /* SPACE */
2445          case 0xa0:      /* NBSP */
2446          case 0x1680:    /* OGHAM SPACE MARK */
2447          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2448          case 0x2000:    /* EN QUAD */
2449          case 0x2001:    /* EM QUAD */
2450          case 0x2002:    /* EN SPACE */
2451          case 0x2003:    /* EM SPACE */
2452          case 0x2004:    /* THREE-PER-EM SPACE */
2453          case 0x2005:    /* FOUR-PER-EM SPACE */
2454          case 0x2006:    /* SIX-PER-EM SPACE */
2455          case 0x2007:    /* FIGURE SPACE */
2456          case 0x2008:    /* PUNCTUATION SPACE */
2457          case 0x2009:    /* THIN SPACE */
2458          case 0x200A:    /* HAIR SPACE */
2459          case 0x202f:    /* NARROW NO-BREAK SPACE */
2460          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2461          case 0x3000:    /* IDEOGRAPHIC SPACE */
2462          RRETURN(MATCH_NOMATCH);
2463          }
2464        ecode++;
2465        break;
2466    
2467        case OP_HSPACE:
2468        if (eptr >= md->end_subject)
2469          {
2470          SCHECK_PARTIAL();
2471          RRETURN(MATCH_NOMATCH);
2472          }
2473        GETCHARINCTEST(c, eptr);
2474        switch(c)
2475          {
2476          default: RRETURN(MATCH_NOMATCH);
2477          case 0x09:      /* HT */
2478          case 0x20:      /* SPACE */
2479          case 0xa0:      /* NBSP */
2480          case 0x1680:    /* OGHAM SPACE MARK */
2481          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2482          case 0x2000:    /* EN QUAD */
2483          case 0x2001:    /* EM QUAD */
2484          case 0x2002:    /* EN SPACE */
2485          case 0x2003:    /* EM SPACE */
2486          case 0x2004:    /* THREE-PER-EM SPACE */
2487          case 0x2005:    /* FOUR-PER-EM SPACE */
2488          case 0x2006:    /* SIX-PER-EM SPACE */
2489          case 0x2007:    /* FIGURE SPACE */
2490          case 0x2008:    /* PUNCTUATION SPACE */
2491          case 0x2009:    /* THIN SPACE */
2492          case 0x200A:    /* HAIR SPACE */
2493          case 0x202f:    /* NARROW NO-BREAK SPACE */
2494          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2495          case 0x3000:    /* IDEOGRAPHIC SPACE */
2496          break;
2497          }
2498        ecode++;
2499        break;
2500    
2501        case OP_NOT_VSPACE:
2502        if (eptr >= md->end_subject)
2503          {
2504          SCHECK_PARTIAL();
2505          RRETURN(MATCH_NOMATCH);
2506          }
2507        GETCHARINCTEST(c, eptr);
2508        switch(c)
2509          {
2510          default: break;
2511          case 0x0a:      /* LF */
2512          case 0x0b:      /* VT */
2513          case 0x0c:      /* FF */
2514          case 0x0d:      /* CR */
2515          case 0x85:      /* NEL */
2516          case 0x2028:    /* LINE SEPARATOR */
2517          case 0x2029:    /* PARAGRAPH SEPARATOR */
2518          RRETURN(MATCH_NOMATCH);
2519          }
2520        ecode++;
2521        break;
2522    
2523        case OP_VSPACE:
2524        if (eptr >= md->end_subject)
2525          {
2526          SCHECK_PARTIAL();
2527          RRETURN(MATCH_NOMATCH);
2528          }
2529        GETCHARINCTEST(c, eptr);
2530        switch(c)
2531          {
2532          default: RRETURN(MATCH_NOMATCH);
2533          case 0x0a:      /* LF */
2534          case 0x0b:      /* VT */
2535          case 0x0c:      /* FF */
2536          case 0x0d:      /* CR */
2537          case 0x85:      /* NEL */
2538          case 0x2028:    /* LINE SEPARATOR */
2539          case 0x2029:    /* PARAGRAPH SEPARATOR */
2540          break;
2541          }
2542        ecode++;
2543        break;
2544    
2545  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2546      /* Check the next character by Unicode property. We will get here only      /* Check the next character by Unicode property. We will get here only
2547      if the support is in the binary; otherwise a compile-time error occurs. */      if the support is in the binary; otherwise a compile-time error occurs. */
2548    
2549      case OP_PROP:      case OP_PROP:
2550      case OP_NOTPROP:      case OP_NOTPROP:
2551      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject)
2552          {
2553          SCHECK_PARTIAL();
2554          RRETURN(MATCH_NOMATCH);
2555          }
2556      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
2557        {        {
2558        int chartype, rqdtype;        const ucd_record *prop = GET_UCD(c);
       int othercase;  
       int category = _pcre_ucp_findchar(c, &chartype, &othercase);  
   
       rqdtype = *(++ecode);  
       ecode++;  
2559    
2560        if (rqdtype >= 128)        switch(ecode[1])
2561          {          {
2562          if ((rqdtype - 128 != category) == (op == OP_PROP))          case PT_ANY:
2563            if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2564            break;
2565    
2566            case PT_LAMP:
2567            if ((prop->chartype == ucp_Lu ||
2568                 prop->chartype == ucp_Ll ||
2569                 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2570            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
2571          }          break;
2572        else  
2573          {          case PT_GC:
2574          if ((rqdtype != chartype) == (op == OP_PROP))          if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2575              RRETURN(MATCH_NOMATCH);
2576            break;
2577    
2578            case PT_PC:
2579            if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2580              RRETURN(MATCH_NOMATCH);
2581            break;
2582    
2583            case PT_SC:
2584            if ((ecode[2] != prop->script) == (op == OP_PROP))
2585              RRETURN(MATCH_NOMATCH);
2586            break;
2587    
2588            /* These are specials */
2589    
2590            case PT_ALNUM:
2591            if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2592                 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2593              RRETURN(MATCH_NOMATCH);
2594            break;
2595    
2596            case PT_SPACE:    /* Perl space */
2597            if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2598                 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2599                   == (op == OP_NOTPROP))
2600              RRETURN(MATCH_NOMATCH);
2601            break;
2602    
2603            case PT_PXSPACE:  /* POSIX space */
2604            if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2605                 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2606                 c == CHAR_FF || c == CHAR_CR)
2607                   == (op == OP_NOTPROP))
2608            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
2609            break;
2610    
2611            case PT_WORD:
2612            if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2613                 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2614                 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2615              RRETURN(MATCH_NOMATCH);
2616            break;
2617    
2618            /* This should never occur */
2619    
2620            default:
2621            RRETURN(PCRE_ERROR_INTERNAL);
2622          }          }
2623    
2624          ecode += 3;
2625        }        }
2626      break;      break;
2627    
# Line 1376  for (;;) Line 2629  for (;;)
2629      is in the binary; otherwise a compile-time error occurs. */      is in the binary; otherwise a compile-time error occurs. */
2630    
2631      case OP_EXTUNI:      case OP_EXTUNI:
2632      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject)
2633          {
2634          SCHECK_PARTIAL();
2635          RRETURN(MATCH_NOMATCH);
2636          }
2637      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
2638        if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2639        while (eptr < md->end_subject)
2640        {        {
2641        int chartype;        int len = 1;
2642        int othercase;        if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2643        int category = _pcre_ucp_findchar(c, &chartype, &othercase);        if (UCD_CATEGORY(c) != ucp_M) break;
2644        if (category == ucp_M) RRETURN(MATCH_NOMATCH);        eptr += len;
       while (eptr < md->end_subject)  
         {  
         int len = 1;  
         if (!utf8) c = *eptr; else  
           {  
           GETCHARLEN(c, eptr, len);  
           }  
         category = _pcre_ucp_findchar(c, &chartype, &othercase);  
         if (category != ucp_M) break;  
         eptr += len;  
         }  
2645        }        }
2646        CHECK_PARTIAL();
2647      ecode++;      ecode++;
2648      break;      break;
2649  #endif  #endif
# Line 1409  for (;;) Line 2658  for (;;)
2658      loops). */      loops). */
2659    
2660      case OP_REF:      case OP_REF:
2661        case OP_REFI:
2662        caseless = op == OP_REFI;
2663        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
2664        ecode += 1 + IMM2_SIZE;
2665    
2666        /* If the reference is unset, there are two possibilities:
2667    
2668        (a) In the default, Perl-compatible state, set the length negative;
2669        this ensures that every attempt at a match fails. We can't just fail
2670        here, because of the possibility of quantifiers with zero minima.
2671    
2672        (b) If the JavaScript compatibility flag is set, set the length to zero
2673        so that the back reference matches an empty string.
2674    
2675        Otherwise, set the length to the length of what was matched by the
2676        referenced subpattern. */
2677    
2678        if (offset >= offset_top || md->offset_vector[offset] < 0)
2679          length = (md->jscript_compat)? 0 : -1;
2680        else
2681          length = md->offset_vector[offset+1] - md->offset_vector[offset];
2682    
2683        /* Set up for repetition, or handle the non-repeated case */
2684    
2685        switch (*ecode)
2686        {        {
2687        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */        case OP_CRSTAR:
2688        ecode += 3;                                 /* Advance past item */        case OP_CRMINSTAR:
2689          case OP_CRPLUS:
2690          case OP_CRMINPLUS:
2691          case OP_CRQUERY:
2692          case OP_CRMINQUERY:
2693          c = *ecode++ - OP_CRSTAR;
2694          minimize = (c & 1) != 0;
2695          min = rep_min[c];                 /* Pick up values from tables; */
2696          max = rep_max[c];                 /* zero for max => infinity */
2697          if (max == 0) max = INT_MAX;
2698          break;
2699    
2700          case OP_CRRANGE:
2701          case OP_CRMINRANGE:
2702          minimize = (*ecode == OP_CRMINRANGE);
2703          min = GET2(ecode, 1);
2704          max = GET2(ecode, 1 + IMM2_SIZE);
2705          if (max == 0) max = INT_MAX;
2706          ecode += 1 + 2 * IMM2_SIZE;
2707          break;
2708    
2709        /* If the reference is unset, set the length to be longer than the amount        default:               /* No repeat follows */
2710        of subject left; this ensures that every attempt at a match fails. We        if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2711        can't just fail here, because of the possibility of quantifiers with zero          {
2712        minima. */          if (length == -2) eptr = md->end_subject;   /* Partial match */
2713            CHECK_PARTIAL();
2714        length = (offset >= offset_top || md->offset_vector[offset] < 0)?          RRETURN(MATCH_NOMATCH);
2715          md->end_subject - eptr + 1 :          }
2716          md->offset_vector[offset+1] - md->offset_vector[offset];        eptr += length;
2717          continue;              /* With the main loop */
2718          }
2719    
2720        /* Set up for repetition, or handle the non-repeated case */      /* Handle repeated back references. If the length of the reference is
2721        zero, just continue with the main loop. If the length is negative, it
2722        means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2723        zero, we can continue at the same level without recursion. For any other
2724        minimum, carrying on will result in NOMATCH. */
2725    
2726        switch (*ecode)      if (length == 0) continue;
2727        if (length < 0 && min == 0) continue;
2728    
2729        /* First, ensure the minimum number of matches are present. We get back
2730        the length of the reference string explicitly rather than passing the
2731        address of eptr, so that eptr can be a register variable. */
2732    
2733        for (i = 1; i <= min; i++)
2734          {
2735          int slength;
2736          if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2737          {          {
2738          case OP_CRSTAR:          if (slength == -2) eptr = md->end_subject;   /* Partial match */
2739          case OP_CRMINSTAR:          CHECK_PARTIAL();
2740          case OP_CRPLUS:          RRETURN(MATCH_NOMATCH);
         case OP_CRMINPLUS:  
         case OP_CRQUERY:  
         case OP_CRMINQUERY:  
         c = *ecode++ - OP_CRSTAR;  
         minimize = (c & 1) != 0;  
         min = rep_min[c];                 /* Pick up values from tables; */  
         max = rep_max[c];                 /* zero for max => infinity */  
         if (max == 0) max = INT_MAX;  
         break;  
   
         case OP_CRRANGE:  
         case OP_CRMINRANGE:  
         minimize = (*ecode == OP_CRMINRANGE);  
         min = GET2(ecode, 1);  
         max = GET2(ecode, 3);  
         if (max == 0) max = INT_MAX;  
         ecode += 5;  
         break;  
   
         default:               /* No repeat follows */  
         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);  
         eptr += length;  
         continue;              /* With the main loop */  
         }  
   
       /* If the length of the reference is zero, just continue with the  
       main loop. */  
   
       if (length == 0) continue;  
   
       /* First, ensure the minimum number of matches are present. We get back  
       the length of the reference string explicitly rather than passing the  
       address of eptr, so that eptr can be a register variable. */  
   
       for (i = 1; i <= min; i++)  
         {  
         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);  
         eptr += length;  
2741          }          }
2742          eptr += slength;
2743          }
2744    
2745        /* If min = max, continue at the same level without recursion.      /* If min = max, continue at the same level without recursion.
2746        They are not both allowed to be zero. */      They are not both allowed to be zero. */
2747    
2748        if (min == max) continue;      if (min == max) continue;
2749    
2750        /* If minimizing, keep trying and advancing the pointer */      /* If minimizing, keep trying and advancing the pointer */
2751    
2752        if (minimize)      if (minimize)
2753          {
2754          for (fi = min;; fi++)
2755          {          {
2756          for (fi = min;; fi++)          int slength;
2757            RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2758            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2759            if (fi >= max) RRETURN(MATCH_NOMATCH);
2760            if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2761            {            {
2762            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            if (slength == -2) eptr = md->end_subject;   /* Partial match */
2763            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            CHECK_PARTIAL();
2764            if (fi >= max || !match_ref(offset, eptr, length, md, ims))            RRETURN(MATCH_NOMATCH);
             RRETURN(MATCH_NOMATCH);  
           eptr += length;  
2765            }            }
2766          /* Control never gets here */          eptr += slength;
2767          }          }
2768          /* Control never gets here */
2769          }
2770    
2771        /* If maximizing, find the longest string and work backwards */      /* If maximizing, find the longest string and work backwards */
2772    
2773        else      else
2774          {
2775          pp = eptr;
2776          for (i = min; i < max; i++)
2777          {          {
2778          pp = eptr;          int slength;
2779          for (i = min; i < max; i++)          if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2780            {            {
2781            if (!match_ref(offset, eptr, length, md, ims)) break;            /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2782            eptr += length;            the soft partial matching case. */
2783            }  
2784          while (eptr >= pp)            if (slength == -2 && md->partial != 0 &&
2785            {                md->end_subject > md->start_used_ptr)
2786            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              {
2787            if (rrc != MATCH_NOMATCH) RRETURN(rrc);              md->hitend = TRUE;
2788            eptr -= length;              if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2789                }
2790              break;
2791            }            }
2792          RRETURN(MATCH_NOMATCH);          eptr += slength;
2793            }
2794    
2795          while (eptr >= pp)
2796            {
2797            RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2798            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2799            eptr -= length;
2800          }          }
2801          RRETURN(MATCH_NOMATCH);
2802        }        }
2803      /* Control never gets here */      /* Control never gets here */
2804    
   
   
2805      /* Match a bit-mapped character class, possibly repeatedly. This op code is      /* Match a bit-mapped character class, possibly repeatedly. This op code is
2806      used when all the characters in the class have values in the range 0-255,      used when all the characters in the class have values in the range 0-255,
2807      and either the matching is caseful, or the characters are in the range      and either the matching is caseful, or the characters are in the range
# Line 1526  for (;;) Line 2816  for (;;)
2816      case OP_NCLASS:      case OP_NCLASS:
2817      case OP_CLASS:      case OP_CLASS:
2818        {        {
2819          /* The data variable is saved across frames, so the byte map needs to
2820          be stored there. */
2821    #define BYTE_MAP ((pcre_uint8 *)data)
2822        data = ecode + 1;                /* Save for matching */        data = ecode + 1;                /* Save for matching */
2823        ecode += 33;                     /* Advance past the item */        ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2824    
2825        switch (*ecode)        switch (*ecode)
2826          {          {
# Line 1548  for (;;) Line 2841  for (;;)
2841          case OP_CRMINRANGE:          case OP_CRMINRANGE:
2842          minimize = (*ecode == OP_CRMINRANGE);          minimize = (*ecode == OP_CRMINRANGE);
2843          min = GET2(ecode, 1);          min = GET2(ecode, 1);
2844          max = GET2(ecode, 3);          max = GET2(ecode, 1 + IMM2_SIZE);
2845          if (max == 0) max = INT_MAX;          if (max == 0) max = INT_MAX;
2846          ecode += 5;          ecode += 1 + 2 * IMM2_SIZE;
2847          break;          break;
2848    
2849          default:               /* No repeat follows */          default:               /* No repeat follows */
# Line 1560  for (;;) Line 2853  for (;;)
2853    
2854        /* First, ensure the minimum number of matches are present. */        /* First, ensure the minimum number of matches are present. */
2855    
2856  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2857        /* UTF-8 mode */        if (utf)
       if (utf8)  
2858          {          {
2859          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2860            {            {
2861            if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (eptr >= md->end_subject)
2862                {
2863                SCHECK_PARTIAL();
2864                RRETURN(MATCH_NOMATCH);
2865                }
2866            GETCHARINC(c, eptr);            GETCHARINC(c, eptr);
2867            if (c > 255)            if (c > 255)
2868              {              {
2869              if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);              if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2870              }              }
2871            else            else
2872              {              if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
             if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);  
             }  
2873            }            }
2874          }          }
2875        else        else
2876  #endif  #endif
2877        /* Not UTF-8 mode */        /* Not UTF mode */
2878          {          {
2879          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2880            {            {
2881            if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (eptr >= md->end_subject)
2882                {
2883                SCHECK_PARTIAL();
2884                RRETURN(MATCH_NOMATCH);
2885                }
2886            c = *eptr++;            c = *eptr++;
2887            if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);  #ifndef COMPILE_PCRE8
2888              if (c > 255)
2889                {
2890                if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2891                }
2892              else
2893    #endif
2894                if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2895            }            }
2896          }          }
2897    
# Line 1600  for (;;) Line 2905  for (;;)
2905    
2906        if (minimize)        if (minimize)
2907          {          {
2908  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2909          /* UTF-8 mode */          if (utf)
         if (utf8)  
2910            {            {
2911            for (fi = min;; fi++)            for (fi = min;; fi++)
2912              {              {
2913              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2914              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2915              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max) RRETURN(MATCH_NOMATCH);
2916                if (eptr >= md->end_subject)
2917                  {
2918                  SCHECK_PARTIAL();
2919                  RRETURN(MATCH_NOMATCH);
2920                  }
2921              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
2922              if (c > 255)              if (c > 255)
2923                {                {
2924                if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);                if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2925                }                }
2926              else              else
2927                {                if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
               if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);  
               }  
2928              }              }
2929            }            }
2930          else          else
2931  #endif  #endif
2932          /* Not UTF-8 mode */          /* Not UTF mode */
2933            {            {
2934            for (fi = min;; fi++)            for (fi = min;; fi++)
2935              {              {
2936              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2937              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2938              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max) RRETURN(MATCH_NOMATCH);
2939                if (eptr >= md->end_subject)
2940                  {
2941                  SCHECK_PARTIAL();
2942                  RRETURN(MATCH_NOMATCH);
2943                  }
2944              c = *eptr++;              c = *eptr++;
2945              if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);  #ifndef COMPILE_PCRE8
2946                if (c > 255)
2947                  {
2948                  if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2949                  }
2950                else
2951    #endif
2952                  if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2953              }              }
2954            }            }
2955          /* Control never gets here */          /* Control never gets here */
# Line 1642  for (;;) Line 2961  for (;;)
2961          {          {
2962          pp = eptr;          pp = eptr;
2963    
2964  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2965          /* UTF-8 mode */          if (utf)
         if (utf8)  
2966            {            {
2967            for (i = min; i < max; i++)            for (i = min; i < max; i++)
2968              {              {
2969              int len = 1;              int len = 1;
2970              if (eptr >= md->end_subject) break;              if (eptr >= md->end_subject)
2971                  {
2972                  SCHECK_PARTIAL();
2973                  break;
2974                  }
2975              GETCHARLEN(c, eptr, len);              GETCHARLEN(c, eptr, len);
2976              if (c > 255)              if (c > 255)
2977                {                {
2978                if (op == OP_CLASS) break;                if (op == OP_CLASS) break;
2979                }                }
2980              else              else
2981                {                if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
               if ((data[c/8] & (1 << (c&7))) == 0) break;  
               }  
2982              eptr += len;              eptr += len;
2983              }              }
2984            for (;;)            for (;;)
2985              {              {
2986              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2987              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2988              if (eptr-- == pp) break;        /* Stop if tried at original pos */              if (eptr-- == pp) break;        /* Stop if tried at original pos */
2989              BACKCHAR(eptr);              BACKCHAR(eptr);
# Line 1671  for (;;) Line 2991  for (;;)
2991            }            }
2992          else          else
2993  #endif  #endif
2994            /* Not UTF-8 mode */            /* Not UTF mode */
2995            {            {
2996            for (i = min; i < max; i++)            for (i = min; i < max; i++)
2997              {              {
2998              if (eptr >= md->end_subject) break;              if (eptr >= md->end_subject)
2999                  {
3000                  SCHECK_PARTIAL();
3001                  break;
3002                  }
3003              c = *eptr;              c = *eptr;
3004              if ((data[c/8] & (1 << (c&7))) == 0) break;  #ifndef COMPILE_PCRE8
3005                if (c > 255)
3006                  {
3007                  if (op == OP_CLASS) break;
3008                  }
3009                else
3010    #endif
3011                  if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3012              eptr++;              eptr++;
3013              }              }
3014            while (eptr >= pp)            while (eptr >= pp)
3015              {              {
3016              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
             eptr--;  
3017              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3018                eptr--;
3019              }              }
3020            }            }
3021    
3022          RRETURN(MATCH_NOMATCH);          RRETURN(MATCH_NOMATCH);
3023          }          }
3024    #undef BYTE_MAP
3025        }        }
3026      /* Control never gets here */      /* Control never gets here */
3027    
3028    
3029      /* Match an extended character class. This opcode is encountered only      /* Match an extended character class. This opcode is encountered only
3030      in UTF-8 mode, because that's the only time it is compiled. */      when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3031        mode, because Unicode properties are supported in non-UTF-8 mode. */
3032    
3033  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3034      case OP_XCLASS:      case OP_XCLASS:
3035        {        {
3036        data = ecode + 1 + LINK_SIZE;                /* Save for matching */        data = ecode + 1 + LINK_SIZE;                /* Save for matching */
# Line 1722  for (;;) Line 3055  for (;;)
3055          case OP_CRMINRANGE:          case OP_CRMINRANGE:
3056          minimize = (*ecode == OP_CRMINRANGE);          minimize = (*ecode == OP_CRMINRANGE);
3057          min = GET2(ecode, 1);          min = GET2(ecode, 1);
3058          max = GET2(ecode, 3);          max = GET2(ecode, 1 + IMM2_SIZE);
3059          if (max == 0) max = INT_MAX;          if (max == 0) max = INT_MAX;
3060          ecode += 5;          ecode += 1 + 2 * IMM2_SIZE;
3061          break;          break;
3062    
3063          default:               /* No repeat follows */          default:               /* No repeat follows */
# Line 1736  for (;;) Line 3069  for (;;)
3069    
3070        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
3071          {          {
3072          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);          if (eptr >= md->end_subject)
3073          GETCHARINC(c, eptr);            {
3074          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);            SCHECK_PARTIAL();
3075              RRETURN(MATCH_NOMATCH);
3076              }
3077            GETCHARINCTEST(c, eptr);
3078            if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3079          }          }
3080    
3081        /* If max == min we can continue with the main loop without the        /* If max == min we can continue with the main loop without the
# Line 1753  for (;;) Line 3090  for (;;)
3090          {          {
3091          for (fi = min;; fi++)          for (fi = min;; fi++)
3092            {            {
3093            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3094            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3095            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max) RRETURN(MATCH_NOMATCH);
3096            GETCHARINC(c, eptr);            if (eptr >= md->end_subject)
3097            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);              {
3098                SCHECK_PARTIAL();
3099                RRETURN(MATCH_NOMATCH);
3100                }
3101              GETCHARINCTEST(c, eptr);
3102              if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3103            }            }
3104          /* Control never gets here */          /* Control never gets here */
3105          }          }
# Line 1770  for (;;) Line 3112  for (;;)
3112          for (i = min; i < max; i++)          for (i = min; i < max; i++)
3113            {            {
3114            int len = 1;            int len = 1;
3115            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject)
3116            GETCHARLEN(c, eptr, len);              {
3117            if (!_pcre_xclass(c, data)) break;              SCHECK_PARTIAL();
3118                break;
3119                }
3120    #ifdef SUPPORT_UTF
3121              GETCHARLENTEST(c, eptr, len);
3122    #else
3123              c = *eptr;
3124    #endif
3125              if (!PRIV(xclass)(c, data, utf)) break;
3126            eptr += len;            eptr += len;
3127            }            }
3128          for(;;)          for(;;)
3129            {            {
3130            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3131            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3132            if (eptr-- == pp) break;        /* Stop if tried at original pos */            if (eptr-- == pp) break;        /* Stop if tried at original pos */
3133            BACKCHAR(eptr)  #ifdef SUPPORT_UTF
3134              if (utf) BACKCHAR(eptr);
3135    #endif
3136            }            }
3137          RRETURN(MATCH_NOMATCH);          RRETURN(MATCH_NOMATCH);
3138          }          }
# Line 1792  for (;;) Line 3144  for (;;)
3144      /* Match a single character, casefully */      /* Match a single character, casefully */
3145    
3146      case OP_CHAR:      case OP_CHAR:
3147  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3148      if (utf8)      if (utf)
3149        {        {
3150        length = 1;        length = 1;
3151        ecode++;        ecode++;
3152        GETCHARLEN(fc, ecode, length);        GETCHARLEN(fc, ecode, length);
3153        if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);        if (length > md->end_subject - eptr)
3154            {
3155            CHECK_PARTIAL();             /* Not SCHECK_PARTIAL() */
3156            RRETURN(MATCH_NOMATCH);
3157            }
3158        while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);        while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3159        }        }
3160      else      else
3161  #endif  #endif
3162        /* Not UTF mode */
     /* Non-UTF-8 mode */  
3163        {        {
3164        if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);        if (md->end_subject - eptr < 1)
3165            {
3166            SCHECK_PARTIAL();            /* This one can use SCHECK_PARTIAL() */
3167            RRETURN(MATCH_NOMATCH);
3168            }
3169        if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);        if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3170        ecode += 2;        ecode += 2;
3171        }        }
3172      break;      break;
3173    
3174      /* Match a single character, caselessly */      /* Match a single character, caselessly. If we are at the end of the
3175        subject, give up immediately. */
3176    
3177        case OP_CHARI:
3178        if (eptr >= md->end_subject)
3179          {
3180          SCHECK_PARTIAL();
3181          RRETURN(MATCH_NOMATCH);
3182          }
3183    
3184      case OP_CHARNC:  #ifdef SUPPORT_UTF
3185  #ifdef SUPPORT_UTF8      if (utf)
     if (utf8)  
3186        {        {
3187        length = 1;        length = 1;
3188        ecode++;        ecode++;
3189        GETCHARLEN(fc, ecode, length);        GETCHARLEN(fc, ecode, length);
3190    
       if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);  
   
3191        /* If the pattern character's value is < 128, we have only one byte, and        /* If the pattern character's value is < 128, we have only one byte, and
3192        can use the fast lookup table. */        we know that its other case must also be one byte long, so we can use the
3193          fast lookup table. We know that there is at least one byte left in the
3194          subject. */
3195    
3196        if (fc < 128)        if (fc < 128)
3197          {          {
3198          if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);          if (md->lcc[fc]
3199                != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3200            ecode++;
3201            eptr++;
3202          }          }
3203    
3204        /* Otherwise we must pick up the subject character */        /* Otherwise we must pick up the subject character. Note that we cannot
3205          use the value of "length" to check for sufficient bytes left, because the
3206          other case of the character may have more or fewer bytes.  */
3207    
3208        else        else
3209          {          {
3210          int dc;          unsigned int dc;
3211          GETCHARINC(dc, eptr);          GETCHARINC(dc, eptr);
3212          ecode += length;          ecode += length;
3213    
3214          /* If we have Unicode property support, we can use it to test the other          /* If we have Unicode property support, we can use it to test the other
3215          case of the character, if there is one. The result of _pcre_ucp_findchar() is          case of the character, if there is one. */
         < 0 if the char isn't found, and othercase is returned as zero if there  
         isn't one. */  
3216    
3217          if (fc != dc)          if (fc != dc)
3218            {            {
3219  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3220            int chartype;            if (dc != UCD_OTHERCASE(fc))
           int othercase;  
           if (_pcre_ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)  
3221  #endif  #endif
3222              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3223            }            }
3224          }          }
3225        }        }
3226      else      else
3227  #endif   /* SUPPORT_UTF8 */  #endif   /* SUPPORT_UTF */
3228    
3229      /* Non-UTF-8 mode */      /* Not UTF mode */
3230        {        {
3231        if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);        if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3232        if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);            != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3233          eptr++;
3234        ecode += 2;        ecode += 2;
3235        }        }
3236      break;      break;
3237    
3238      /* Match a single character repeatedly; different opcodes share code. */      /* Match a single character repeatedly. */
3239    
3240      case OP_EXACT:      case OP_EXACT:
3241        case OP_EXACTI:
3242      min = max = GET2(ecode, 1);      min = max = GET2(ecode, 1);
3243      ecode += 3;      ecode += 1 + IMM2_SIZE;
3244      goto REPEATCHAR;      goto REPEATCHAR;
3245    
3246        case OP_POSUPTO:
3247        case OP_POSUPTOI:
3248        possessive = TRUE;
3249        /* Fall through */
3250    
3251      case OP_UPTO:      case OP_UPTO:
3252        case OP_UPTOI:
3253      case OP_MINUPTO:      case OP_MINUPTO:
3254        case OP_MINUPTOI:
3255      min = 0;      min = 0;
3256      max = GET2(ecode, 1);      max = GET2(ecode, 1);
3257      minimize = *ecode == OP_MINUPTO;      minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3258      ecode += 3;      ecode += 1 + IMM2_SIZE;
3259        goto REPEATCHAR;
3260    
3261        case OP_POSSTAR:
3262        case OP_POSSTARI:
3263        possessive = TRUE;
3264        min = 0;
3265        max = INT_MAX;
3266        ecode++;
3267        goto REPEATCHAR;
3268    
3269        case OP_POSPLUS:
3270        case OP_POSPLUSI:
3271        possessive = TRUE;
3272        min = 1;
3273        max = INT_MAX;
3274        ecode++;
3275        goto REPEATCHAR;
3276    
3277        case OP_POSQUERY:
3278        case OP_POSQUERYI:
3279        possessive = TRUE;
3280        min = 0;
3281        max = 1;
3282        ecode++;
3283      goto REPEATCHAR;      goto REPEATCHAR;
3284    
3285      case OP_STAR:      case OP_STAR:
3286        case OP_STARI:
3287      case OP_MINSTAR:      case OP_MINSTAR:
3288        case OP_MINSTARI:
3289      case OP_PLUS:      case OP_PLUS:
3290        case OP_PLUSI:
3291      case OP_MINPLUS:      case OP_MINPLUS:
3292        case OP_MINPLUSI:
3293      case OP_QUERY:      case OP_QUERY:
3294        case OP_QUERYI:
3295      case OP_MINQUERY:      case OP_MINQUERY:
3296      c = *ecode++ - OP_STAR;      case OP_MINQUERYI:
3297        c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3298      minimize = (c & 1) != 0;      minimize = (c & 1) != 0;
3299      min = rep_min[c];                 /* Pick up values from tables; */      min = rep_min[c];                 /* Pick up values from tables; */
3300      max = rep_max[c];                 /* zero for max => infinity */      max = rep_max[c];                 /* zero for max => infinity */
3301      if (max == 0) max = INT_MAX;      if (max == 0) max = INT_MAX;
3302    
3303      /* Common code for all repeated single-character matches. We can give      /* Common code for all repeated single-character matches. */
     up quickly if there are fewer than the minimum number of characters left in  
     the subject. */  
3304    
3305      REPEATCHAR:      REPEATCHAR:
3306  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3307      if (utf8)      if (utf)
3308        {        {
3309        length = 1;        length = 1;
3310        charptr = ecode;        charptr = ecode;
3311        GETCHARLEN(fc, ecode, length);        GETCHARLEN(fc, ecode, length);
       if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);  
3312        ecode += length;        ecode += length;
3313    
3314        /* Handle multibyte character matching specially here. There is        /* Handle multibyte character matching specially here. There is
# Line 1913  for (;;) Line 3316  for (;;)