/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 149 by ph10, Mon Apr 16 15:28:08 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  pattern matching using an NFA algorithm, trying to mimic Perl as closely as  pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43  possible. There are also some static supporting functions. */  possible. There are also some static supporting functions. */
44    
45    #define NLBLOCK md             /* Block containing newline information */
46    #define PSSTART start_subject  /* Field containing processed string start */
47    #define PSEND   end_subject    /* Field containing processed string end */
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    /* Undefine some potentially clashing cpp symbols */
52    
53  /* Structure for building a chain of data that actually lives on the  #undef min
54  stack, for holding the values of the subject pointer at the start of each  #undef max
55  subpattern, so as to detect when an empty string has been matched by a  
56  subpattern - to break infinite loops. When NO_RECURSE is set, these blocks  /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
57  are on the heap, not on the stack. */  obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
58    
59  typedef struct eptrblock {  #define EPTR_WORK_SIZE (1000)
   struct eptrblock *epb_prev;  
   USPTR epb_saved_eptr;  
 } eptrblock;  
60    
61  /* Flag bits for the match() function */  /* Flag bits for the match() function */
62    
63  #define match_condassert   0x01    /* Called to check a condition assertion */  #define match_condassert     0x01  /* Called to check a condition assertion */
64  #define match_isgroup      0x02    /* Set if start of bracketed group */  #define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */
65    #define match_tail_recursed  0x04  /* Tail recursive call */
66    
67  /* Non-error returns from the match() function. Error returns are externally  /* Non-error returns from the match() function. Error returns are externally
68  defined PCRE_ERROR_xxx codes, which are all negative. */  defined PCRE_ERROR_xxx codes, which are all negative. */
# Line 101  Returns:     nothing Line 103  Returns:     nothing
103  static void  static void
104  pchars(const uschar *p, int length, BOOL is_subject, match_data *md)  pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
105  {  {
106  int c;  unsigned int c;
107  if (is_subject && length > md->end_subject - p) length = md->end_subject - p;  if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
108  while (length-- > 0)  while (length-- > 0)
109    if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);    if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
# Line 275  typedef struct heapframe { Line 277  typedef struct heapframe {
277    long int Xims;    long int Xims;
278    eptrblock *Xeptrb;    eptrblock *Xeptrb;
279    int Xflags;    int Xflags;
280    int Xrdepth;    unsigned int Xrdepth;
281    
282    /* Function local variables */    /* Function local variables */
283    
# Line 291  typedef struct heapframe { Line 293  typedef struct heapframe {
293    
294    BOOL Xcur_is_word;    BOOL Xcur_is_word;
295    BOOL Xcondition;    BOOL Xcondition;
   BOOL Xminimize;  
296    BOOL Xprev_is_word;    BOOL Xprev_is_word;
297    
298    unsigned long int Xoriginal_ims;    unsigned long int Xoriginal_ims;
# Line 303  typedef struct heapframe { Line 304  typedef struct heapframe {
304    int Xprop_category;    int Xprop_category;
305    int Xprop_chartype;    int Xprop_chartype;
306    int Xprop_script;    int Xprop_script;
307    int *Xprop_test_variable;    int Xoclength;
308      uschar Xocchars[8];
309  #endif  #endif
310    
311    int Xctype;    int Xctype;
312    int Xfc;    unsigned int Xfc;
313    int Xfi;    int Xfi;
314    int Xlength;    int Xlength;
315    int Xmax;    int Xmax;
# Line 340  typedef struct heapframe { Line 342  typedef struct heapframe {
342  *         Match from current position            *  *         Match from current position            *
343  *************************************************/  *************************************************/
344    
345  /* On entry ecode points to the first opcode, and eptr to the first character  /* This function is called recursively in many circumstances. Whenever it
 in the subject string, while eptrb holds the value of eptr at the start of the  
 last bracketed group - used for breaking infinite loops matching zero-length  
 strings. This function is called recursively in many circumstances. Whenever it  
346  returns a negative (error) response, the outer incarnation must also return the  returns a negative (error) response, the outer incarnation must also return the
347  same response.  same response.
348    
# Line 353  performance. Tests using gcc on a SPARC Line 352  performance. Tests using gcc on a SPARC
352  made performance worse.  made performance worse.
353    
354  Arguments:  Arguments:
355     eptr        pointer in subject     eptr        pointer to current character in subject
356     ecode       position in code     ecode       pointer to current position in compiled code
357     offset_top  current top pointer     offset_top  current top pointer
358     md          pointer to "static" info for the match     md          pointer to "static" info for the match
359     ims         current /i, /m, and /s options     ims         current /i, /m, and /s options
# Line 362  Arguments: Line 361  Arguments:
361                   brackets - for testing for empty matches                   brackets - for testing for empty matches
362     flags       can contain     flags       can contain
363                   match_condassert - this is an assertion condition                   match_condassert - this is an assertion condition
364                   match_isgroup - this is the start of a bracketed group                   match_cbegroup - this is the start of an unlimited repeat
365                       group that can match an empty string
366                     match_tail_recursed - this is a tail_recursed group
367     rdepth      the recursion depth     rdepth      the recursion depth
368    
369  Returns:       MATCH_MATCH if matched            )  these values are >= 0  Returns:       MATCH_MATCH if matched            )  these values are >= 0
# Line 374  Returns:       MATCH_MATCH if matched Line 375  Returns:       MATCH_MATCH if matched
375  static int  static int
376  match(REGISTER USPTR eptr, REGISTER const uschar *ecode,  match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
377    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
378    int flags, int rdepth)    int flags, unsigned int rdepth)
379  {  {
380  /* These variables do not need to be preserved over recursion in this function,  /* These variables do not need to be preserved over recursion in this function,
381  so they can be ordinary variables in all cases. Mark them with "register"  so they can be ordinary variables in all cases. Mark some of them with
382  because they are used a lot in loops. */  "register" because they are used a lot in loops. */
383    
384    register int  rrc;         /* Returns from recursive calls */
385    register int  i;           /* Used for loops not involving calls to RMATCH() */
386    register unsigned int c;   /* Character values not kept over RMATCH() calls */
387    register BOOL utf8;        /* Local copy of UTF-8 flag for speed */
388    
389  register int  rrc;    /* Returns from recursive calls */  BOOL minimize, possessive; /* Quantifier options */
 register int  i;      /* Used for loops not involving calls to RMATCH() */  
 register int  c;      /* Character values not kept over RMATCH() calls */  
 register BOOL utf8;   /* Local copy of UTF-8 flag for speed */  
390    
391  /* When recursion is not being used, all "local" variables that have to be  /* When recursion is not being used, all "local" variables that have to be
392  preserved over calls to RMATCH() are part of a "frame" which is obtained from  preserved over calls to RMATCH() are part of a "frame" which is obtained from
# Line 434  HEAP_RECURSE: Line 437  HEAP_RECURSE:
437    
438  #define cur_is_word        frame->Xcur_is_word  #define cur_is_word        frame->Xcur_is_word
439  #define condition          frame->Xcondition  #define condition          frame->Xcondition
 #define minimize           frame->Xminimize  
440  #define prev_is_word       frame->Xprev_is_word  #define prev_is_word       frame->Xprev_is_word
441    
442  #define original_ims       frame->Xoriginal_ims  #define original_ims       frame->Xoriginal_ims
# Line 446  HEAP_RECURSE: Line 448  HEAP_RECURSE:
448  #define prop_category      frame->Xprop_category  #define prop_category      frame->Xprop_category
449  #define prop_chartype      frame->Xprop_chartype  #define prop_chartype      frame->Xprop_chartype
450  #define prop_script        frame->Xprop_script  #define prop_script        frame->Xprop_script
451  #define prop_test_variable frame->Xprop_test_variable  #define oclength           frame->Xoclength
452    #define occhars            frame->Xocchars
453  #endif  #endif
454    
455  #define ctype              frame->Xctype  #define ctype              frame->Xctype
# Line 470  HEAP_RECURSE: Line 473  HEAP_RECURSE:
473  get preserved during recursion in the normal way. In this environment, fi and  get preserved during recursion in the normal way. In this environment, fi and
474  i, and fc and c, can be the same variables. */  i, and fc and c, can be the same variables. */
475    
476  #else  #else         /* NO_RECURSE not defined */
477  #define fi i  #define fi i
478  #define fc c  #define fc c
479    
# Line 489  recursion_info new_recursive;      /* wi Line 492  recursion_info new_recursive;      /* wi
492                                     /* that do not have to be preserved over  */                                     /* that do not have to be preserved over  */
493  BOOL cur_is_word;                  /* a recursive call to RMATCH().          */  BOOL cur_is_word;                  /* a recursive call to RMATCH().          */
494  BOOL condition;  BOOL condition;
 BOOL minimize;  
495  BOOL prev_is_word;  BOOL prev_is_word;
496    
497  unsigned long int original_ims;  unsigned long int original_ims;
# Line 501  int prop_fail_result; Line 503  int prop_fail_result;
503  int prop_category;  int prop_category;
504  int prop_chartype;  int prop_chartype;
505  int prop_script;  int prop_script;
506  int *prop_test_variable;  int oclength;
507    uschar occhars[8];
508  #endif  #endif
509    
510  int ctype;  int ctype;
# Line 516  int save_offset1, save_offset2, save_off Line 519  int save_offset1, save_offset2, save_off
519  int stacksave[REC_STACK_SAVE_MAX];  int stacksave[REC_STACK_SAVE_MAX];
520    
521  eptrblock newptrb;  eptrblock newptrb;
522  #endif  #endif     /* NO_RECURSE */
523    
524  /* These statements are here to stop the compiler complaining about uninitialized  /* These statements are here to stop the compiler complaining about uninitialized
525  variables. */  variables. */
# Line 524  variables. */ Line 527  variables. */
527  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
528  prop_value = 0;  prop_value = 0;
529  prop_fail_result = 0;  prop_fail_result = 0;
 prop_test_variable = NULL;  
530  #endif  #endif
531    
532    
533    /* This label is used for tail recursion, which is used in a few cases even
534    when NO_RECURSE is not defined, in order to reduce the amount of stack that is
535    used. Thanks to Ian Taylor for noticing this possibility and sending the
536    original patch. */
537    
538    TAIL_RECURSE:
539    
540  /* OK, now we can get on with the real code of the function. Recursive calls  /* OK, now we can get on with the real code of the function. Recursive calls
541  are specified by the macro RMATCH and RRETURN is used to return. When  are specified by the macro RMATCH and RRETURN is used to return. When
542  NO_RECURSE is *not* defined, these just turn into a recursive call to match()  NO_RECURSE is *not* defined, these just turn into a recursive call to match()
# Line 542  if (md->match_call_count++ >= md->match_ Line 552  if (md->match_call_count++ >= md->match_
552  if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);  if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
553    
554  original_ims = ims;    /* Save for resetting on ')' */  original_ims = ims;    /* Save for resetting on ')' */
555    
556    #ifdef SUPPORT_UTF8
557  utf8 = md->utf8;       /* Local copy of the flag */  utf8 = md->utf8;       /* Local copy of the flag */
558    #else
559    utf8 = FALSE;
560    #endif
561    
562  /* At the start of a bracketed group, add the current subject pointer to the  /* At the start of a group with an unlimited repeat that may match an empty
563  stack of such pointers, to be re-instated at the end of the group when we hit  string, the match_cbegroup flag is set. When this is the case, add the current
564  the closing ket. When match() is called in other circumstances, we don't add to  subject pointer to the chain of such remembered pointers, to be checked when we
565  this stack. */  hit the closing ket, in order to break infinite loops that match no characters.
566    When match() is called in other circumstances, don't add to the chain. If this
567    is a tail recursion, use a block from the workspace, as the one on the stack is
568    already used. */
569    
570  if ((flags & match_isgroup) != 0)  if ((flags & match_cbegroup) != 0)
571    {    {
572    newptrb.epb_prev = eptrb;    eptrblock *p;
573    newptrb.epb_saved_eptr = eptr;    if ((flags & match_tail_recursed) != 0)
574    eptrb = &newptrb;      {
575        if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
576        p = md->eptrchain + md->eptrn++;
577        }
578      else p = &newptrb;
579      p->epb_saved_eptr = eptr;
580      p->epb_prev = eptrb;
581      eptrb = p;
582    }    }
583    
584  /* Now start processing the operations. */  /* Now start processing the opcodes. */
585    
586  for (;;)  for (;;)
587    {    {
588      minimize = possessive = FALSE;
589    op = *ecode;    op = *ecode;
   minimize = FALSE;  
590    
591    /* For partial matching, remember if we ever hit the end of the subject after    /* For partial matching, remember if we ever hit the end of the subject after
592    matching at least one subject character. */    matching at least one subject character. */
# Line 571  for (;;) Line 596  for (;;)
596        eptr > md->start_match)        eptr > md->start_match)
597      md->hitend = TRUE;      md->hitend = TRUE;
598    
599    /* Opening capturing bracket. If there is space in the offset vector, save    switch(op)
   the current subject position in the working slot at the top of the vector. We  
   mustn't change the current values of the data slot, because they may be set  
   from a previous iteration of this group, and be referred to by a reference  
   inside the group.  
   
   If the bracket fails to match, we need to restore this value and also the  
   values of the final offsets, in case they were set by a previous iteration of  
   the same bracket.  
   
   If there isn't enough space in the offset vector, treat this as if it were a  
   non-capturing bracket. Don't worry about setting the flag for the error case  
   here; that is handled in the code for KET. */  
   
   if (op > OP_BRA)  
600      {      {
601      number = op - OP_BRA;      /* Handle a capturing bracket. If there is space in the offset vector, save
602        the current subject position in the working slot at the top of the vector.
603      /* For extended extraction brackets (large number), we have to fish out the      We mustn't change the current values of the data slot, because they may be
604      number from a dummy opcode at the start. */      set from a previous iteration of this group, and be referred to by a
605        reference inside the group.
606      if (number > EXTRACT_BASIC_MAX)  
607        number = GET2(ecode, 2+LINK_SIZE);      If the bracket fails to match, we need to restore this value and also the
608        values of the final offsets, in case they were set by a previous iteration
609        of the same bracket.
610    
611        If there isn't enough space in the offset vector, treat this as if it were
612        a non-capturing bracket. Don't worry about setting the flag for the error
613        case here; that is handled in the code for KET. */
614    
615        case OP_CBRA:
616        case OP_SCBRA:
617        number = GET2(ecode, 1+LINK_SIZE);
618      offset = number << 1;      offset = number << 1;
619    
620  #ifdef DEBUG  #ifdef DEBUG
621      printf("start bracket %d subject=", number);      printf("start bracket %d\n", number);
622        printf("subject=");
623      pchars(eptr, 16, TRUE, md);      pchars(eptr, 16, TRUE, md);
624      printf("\n");      printf("\n");
625  #endif  #endif
# Line 612  for (;;) Line 634  for (;;)
634        DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));        DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
635        md->offset_vector[md->offset_end - number] = eptr - md->start_subject;        md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
636    
637          flags = (op == OP_SCBRA)? match_cbegroup : 0;
638        do        do
639          {          {
640          RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,          RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
641            match_isgroup);            ims, eptrb, flags);
642          if (rrc != MATCH_NOMATCH) RRETURN(rrc);          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
643          md->capture_last = save_capture_last;          md->capture_last = save_capture_last;
644          ecode += GET(ecode, 1);          ecode += GET(ecode, 1);
# Line 631  for (;;) Line 654  for (;;)
654        RRETURN(MATCH_NOMATCH);        RRETURN(MATCH_NOMATCH);
655        }        }
656    
657      /* Insufficient room for saving captured contents */      /* Insufficient room for saving captured contents. Treat as a non-capturing
658        bracket. */
659    
660      else op = OP_BRA;      DPRINTF(("insufficient capture room: treat as non-capturing\n"));
     }  
661    
662    /* Other types of node can be handled by a switch */      /* Non-capturing bracket. Loop for all the alternatives. When we get to the
663        final alternative within the brackets, we would return the result of a
664        recursive call to match() whatever happened. We can reduce stack usage by
665        turning this into a tail recursion. */
666    
667    switch(op)      case OP_BRA:
668      {      case OP_SBRA:
669      case OP_BRA:     /* Non-capturing bracket: optimized */      DPRINTF(("start non-capturing bracket\n"));
670      DPRINTF(("start bracket 0\n"));      flags = (op >= OP_SBRA)? match_cbegroup : 0;
671      do      for (;;)
672        {        {
673        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,        if (ecode[GET(ecode, 1)] != OP_ALT)
674          match_isgroup);          {
675            ecode += _pcre_OP_lengths[*ecode];
676            flags |= match_tail_recursed;
677            DPRINTF(("bracket 0 tail recursion\n"));
678            goto TAIL_RECURSE;
679            }
680    
681          /* For non-final alternatives, continue the loop for a NOMATCH result;
682          otherwise return. */
683    
684          RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
685            eptrb, flags);
686        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
687        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
688        }        }
689      while (*ecode == OP_ALT);      /* Control never reaches here. */
     DPRINTF(("bracket 0 failed\n"));  
     RRETURN(MATCH_NOMATCH);  
690    
691      /* Conditional group: compilation checked that there are no more than      /* Conditional group: compilation checked that there are no more than
692      two branches. If the condition is false, skipping the first branch takes us      two branches. If the condition is false, skipping the first branch takes us
693      past the end if there is only one branch, but that's OK because that is      past the end if there is only one branch, but that's OK because that is
694      exactly what going to the ket would do. */      exactly what going to the ket would do. As there is only one branch to be
695        obeyed, we can use tail recursion to avoid using another stack frame. */
696    
697      case OP_COND:      case OP_COND:
698      if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */      case OP_SCOND:
699        if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */
700          {
701          offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
702          condition = md->recursive != NULL &&
703            (offset == RREF_ANY || offset == md->recursive->group_num);
704          ecode += condition? 3 : GET(ecode, 1);
705          }
706    
707        else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */
708        {        {
709        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
710        condition = (offset == CREF_RECURSE * 2)?        condition = offset < offset_top && md->offset_vector[offset] >= 0;
711          (md->recursive != NULL) :        ecode += condition? 3 : GET(ecode, 1);
712          (offset < offset_top && md->offset_vector[offset] >= 0);        }
713        RMATCH(rrc, eptr, ecode + (condition?  
714          (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),      else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */
715          offset_top, md, ims, eptrb, match_isgroup);        {
716        RRETURN(rrc);        condition = FALSE;
717          ecode += GET(ecode, 1);
718        }        }
719    
720      /* The condition is an assertion. Call match() to evaluate it - setting      /* The condition is an assertion. Call match() to evaluate it - setting
721      the final argument TRUE causes it to stop at the end of an assertion. */      the final argument match_condassert causes it to stop at the end of an
722        assertion. */
723    
724      else      else
725        {        {
726        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
727            match_condassert | match_isgroup);            match_condassert);
728        if (rrc == MATCH_MATCH)        if (rrc == MATCH_MATCH)
729          {          {
730          ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);          condition = TRUE;
731            ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
732          while (*ecode == OP_ALT) ecode += GET(ecode, 1);          while (*ecode == OP_ALT) ecode += GET(ecode, 1);
733          }          }
734        else if (rrc != MATCH_NOMATCH)        else if (rrc != MATCH_NOMATCH)
735          {          {
736          RRETURN(rrc);         /* Need braces because of following else */          RRETURN(rrc);         /* Need braces because of following else */
737          }          }
738        else ecode += GET(ecode, 1);        else
739        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,          {
740          match_isgroup);          condition = FALSE;
741        RRETURN(rrc);          ecode += GET(ecode, 1);
742            }
743        }        }
     /* Control never reaches here */  
744    
745      /* Skip over conditional reference or large extraction number data if      /* We are now at the branch that is to be obeyed. As there is only one,
746      encountered. */      we can use tail recursion to avoid using another stack frame. If the second
747        alternative doesn't exist, we can just plough on. */
748    
749      case OP_CREF:      if (condition || *ecode == OP_ALT)
750      case OP_BRANUMBER:        {
751      ecode += 3;        ecode += 1 + LINK_SIZE;
752          flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
753          goto TAIL_RECURSE;
754          }
755        else
756          {
757          ecode += 1 + LINK_SIZE;
758          }
759      break;      break;
760    
761      /* End of the pattern. If we are in a recursion, we should restore the  
762      offsets appropriately and continue from after the call. */      /* End of the pattern. If we are in a top-level recursion, we should
763        restore the offsets appropriately and continue from after the call. */
764    
765      case OP_END:      case OP_END:
766      if (md->recursive != NULL && md->recursive->group_num == 0)      if (md->recursive != NULL && md->recursive->group_num == 0)
# Line 745  for (;;) Line 802  for (;;)
802      case OP_ASSERTBACK:      case OP_ASSERTBACK:
803      do      do
804        {        {
805        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
         match_isgroup);  
806        if (rrc == MATCH_MATCH) break;        if (rrc == MATCH_MATCH) break;
807        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
# Line 772  for (;;) Line 828  for (;;)
828      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
829      do      do
830        {        {
831        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
         match_isgroup);  
832        if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);        if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
833        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
834        ecode += GET(ecode,1);        ecode += GET(ecode,1);
# Line 794  for (;;) Line 849  for (;;)
849  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
850      if (utf8)      if (utf8)
851        {        {
852        c = GET(ecode,1);        i = GET(ecode, 1);
853        for (i = 0; i < c; i++)        while (i-- > 0)
854          {          {
855          eptr--;          eptr--;
856          if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);          if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
# Line 808  for (;;) Line 863  for (;;)
863      /* No UTF-8 support, or not in UTF-8 mode: count is byte count */      /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
864    
865        {        {
866        eptr -= GET(ecode,1);        eptr -= GET(ecode, 1);
867        if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);        if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
868        }        }
869    
# Line 865  for (;;) Line 920  for (;;)
920      case OP_RECURSE:      case OP_RECURSE:
921        {        {
922        callpat = md->start_code + GET(ecode, 1);        callpat = md->start_code + GET(ecode, 1);
923        new_recursive.group_num = *callpat - OP_BRA;        new_recursive.group_num = (callpat == md->start_code)? 0 :
924            GET2(callpat, 1 + LINK_SIZE);
       /* For extended extraction brackets (large number), we have to fish out  
       the number from a dummy opcode at the start. */  
   
       if (new_recursive.group_num > EXTRACT_BASIC_MAX)  
         new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);  
925    
926        /* Add to "recursing stack" */        /* Add to "recursing stack" */
927    
# Line 904  for (;;) Line 954  for (;;)
954        restore the offset and recursion data. */        restore the offset and recursion data. */
955    
956        DPRINTF(("Recursing into group %d\n", new_recursive.group_num));        DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
957          flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
958        do        do
959          {          {
960          RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,          RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
961              eptrb, match_isgroup);            md, ims, eptrb, flags);
962          if (rrc == MATCH_MATCH)          if (rrc == MATCH_MATCH)
963            {            {
964            DPRINTF(("Recursion matched\n"));            DPRINTF(("Recursion matched\n"));
# Line 945  for (;;) Line 996  for (;;)
996      the end of a normal bracket, leaving the subject pointer. */      the end of a normal bracket, leaving the subject pointer. */
997    
998      case OP_ONCE:      case OP_ONCE:
999        {      prev = ecode;
1000        prev = ecode;      saved_eptr = eptr;
       saved_eptr = eptr;  
1001    
1002        do      do
1003          {        {
1004          RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1005            eptrb, match_isgroup);          eptrb, 0);
1006          if (rrc == MATCH_MATCH) break;        if (rrc == MATCH_MATCH) break;
1007          if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1008          ecode += GET(ecode,1);        ecode += GET(ecode,1);
1009          }        }
1010        while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
1011    
1012        /* If we hit the end of the group (which could be repeated), fail */      /* If we hit the end of the group (which could be repeated), fail */
1013    
1014        if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);      if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1015    
1016        /* Continue as from after the assertion, updating the offsets high water      /* Continue as from after the assertion, updating the offsets high water
1017        mark, since extracts may have been taken. */      mark, since extracts may have been taken. */
1018    
1019        do ecode += GET(ecode,1); while (*ecode == OP_ALT);      do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1020    
1021        offset_top = md->end_offset_top;      offset_top = md->end_offset_top;
1022        eptr = md->end_match_ptr;      eptr = md->end_match_ptr;
1023    
1024        /* For a non-repeating ket, just continue at this level. This also      /* For a non-repeating ket, just continue at this level. This also
1025        happens for a repeating ket if no characters were matched in the group.      happens for a repeating ket if no characters were matched in the group.
1026        This is the forcible breaking of infinite loops as implemented in Perl      This is the forcible breaking of infinite loops as implemented in Perl
1027        5.005. If there is an options reset, it will get obeyed in the normal      5.005. If there is an options reset, it will get obeyed in the normal
1028        course of events. */      course of events. */
1029    
1030        if (*ecode == OP_KET || eptr == saved_eptr)      if (*ecode == OP_KET || eptr == saved_eptr)
1031          {        {
1032          ecode += 1+LINK_SIZE;        ecode += 1+LINK_SIZE;
1033          break;        break;
1034          }        }
1035    
1036        /* The repeating kets try the rest of the pattern or restart from the      /* The repeating kets try the rest of the pattern or restart from the
1037        preceding bracket, in the appropriate order. We need to reset any options      preceding bracket, in the appropriate order. The second "call" of match()
1038        that changed within the bracket before re-running it, so check the next      uses tail recursion, to avoid using another stack frame. We need to reset
1039        opcode. */      any options that changed within the bracket before re-running it, so
1040        check the next opcode. */
1041    
1042        if (ecode[1+LINK_SIZE] == OP_OPT)      if (ecode[1+LINK_SIZE] == OP_OPT)
1043          {        {
1044          ims = (ims & ~PCRE_IMS) | ecode[4];        ims = (ims & ~PCRE_IMS) | ecode[4];
1045          DPRINTF(("ims set to %02lx at group repeat\n", ims));        DPRINTF(("ims set to %02lx at group repeat\n", ims));
1046          }        }
1047    
1048        if (*ecode == OP_KETRMIN)      if (*ecode == OP_KETRMIN)
1049          {        {
1050          RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1051          if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1052          RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);        ecode = prev;
1053          if (rrc != MATCH_NOMATCH) RRETURN(rrc);        flags = match_tail_recursed;
1054          }        goto TAIL_RECURSE;
       else  /* OP_KETRMAX */  
         {  
         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);  
         if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);  
         if (rrc != MATCH_NOMATCH) RRETURN(rrc);  
         }  
1055        }        }
1056      RRETURN(MATCH_NOMATCH);      else  /* OP_KETRMAX */
1057          {
1058          RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
1059          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1060          ecode += 1 + LINK_SIZE;
1061          flags = match_tail_recursed;
1062          goto TAIL_RECURSE;
1063          }
1064        /* Control never gets here */
1065    
1066      /* An alternation is the end of a branch; scan along to find the end of the      /* An alternation is the end of a branch; scan along to find the end of the
1067      bracketed group and go to there. */      bracketed group and go to there. */
# Line 1027  for (;;) Line 1079  for (;;)
1079      case OP_BRAZERO:      case OP_BRAZERO:
1080        {        {
1081        next = ecode+1;        next = ecode+1;
1082        RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);        RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
1083        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1084        do next += GET(next,1); while (*next == OP_ALT);        do next += GET(next,1); while (*next == OP_ALT);
1085        ecode = next + 1+LINK_SIZE;        ecode = next + 1 + LINK_SIZE;
1086        }        }
1087      break;      break;
1088    
1089      case OP_BRAMINZERO:      case OP_BRAMINZERO:
1090        {        {
1091        next = ecode+1;        next = ecode+1;
1092        do next += GET(next,1); while (*next == OP_ALT);        do next += GET(next, 1); while (*next == OP_ALT);
1093        RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,        RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
         match_isgroup);  
1094        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1095        ecode++;        ecode++;
1096        }        }
1097      break;      break;
1098    
1099      /* End of a group, repeated or non-repeating. If we are at the end of      /* End of a group, repeated or non-repeating. */
     an assertion "group", stop matching and return MATCH_MATCH, but record the  
     current high water mark for use by positive assertions. Do this also  
     for the "once" (not-backup up) groups. */  
1100    
1101      case OP_KET:      case OP_KET:
1102      case OP_KETRMIN:      case OP_KETRMIN:
1103      case OP_KETRMAX:      case OP_KETRMAX:
1104        {      prev = ecode - GET(ecode, 1);
       prev = ecode - GET(ecode, 1);  
       saved_eptr = eptrb->epb_saved_eptr;  
   
       /* Back up the stack of bracket start pointers. */  
1105    
1106        eptrb = eptrb->epb_prev;      /* If this was a group that remembered the subject start, in order to break
1107        infinite repeats of empty string matches, retrieve the subject start from
1108        the chain. Otherwise, set it NULL. */
1109    
1110        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||      if (*prev >= OP_SBRA)
1111            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||        {
1112            *prev == OP_ONCE)        saved_eptr = eptrb->epb_saved_eptr;   /* Value at start of group */
1113          {        eptrb = eptrb->epb_prev;              /* Backup to previous group */
1114          md->end_match_ptr = eptr;      /* For ONCE */        }
1115          md->end_offset_top = offset_top;      else saved_eptr = NULL;
         RRETURN(MATCH_MATCH);  
         }  
1116    
1117        /* In all other cases except a conditional group we have to check the      /* If we are at the end of an assertion group, stop matching and return
1118        group number back at the start and if necessary complete handling an      MATCH_MATCH, but record the current high water mark for use by positive
1119        extraction by setting the offsets and bumping the high water mark. */      assertions. Do this also for the "once" (atomic) groups. */
1120    
1121        if (*prev != OP_COND)      if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1122          {          *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1123          number = *prev - OP_BRA;          *prev == OP_ONCE)
1124          {
1125          md->end_match_ptr = eptr;      /* For ONCE */
1126          md->end_offset_top = offset_top;
1127          RRETURN(MATCH_MATCH);
1128          }
1129    
1130          /* For extended extraction brackets (large number), we have to fish out      /* For capturing groups we have to check the group number back at the start
1131          the number from a dummy opcode at the start. */      and if necessary complete handling an extraction by setting the offsets and
1132        bumping the high water mark. Note that whole-pattern recursion is coded as
1133        a recurse into group 0, so it won't be picked up here. Instead, we catch it
1134        when the OP_END is reached. Other recursion is handled here. */
1135    
1136          if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);      if (*prev == OP_CBRA || *prev == OP_SCBRA)
1137          offset = number << 1;        {
1138          number = GET2(prev, 1+LINK_SIZE);
1139          offset = number << 1;
1140    
1141  #ifdef DEBUG  #ifdef DEBUG
1142          printf("end bracket %d", number);        printf("end bracket %d", number);
1143          printf("\n");        printf("\n");
1144  #endif  #endif
1145    
1146          /* Test for a numbered group. This includes groups called as a result        md->capture_last = number;
1147          of recursion. Note that whole-pattern recursion is coded as a recurse        if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1148          into group 0, so it won't be picked up here. Instead, we catch it when          {
1149          the OP_END is reached. */          md->offset_vector[offset] =
1150              md->offset_vector[md->offset_end - number];
1151          if (number > 0)          md->offset_vector[offset+1] = eptr - md->start_subject;
1152            {          if (offset_top <= offset) offset_top = offset + 2;
           md->capture_last = number;  
           if (offset >= md->offset_max) md->offset_overflow = TRUE; else  
             {  
             md->offset_vector[offset] =  
               md->offset_vector[md->offset_end - number];  
             md->offset_vector[offset+1] = eptr - md->start_subject;  
             if (offset_top <= offset) offset_top = offset + 2;  
             }  
   
           /* Handle a recursively called group. Restore the offsets  
           appropriately and continue from after the call. */  
   
           if (md->recursive != NULL && md->recursive->group_num == number)  
             {  
             recursion_info *rec = md->recursive;  
             DPRINTF(("Recursion (%d) succeeded - continuing\n", number));  
             md->recursive = rec->prevrec;  
             md->start_match = rec->save_start;  
             memcpy(md->offset_vector, rec->offset_save,  
               rec->saved_max * sizeof(int));  
             ecode = rec->after_call;  
             ims = original_ims;  
             break;  
             }  
           }  
1153          }          }
1154    
1155        /* Reset the value of the ims flags, in case they got changed during        /* Handle a recursively called group. Restore the offsets
1156        the group. */        appropriately and continue from after the call. */
   
       ims = original_ims;  
       DPRINTF(("ims reset to %02lx\n", ims));  
   
       /* For a non-repeating ket, just continue at this level. This also  
       happens for a repeating ket if no characters were matched in the group.  
       This is the forcible breaking of infinite loops as implemented in Perl  
       5.005. If there is an options reset, it will get obeyed in the normal  
       course of events. */  
1157    
1158        if (*ecode == OP_KET || eptr == saved_eptr)        if (md->recursive != NULL && md->recursive->group_num == number)
1159          {          {
1160          ecode += 1 + LINK_SIZE;          recursion_info *rec = md->recursive;
1161            DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1162            md->recursive = rec->prevrec;
1163            md->start_match = rec->save_start;
1164            memcpy(md->offset_vector, rec->offset_save,
1165              rec->saved_max * sizeof(int));
1166            ecode = rec->after_call;
1167            ims = original_ims;
1168          break;          break;
1169          }          }
1170          }
1171    
1172        /* The repeating kets try the rest of the pattern or restart from the      /* For both capturing and non-capturing groups, reset the value of the ims
1173        preceding bracket, in the appropriate order. */      flags, in case they got changed during the group. */
1174    
1175        if (*ecode == OP_KETRMIN)      ims = original_ims;
1176          {      DPRINTF(("ims reset to %02lx\n", ims));
1177          RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);  
1178          if (rrc != MATCH_NOMATCH) RRETURN(rrc);      /* For a non-repeating ket, just continue at this level. This also
1179          RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);      happens for a repeating ket if no characters were matched in the group.
1180          if (rrc != MATCH_NOMATCH) RRETURN(rrc);      This is the forcible breaking of infinite loops as implemented in Perl
1181          }      5.005. If there is an options reset, it will get obeyed in the normal
1182        else  /* OP_KETRMAX */      course of events. */
1183          {  
1184          RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);      if (*ecode == OP_KET || eptr == saved_eptr)
1185          if (rrc != MATCH_NOMATCH) RRETURN(rrc);        {
1186          RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);        ecode += 1 + LINK_SIZE;
1187          if (rrc != MATCH_NOMATCH) RRETURN(rrc);        break;
         }  
1188        }        }
1189    
1190      RRETURN(MATCH_NOMATCH);      /* The repeating kets try the rest of the pattern or restart from the
1191        preceding bracket, in the appropriate order. In the second case, we can use
1192        tail recursion to avoid using another stack frame. */
1193    
1194        flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1195    
1196        if (*ecode == OP_KETRMIN)
1197          {
1198          RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1199          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1200          ecode = prev;
1201          flags |= match_tail_recursed;
1202          goto TAIL_RECURSE;
1203          }
1204        else  /* OP_KETRMAX */
1205          {
1206          RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
1207          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1208          ecode += 1 + LINK_SIZE;
1209          flags = match_tail_recursed;
1210          goto TAIL_RECURSE;
1211          }
1212        /* Control never gets here */
1213    
1214      /* Start of subject unless notbol, or after internal newline if multiline */      /* Start of subject unless notbol, or after internal newline if multiline */
1215    
# Line 1168  for (;;) Line 1217  for (;;)
1217      if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);      if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1218      if ((ims & PCRE_MULTILINE) != 0)      if ((ims & PCRE_MULTILINE) != 0)
1219        {        {
1220        if (eptr != md->start_subject && eptr[-1] != NEWLINE)        if (eptr != md->start_subject &&
1221              (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1222          RRETURN(MATCH_NOMATCH);          RRETURN(MATCH_NOMATCH);
1223        ecode++;        ecode++;
1224        break;        break;
# Line 1196  for (;;) Line 1246  for (;;)
1246      if ((ims & PCRE_MULTILINE) != 0)      if ((ims & PCRE_MULTILINE) != 0)
1247        {        {
1248        if (eptr < md->end_subject)        if (eptr < md->end_subject)
1249          { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }          { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1250        else        else
1251          { if (md->noteol) RRETURN(MATCH_NOMATCH); }          { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1252        ecode++;        ecode++;
# Line 1207  for (;;) Line 1257  for (;;)
1257        if (md->noteol) RRETURN(MATCH_NOMATCH);        if (md->noteol) RRETURN(MATCH_NOMATCH);
1258        if (!md->endonly)        if (!md->endonly)
1259          {          {
1260          if (eptr < md->end_subject - 1 ||          if (eptr != md->end_subject &&
1261             (eptr == md->end_subject - 1 && *eptr != NEWLINE))              (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1262            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1263          ecode++;          ecode++;
1264          break;          break;
1265          }          }
1266        }        }
1267      /* ... else fall through */      /* ... else fall through for endonly */
1268    
1269      /* End of subject assertion (\z) */      /* End of subject assertion (\z) */
1270    
# Line 1226  for (;;) Line 1276  for (;;)
1276      /* End of subject or ending \n assertion (\Z) */      /* End of subject or ending \n assertion (\Z) */
1277    
1278      case OP_EODN:      case OP_EODN:
1279      if (eptr < md->end_subject - 1 ||      if (eptr != md->end_subject &&
1280         (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);          (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1281          RRETURN(MATCH_NOMATCH);
1282      ecode++;      ecode++;
1283      break;      break;
1284    
# Line 1280  for (;;) Line 1331  for (;;)
1331      /* Match a single character type; inline for speed */      /* Match a single character type; inline for speed */
1332    
1333      case OP_ANY:      case OP_ANY:
1334      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)      if ((ims & PCRE_DOTALL) == 0)
1335        RRETURN(MATCH_NOMATCH);        {
1336          if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1337          }
1338      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
 #ifdef SUPPORT_UTF8  
1339      if (utf8)      if (utf8)
1340        while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;        while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
 #endif  
1341      ecode++;      ecode++;
1342      break;      break;
1343    
# Line 1376  for (;;) Line 1427  for (;;)
1427      ecode++;      ecode++;
1428      break;      break;
1429    
1430        case OP_ANYNL:
1431        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1432        GETCHARINCTEST(c, eptr);
1433        switch(c)
1434          {
1435          default: RRETURN(MATCH_NOMATCH);
1436          case 0x000d:
1437          if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1438          break;
1439          case 0x000a:
1440          case 0x000b:
1441          case 0x000c:
1442          case 0x0085:
1443          case 0x2028:
1444          case 0x2029:
1445          break;
1446          }
1447        ecode++;
1448        break;
1449    
1450  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1451      /* Check the next character by Unicode property. We will get here only      /* Check the next character by Unicode property. We will get here only
1452      if the support is in the binary; otherwise a compile-time error occurs. */      if the support is in the binary; otherwise a compile-time error occurs. */
# Line 1418  for (;;) Line 1489  for (;;)
1489    
1490          default:          default:
1491          RRETURN(PCRE_ERROR_INTERNAL);          RRETURN(PCRE_ERROR_INTERNAL);
         break;  
1492          }          }
1493    
1494        ecode += 3;        ecode += 3;
# Line 1888  for (;;) Line 1958  for (;;)
1958    
1959        else        else
1960          {          {
1961          int dc;          unsigned int dc;
1962          GETCHARINC(dc, eptr);          GETCHARINC(dc, eptr);
1963          ecode += length;          ecode += length;
1964    
# Line 1915  for (;;) Line 1985  for (;;)
1985        }        }
1986      break;      break;
1987    
1988      /* Match a single character repeatedly; different opcodes share code. */      /* Match a single character repeatedly. */
1989    
1990      case OP_EXACT:      case OP_EXACT:
1991      min = max = GET2(ecode, 1);      min = max = GET2(ecode, 1);
1992      ecode += 3;      ecode += 3;
1993      goto REPEATCHAR;      goto REPEATCHAR;
1994    
1995        case OP_POSUPTO:
1996        possessive = TRUE;
1997        /* Fall through */
1998    
1999      case OP_UPTO:      case OP_UPTO:
2000      case OP_MINUPTO:      case OP_MINUPTO:
2001      min = 0;      min = 0;
# Line 1930  for (;;) Line 2004  for (;;)
2004      ecode += 3;      ecode += 3;
2005      goto REPEATCHAR;      goto REPEATCHAR;
2006    
2007        case OP_POSSTAR:
2008        possessive = TRUE;
2009        min = 0;
2010        max = INT_MAX;
2011        ecode++;
2012        goto REPEATCHAR;
2013    
2014        case OP_POSPLUS:
2015        possessive = TRUE;
2016        min = 1;
2017        max = INT_MAX;
2018        ecode++;
2019        goto REPEATCHAR;
2020    
2021        case OP_POSQUERY:
2022        possessive = TRUE;
2023        min = 0;
2024        max = 1;
2025        ecode++;
2026        goto REPEATCHAR;
2027    
2028      case OP_STAR:      case OP_STAR:
2029      case OP_MINSTAR:      case OP_MINSTAR:
2030      case OP_PLUS:      case OP_PLUS:
# Line 1961  for (;;) Line 2056  for (;;)
2056    
2057        if (length > 1)        if (length > 1)
2058          {          {
         int oclength = 0;  
         uschar occhars[8];  
   
2059  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2060          int othercase;          unsigned int othercase;
2061          if ((ims & PCRE_CASELESS) != 0 &&          if ((ims & PCRE_CASELESS) != 0 &&
2062              (othercase = _pcre_ucp_othercase(fc)) >= 0 &&              (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
              othercase >= 0)  
2063            oclength = _pcre_ord2utf8(othercase, occhars);            oclength = _pcre_ord2utf8(othercase, occhars);
2064            else oclength = 0;
2065  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2066    
2067          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2068            {            {
2069            if (memcmp(eptr, charptr, length) == 0) eptr += length;            if (memcmp(eptr, charptr, length) == 0) eptr += length;
2070    #ifdef SUPPORT_UCP
2071            /* Need braces because of following else */            /* Need braces because of following else */
2072            else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }            else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2073            else            else
# Line 1982  for (;;) Line 2075  for (;;)
2075              if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);              if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2076              eptr += oclength;              eptr += oclength;
2077              }              }
2078    #else   /* without SUPPORT_UCP */
2079              else { RRETURN(MATCH_NOMATCH); }
2080    #endif  /* SUPPORT_UCP */
2081            }            }
2082    
2083          if (min == max) continue;          if (min == max) continue;
# Line 1994  for (;;) Line 2090  for (;;)
2090              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2091              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2092              if (memcmp(eptr, charptr, length) == 0) eptr += length;              if (memcmp(eptr, charptr, length) == 0) eptr += length;
2093    #ifdef SUPPORT_UCP
2094              /* Need braces because of following else */              /* Need braces because of following else */
2095              else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }              else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2096              else              else
# Line 2001  for (;;) Line 2098  for (;;)
2098                if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);                if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2099                eptr += oclength;                eptr += oclength;
2100                }                }
2101    #else   /* without SUPPORT_UCP */
2102                else { RRETURN (MATCH_NOMATCH); }
2103    #endif  /* SUPPORT_UCP */
2104              }              }
2105            /* Control never gets here */            /* Control never gets here */
2106            }            }
2107          else  
2108            else  /* Maximize */
2109            {            {
2110            pp = eptr;            pp = eptr;
2111            for (i = min; i < max; i++)            for (i = min; i < max; i++)
2112              {              {
2113              if (eptr > md->end_subject - length) break;              if (eptr > md->end_subject - length) break;
2114              if (memcmp(eptr, charptr, length) == 0) eptr += length;              if (memcmp(eptr, charptr, length) == 0) eptr += length;
2115    #ifdef SUPPORT_UCP
2116              else if (oclength == 0) break;              else if (oclength == 0) break;
2117              else              else
2118                {                {
2119                if (memcmp(eptr, occhars, oclength) != 0) break;                if (memcmp(eptr, occhars, oclength) != 0) break;
2120                eptr += oclength;                eptr += oclength;
2121                }                }
2122    #else   /* without SUPPORT_UCP */
2123                else break;
2124    #endif  /* SUPPORT_UCP */
2125              }              }
2126            while (eptr >= pp)  
2127              if (possessive) continue;
2128              for(;;)
2129             {             {
2130             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2131             if (rrc != MATCH_NOMATCH) RRETURN(rrc);             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2132               if (eptr == pp) RRETURN(MATCH_NOMATCH);
2133    #ifdef SUPPORT_UCP
2134               eptr--;
2135               BACKCHAR(eptr);
2136    #else   /* without SUPPORT_UCP */
2137             eptr -= length;             eptr -= length;
2138    #endif  /* SUPPORT_UCP */
2139             }             }
           RRETURN(MATCH_NOMATCH);  
2140            }            }
2141          /* Control never gets here */          /* Control never gets here */
2142          }          }
# Line 2072  for (;;) Line 2184  for (;;)
2184            }            }
2185          /* Control never gets here */          /* Control never gets here */
2186          }          }
2187        else        else  /* Maximize */
2188          {          {
2189          pp = eptr;          pp = eptr;
2190          for (i = min; i < max; i++)          for (i = min; i < max; i++)
# Line 2080  for (;;) Line 2192  for (;;)
2192            if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;            if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2193            eptr++;            eptr++;
2194            }            }
2195            if (possessive) continue;
2196          while (eptr >= pp)          while (eptr >= pp)
2197            {            {
2198            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2108  for (;;) Line 2221  for (;;)
2221            }            }
2222          /* Control never gets here */          /* Control never gets here */
2223          }          }
2224        else        else  /* Maximize */
2225          {          {
2226          pp = eptr;          pp = eptr;
2227          for (i = min; i < max; i++)          for (i = min; i < max; i++)
# Line 2116  for (;;) Line 2229  for (;;)
2229            if (eptr >= md->end_subject || fc != *eptr) break;            if (eptr >= md->end_subject || fc != *eptr) break;
2230            eptr++;            eptr++;
2231            }            }
2232            if (possessive) continue;
2233          while (eptr >= pp)          while (eptr >= pp)
2234            {            {
2235            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2168  for (;;) Line 2282  for (;;)
2282      ecode += 3;      ecode += 3;
2283      goto REPEATNOTCHAR;      goto REPEATNOTCHAR;
2284    
2285        case OP_NOTPOSSTAR:
2286        possessive = TRUE;
2287        min = 0;
2288        max = INT_MAX;
2289        ecode++;
2290        goto REPEATNOTCHAR;
2291    
2292        case OP_NOTPOSPLUS:
2293        possessive = TRUE;
2294        min = 1;
2295        max = INT_MAX;
2296        ecode++;
2297        goto REPEATNOTCHAR;
2298    
2299        case OP_NOTPOSQUERY:
2300        possessive = TRUE;
2301        min = 0;
2302        max = 1;
2303        ecode++;
2304        goto REPEATNOTCHAR;
2305    
2306        case OP_NOTPOSUPTO:
2307        possessive = TRUE;
2308        min = 0;
2309        max = GET2(ecode, 1);
2310        ecode += 3;
2311        goto REPEATNOTCHAR;
2312    
2313      case OP_NOTSTAR:      case OP_NOTSTAR:
2314      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
2315      case OP_NOTPLUS:      case OP_NOTPLUS:
# Line 2207  for (;;) Line 2349  for (;;)
2349        /* UTF-8 mode */        /* UTF-8 mode */
2350        if (utf8)        if (utf8)
2351          {          {
2352          register int d;          register unsigned int d;
2353          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2354            {            {
2355            GETCHARINC(d, eptr);            GETCHARINC(d, eptr);
# Line 2232  for (;;) Line 2374  for (;;)
2374          /* UTF-8 mode */          /* UTF-8 mode */
2375          if (utf8)          if (utf8)
2376            {            {
2377            register int d;            register unsigned int d;
2378            for (fi = min;; fi++)            for (fi = min;; fi++)
2379              {              {
2380              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2268  for (;;) Line 2410  for (;;)
2410          /* UTF-8 mode */          /* UTF-8 mode */
2411          if (utf8)          if (utf8)
2412            {            {
2413            register int d;            register unsigned int d;
2414            for (i = min; i < max; i++)            for (i = min; i < max; i++)
2415              {              {
2416              int len = 1;              int len = 1;
# Line 2278  for (;;) Line 2420  for (;;)
2420              if (fc == d) break;              if (fc == d) break;
2421              eptr += len;              eptr += len;
2422              }              }
2423            for(;;)          if (possessive) continue;
2424            for(;;)
2425              {              {
2426              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2427              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
# Line 2295  for (;;) Line 2438  for (;;)
2438              if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;              if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2439              eptr++;              eptr++;
2440              }              }
2441              if (possessive) continue;
2442            while (eptr >= pp)            while (eptr >= pp)
2443              {              {
2444              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2316  for (;;) Line 2460  for (;;)
2460        /* UTF-8 mode */        /* UTF-8 mode */
2461        if (utf8)        if (utf8)
2462          {          {
2463          register int d;          register unsigned int d;
2464          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2465            {            {
2466            GETCHARINC(d, eptr);            GETCHARINC(d, eptr);
# Line 2339  for (;;) Line 2483  for (;;)
2483          /* UTF-8 mode */          /* UTF-8 mode */
2484          if (utf8)          if (utf8)
2485            {            {
2486            register int d;            register unsigned int d;
2487            for (fi = min;; fi++)            for (fi = min;; fi++)
2488              {              {
2489              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2374  for (;;) Line 2518  for (;;)
2518          /* UTF-8 mode */          /* UTF-8 mode */
2519          if (utf8)          if (utf8)
2520            {            {
2521            register int d;            register unsigned int d;
2522            for (i = min; i < max; i++)            for (i = min; i < max; i++)
2523              {              {
2524              int len = 1;              int len = 1;
# Line 2383  for (;;) Line 2527  for (;;)
2527              if (fc == d) break;              if (fc == d) break;
2528              eptr += len;              eptr += len;
2529              }              }
2530              if (possessive) continue;
2531            for(;;)            for(;;)
2532              {              {
2533              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2400  for (;;) Line 2545  for (;;)
2545              if (eptr >= md->end_subject || fc == *eptr) break;              if (eptr >= md->end_subject || fc == *eptr) break;
2546              eptr++;              eptr++;
2547              }              }
2548              if (possessive) continue;
2549            while (eptr >= pp)            while (eptr >= pp)
2550              {              {
2551              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2431  for (;;) Line 2577  for (;;)
2577      ecode += 3;      ecode += 3;
2578      goto REPEATTYPE;      goto REPEATTYPE;
2579    
2580        case OP_TYPEPOSSTAR:
2581        possessive = TRUE;
2582        min = 0;
2583        max = INT_MAX;
2584        ecode++;
2585        goto REPEATTYPE;
2586    
2587        case OP_TYPEPOSPLUS:
2588        possessive = TRUE;
2589        min = 1;
2590        max = INT_MAX;
2591        ecode++;
2592        goto REPEATTYPE;
2593    
2594        case OP_TYPEPOSQUERY:
2595        possessive = TRUE;
2596        min = 0;
2597        max = 1;
2598        ecode++;
2599        goto REPEATTYPE;
2600    
2601        case OP_TYPEPOSUPTO:
2602        possessive = TRUE;
2603        min = 0;
2604        max = GET2(ecode, 1);
2605        ecode += 3;
2606        goto REPEATTYPE;
2607    
2608      case OP_TYPESTAR:      case OP_TYPESTAR:
2609      case OP_TYPEMINSTAR:      case OP_TYPEMINSTAR:
2610      case OP_TYPEPLUS:      case OP_TYPEPLUS:
# Line 2533  for (;;) Line 2707  for (;;)
2707    
2708            default:            default:
2709            RRETURN(PCRE_ERROR_INTERNAL);            RRETURN(PCRE_ERROR_INTERNAL);
           break;  
2710            }            }
2711          }          }
2712    
# Line 2573  for (;;) Line 2746  for (;;)
2746          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2747            {            {
2748            if (eptr >= md->end_subject ||            if (eptr >= md->end_subject ||
2749               (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))                 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2750              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
2751              eptr++;
2752            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2753            }            }
2754          break;          break;
# Line 2583  for (;;) Line 2757  for (;;)
2757          eptr += min;          eptr += min;
2758          break;          break;
2759    
2760            case OP_ANYNL:
2761            for (i = 1; i <= min; i++)
2762              {
2763              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2764              GETCHARINC(c, eptr);
2765              switch(c)
2766                {
2767                default: RRETURN(MATCH_NOMATCH);
2768                case 0x000d:
2769                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2770                break;
2771                case 0x000a:
2772                case 0x000b:
2773                case 0x000c:
2774                case 0x0085:
2775                case 0x2028:
2776                case 0x2029:
2777                break;
2778                }
2779              }
2780            break;
2781    
2782          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
2783          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2784            {            {
# Line 2651  for (;;) Line 2847  for (;;)
2847  #endif     /* SUPPORT_UTF8 */  #endif     /* SUPPORT_UTF8 */
2848    
2849        /* Code for the non-UTF-8 case for minimum matching of operators other        /* Code for the non-UTF-8 case for minimum matching of operators other
2850        than OP_PROP and OP_NOTPROP. */        than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2851          number of bytes present, as this was tested above. */
2852    
2853        switch(ctype)        switch(ctype)
2854          {          {
# Line 2659  for (;;) Line 2856  for (;;)
2856          if ((ims & PCRE_DOTALL) == 0)          if ((ims & PCRE_DOTALL) == 0)
2857            {            {
2858            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2859              if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);              {
2860                if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2861                eptr++;
2862                }
2863            }            }
2864          else eptr += min;          else eptr += min;
2865          break;          break;
# Line 2668  for (;;) Line 2868  for (;;)
2868          eptr += min;          eptr += min;
2869          break;          break;
2870    
2871            /* Because of the CRLF case, we can't assume the minimum number of
2872            bytes are present in this case. */
2873    
2874            case OP_ANYNL:
2875            for (i = 1; i <= min; i++)
2876              {
2877              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2878              switch(*eptr++)
2879                {
2880                default: RRETURN(MATCH_NOMATCH);
2881                case 0x000d:
2882                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2883                break;
2884                case 0x000a:
2885                case 0x000b:
2886                case 0x000c:
2887                case 0x0085:
2888                break;
2889                }
2890              }
2891            break;
2892    
2893          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
2894          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2895            if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);            if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
# Line 2729  for (;;) Line 2951  for (;;)
2951              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
2952              if (prop_fail_result) RRETURN(MATCH_NOMATCH);              if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2953              }              }
2954            break;            /* Control never gets here */
2955    
2956            case PT_LAMP:            case PT_LAMP:
2957            for (fi = min;; fi++)            for (fi = min;; fi++)
# Line 2744  for (;;) Line 2966  for (;;)
2966                   prop_chartype == ucp_Lt) == prop_fail_result)                   prop_chartype == ucp_Lt) == prop_fail_result)
2967                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2968              }              }
2969            break;            /* Control never gets here */
2970    
2971            case PT_GC:            case PT_GC:
2972            for (fi = min;; fi++)            for (fi = min;; fi++)
# Line 2757  for (;;) Line 2979  for (;;)
2979              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
2980                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2981              }              }
2982            break;            /* Control never gets here */
2983    
2984            case PT_PC:            case PT_PC:
2985            for (fi = min;; fi++)            for (fi = min;; fi++)
# Line 2770  for (;;) Line 2992  for (;;)
2992              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
2993                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2994              }              }
2995            break;            /* Control never gets here */
2996    
2997            case PT_SC:            case PT_SC:
2998            for (fi = min;; fi++)            for (fi = min;; fi++)
# Line 2783  for (;;) Line 3005  for (;;)
3005              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
3006                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
3007              }              }
3008            break;            /* Control never gets here */
3009    
3010            default:            default:
3011            RRETURN(PCRE_ERROR_INTERNAL);            RRETURN(PCRE_ERROR_INTERNAL);
           break;  
3012            }            }
3013          }          }
3014    
# Line 2829  for (;;) Line 3050  for (;;)
3050            {            {
3051            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3052            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3053            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject ||
3054                   (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3055                    IS_NEWLINE(eptr)))
3056                RRETURN(MATCH_NOMATCH);
3057    
3058            GETCHARINC(c, eptr);            GETCHARINC(c, eptr);
3059            switch(ctype)            switch(ctype)
3060              {              {
3061              case OP_ANY:              case OP_ANY:        /* This is the DOTALL case */
             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);  
3062              break;              break;
3063    
3064              case OP_ANYBYTE:              case OP_ANYBYTE:
3065              break;              break;
3066    
3067                case OP_ANYNL:
3068                switch(c)
3069                  {
3070                  default: RRETURN(MATCH_NOMATCH);
3071                  case 0x000d:
3072                  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3073                  break;
3074                  case 0x000a:
3075                  case 0x000b:
3076                  case 0x000c:
3077                  case 0x0085:
3078                  case 0x2028:
3079                  case 0x2029:
3080                  break;
3081                  }
3082                break;
3083    
3084              case OP_NOT_DIGIT:              case OP_NOT_DIGIT:
3085              if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)              if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3086                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 2884  for (;;) Line 3124  for (;;)
3124            {            {
3125            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3126            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3127            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject ||
3128                   ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3129                RRETURN(MATCH_NOMATCH);
3130    
3131            c = *eptr++;            c = *eptr++;
3132            switch(ctype)            switch(ctype)
3133              {              {
3134              case OP_ANY:              case OP_ANY:   /* This is the DOTALL case */
             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);  
3135              break;              break;
3136    
3137              case OP_ANYBYTE:              case OP_ANYBYTE:
3138              break;              break;
3139    
3140                case OP_ANYNL:
3141                switch(c)
3142                  {
3143                  default: RRETURN(MATCH_NOMATCH);
3144                  case 0x000d:
3145                  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3146                  break;
3147                  case 0x000a:
3148                  case 0x000b:
3149                  case 0x000c:
3150                  case 0x0085:
3151                  break;
3152                  }
3153                break;
3154    
3155              case OP_NOT_DIGIT:              case OP_NOT_DIGIT:
3156              if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);              if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3157              break;              break;
# Line 2927  for (;;) Line 3184  for (;;)
3184        /* Control never gets here */        /* Control never gets here */
3185        }        }
3186    
3187      /* If maximizing it is worth using inline code for speed, doing the type      /* If maximizing, it is worth using inline code for speed, doing the type
3188      test once at the start (i.e. keep it out of the loop). Again, keep the      test once at the start (i.e. keep it out of the loop). Again, keep the
3189      UTF-8 and UCP stuff separate. */      UTF-8 and UCP stuff separate. */
3190    
# Line 3008  for (;;) Line 3265  for (;;)
3265    
3266          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
3267    
3268            if (possessive) continue;
3269          for(;;)          for(;;)
3270            {            {
3271            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 3043  for (;;) Line 3301  for (;;)
3301    
3302          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
3303    
3304            if (possessive) continue;
3305          for(;;)          for(;;)
3306            {            {
3307            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 3075  for (;;) Line 3334  for (;;)
3334            {            {
3335            case OP_ANY:            case OP_ANY:
3336    
3337            /* Special code is required for UTF8, but when the maximum is unlimited            /* Special code is required for UTF8, but when the maximum is
3338            we don't need it, so we repeat the non-UTF8 code. This is probably            unlimited we don't need it, so we repeat the non-UTF8 code. This is
3339            worth it, because .* is quite a common idiom. */            probably worth it, because .* is quite a common idiom. */
3340    
3341            if (max < INT_MAX)            if (max < INT_MAX)
3342              {              {
# Line 3085  for (;;) Line 3344  for (;;)
3344                {                {
3345                for (i = min; i < max; i++)                for (i = min; i < max; i++)
3346                  {                  {
3347                  if (eptr >= md->end_subject || *eptr == NEWLINE) break;                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3348                  eptr++;                  eptr++;
3349                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3350                  }                  }
# Line 3094  for (;;) Line 3353  for (;;)
3353                {                {
3354                for (i = min; i < max; i++)                for (i = min; i < max; i++)
3355                  {                  {
3356                    if (eptr >= md->end_subject) break;
3357                  eptr++;                  eptr++;
3358                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3359                  }                  }
# Line 3108  for (;;) Line 3368  for (;;)
3368                {                {
3369                for (i = min; i < max; i++)                for (i = min; i < max; i++)
3370                  {                  {
3371                  if (eptr >= md->end_subject || *eptr == NEWLINE) break;                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3372                  eptr++;                  eptr++;
3373                  }                  }
3374                break;                break;
# Line 3116  for (;;) Line 3376  for (;;)
3376              else              else
3377                {                {
3378                c = max - min;                c = max - min;
3379                if (c > md->end_subject - eptr) c = md->end_subject - eptr;                if (c > (unsigned int)(md->end_subject - eptr))
3380                    c = md->end_subject - eptr;
3381                eptr += c;                eptr += c;
3382                }                }
3383              }              }
# Line 3126  for (;;) Line 3387  for (;;)
3387    
3388            case OP_ANYBYTE:            case OP_ANYBYTE:
3389            c = max - min;            c = max - min;
3390            if (c > md->end_subject - eptr) c = md->end_subject - eptr;            if (c > (unsigned int)(md->end_subject - eptr))
3391                c = md->end_subject - eptr;
3392            eptr += c;            eptr += c;
3393            break;            break;
3394    
3395              case OP_ANYNL:
3396              for (i = min; i < max; i++)
3397                {
3398                int len = 1;
3399                if (eptr >= md->end_subject) break;
3400                GETCHARLEN(c, eptr, len);
3401                if (c == 0x000d)
3402                  {
3403                  if (++eptr >= md->end_subject) break;
3404                  if (*eptr == 0x000a) eptr++;
3405                  }
3406                else
3407                  {
3408                  if (c != 0x000a && c != 0x000b && c != 0x000c &&
3409                      c != 0x0085 && c != 0x2028 && c != 0x2029)
3410                    break;
3411                  eptr += len;
3412                  }
3413                }
3414              break;
3415    
3416            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
3417            for (i = min; i < max; i++)            for (i = min; i < max; i++)
3418              {              {
# Line 3202  for (;;) Line 3485  for (;;)
3485    
3486          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
3487    
3488            if (possessive) continue;
3489          for(;;)          for(;;)
3490            {            {
3491            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 3222  for (;;) Line 3506  for (;;)
3506              {              {
3507              for (i = min; i < max; i++)              for (i = min; i < max; i++)
3508                {                {
3509                if (eptr >= md->end_subject || *eptr == NEWLINE) break;                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3510                eptr++;                eptr++;
3511                }                }
3512              break;              break;
# Line 3231  for (;;) Line 3515  for (;;)
3515    
3516            case OP_ANYBYTE:            case OP_ANYBYTE:
3517            c = max - min;            c = max - min;
3518            if (c > md->end_subject - eptr) c = md->end_subject - eptr;            if (c > (unsigned int)(md->end_subject - eptr))
3519                c = md->end_subject - eptr;
3520            eptr += c;            eptr += c;
3521            break;            break;
3522    
3523              case OP_ANYNL:
3524              for (i = min; i < max; i++)
3525                {
3526                if (eptr >= md->end_subject) break;
3527                c = *eptr;
3528                if (c == 0x000d)
3529                  {
3530                  if (++eptr >= md->end_subject) break;
3531                  if (*eptr == 0x000a) eptr++;
3532                  }
3533                else
3534                  {
3535                  if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3536                    break;
3537                  eptr++;
3538                  }
3539                }
3540              break;
3541    
3542            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
3543            for (i = min; i < max; i++)            for (i = min; i < max; i++)
3544              {              {
# Line 3295  for (;;) Line 3599  for (;;)
3599    
3600          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
3601    
3602            if (possessive) continue;
3603          while (eptr >= pp)          while (eptr >= pp)
3604            {            {
3605            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 3309  for (;;) Line 3614  for (;;)
3614        }        }
3615      /* Control never gets here */      /* Control never gets here */
3616    
3617      /* There's been some horrible disaster. Since all codes > OP_BRA are      /* There's been some horrible disaster. Arrival here can only mean there is
3618      for capturing brackets, and there shouldn't be any gaps between 0 and      something seriously wrong in the code above or the OP_xxx definitions. */
     OP_BRA, arrival here can only mean there is something seriously wrong  
     in the code above or the OP_xxx definitions. */  
3619    
3620      default:      default:
3621      DPRINTF(("Unknown opcode %d\n", *ecode));      DPRINTF(("Unknown opcode %d\n", *ecode));
3622      RRETURN(PCRE_ERROR_UNKNOWN_NODE);      RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3623      }      }
3624    
3625    /* Do not stick any code in here without much thought; it is assumed    /* Do not stick any code in here without much thought; it is assumed
# Line 3354  Undefine all the macros that were define Line 3657  Undefine all the macros that were define
3657    
3658  #undef cur_is_word  #undef cur_is_word
3659  #undef condition  #undef condition
 #undef minimize  
3660  #undef prev_is_word  #undef prev_is_word
3661    
3662  #undef original_ims  #undef original_ims
# Line 3410  Returns:          > 0 => success; value Line 3712  Returns:          > 0 => success; value
3712                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3713  */  */
3714    
3715  PCRE_DATA_SCOPE int  PCRE_EXP_DEFN int
3716  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3717    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3718    int offsetcount)    int offsetcount)
# Line 3419  int rc, resetcount, ocount; Line 3721  int rc, resetcount, ocount;
3721  int first_byte = -1;  int first_byte = -1;
3722  int req_byte = -1;  int req_byte = -1;
3723  int req_byte2 = -1;  int req_byte2 = -1;
3724  unsigned long int ims = 0;  int newline;
3725    unsigned long int ims;
3726  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
3727  BOOL anchored;  BOOL anchored;
3728  BOOL startline;  BOOL startline;
3729  BOOL firstline;  BOOL firstline;
3730  BOOL first_byte_caseless = FALSE;  BOOL first_byte_caseless = FALSE;
3731  BOOL req_byte_caseless = FALSE;  BOOL req_byte_caseless = FALSE;
3732    BOOL utf8;
3733  match_data match_block;  match_data match_block;
3734    match_data *md = &match_block;
3735  const uschar *tables;  const uschar *tables;
3736  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
3737  USPTR start_match = (USPTR)subject + start_offset;  USPTR start_match = (USPTR)subject + start_offset;
3738  USPTR end_subject;  USPTR end_subject;
3739  USPTR req_byte_ptr = start_match - 1;  USPTR req_byte_ptr = start_match - 1;
3740    eptrblock eptrchain[EPTR_WORK_SIZE];
3741    
3742  pcre_study_data internal_study;  pcre_study_data internal_study;
3743  const pcre_study_data *study;  const pcre_study_data *study;
# Line 3451  if (offsetcount < 0) return PCRE_ERROR_B Line 3757  if (offsetcount < 0) return PCRE_ERROR_B
3757  the default values. */  the default values. */
3758    
3759  study = NULL;  study = NULL;
3760  match_block.match_limit = MATCH_LIMIT;  md->match_limit = MATCH_LIMIT;
3761  match_block.match_limit_recursion = MATCH_LIMIT_RECURSION;  md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3762  match_block.callout_data = NULL;  md->callout_data = NULL;
3763    
3764  /* The table pointer is always in native byte order. */  /* The table pointer is always in native byte order. */
3765    
# Line 3465  if (extra_data != NULL) Line 3771  if (extra_data != NULL)
3771    if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)    if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3772      study = (const pcre_study_data *)extra_data->study_data;      study = (const pcre_study_data *)extra_data->study_data;
3773    if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)    if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3774      match_block.match_limit = extra_data->match_limit;      md->match_limit = extra_data->match_limit;
3775    if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)    if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3776      match_block.match_limit_recursion = extra_data->match_limit_recursion;      md->match_limit_recursion = extra_data->match_limit_recursion;
3777    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3778      match_block.callout_data = extra_data->callout_data;      md->callout_data = extra_data->callout_data;
3779    if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;    if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3780    }    }
3781    
# Line 3499  firstline = (re->options & PCRE_FIRSTLIN Line 3805  firstline = (re->options & PCRE_FIRSTLIN
3805    
3806  /* The code starts after the real_pcre block and the capture name table. */  /* The code starts after the real_pcre block and the capture name table. */
3807    
3808  match_block.start_code = (const uschar *)external_re + re->name_table_offset +  md->start_code = (const uschar *)external_re + re->name_table_offset +
3809    re->name_count * re->name_entry_size;    re->name_count * re->name_entry_size;
3810    
3811  match_block.start_subject = (USPTR)subject;  md->start_subject = (USPTR)subject;
3812  match_block.start_offset = start_offset;  md->start_offset = start_offset;
3813  match_block.end_subject = match_block.start_subject + length;  md->end_subject = md->start_subject + length;
3814  end_subject = match_block.end_subject;  end_subject = md->end_subject;
3815    
3816  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3817  match_block.utf8 = (re->options & PCRE_UTF8) != 0;  utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3818    
3819  match_block.notbol = (options & PCRE_NOTBOL) != 0;  md->notbol = (options & PCRE_NOTBOL) != 0;
3820  match_block.noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
3821  match_block.notempty = (options & PCRE_NOTEMPTY) != 0;  md->notempty = (options & PCRE_NOTEMPTY) != 0;
3822  match_block.partial = (options & PCRE_PARTIAL) != 0;  md->partial = (options & PCRE_PARTIAL) != 0;
3823  match_block.hitend = FALSE;  md->hitend = FALSE;
3824    
3825    md->recursive = NULL;                   /* No recursion at top level */
3826    md->eptrchain = eptrchain;              /* Make workspace generally available */
3827    
3828  match_block.recursive = NULL;                   /* No recursion at top level */  md->lcc = tables + lcc_offset;
3829    md->ctypes = tables + ctypes_offset;
3830    
3831  match_block.lcc = tables + lcc_offset;  /* Handle different types of newline. The three bits give eight cases. If
3832  match_block.ctypes = tables + ctypes_offset;  nothing is set at run time, whatever was used at compile time applies. */
3833    
3834    switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3835           PCRE_NEWLINE_BITS)
3836      {
3837      case 0: newline = NEWLINE; break;   /* Compile-time default */
3838      case PCRE_NEWLINE_CR: newline = '\r'; break;
3839      case PCRE_NEWLINE_LF: newline = '\n'; break;
3840      case PCRE_NEWLINE_CR+
3841           PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3842      case PCRE_NEWLINE_ANY: newline = -1; break;
3843      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3844      default: return PCRE_ERROR_BADNEWLINE;
3845      }
3846    
3847    if (newline == -2)
3848      {
3849      md->nltype = NLTYPE_ANYCRLF;
3850      }
3851    else if (newline < 0)
3852      {
3853      md->nltype = NLTYPE_ANY;
3854      }
3855    else
3856      {
3857      md->nltype = NLTYPE_FIXED;
3858      if (newline > 255)
3859        {
3860        md->nllen = 2;
3861        md->nl[0] = (newline >> 8) & 255;
3862        md->nl[1] = newline & 255;
3863        }
3864      else
3865        {
3866        md->nllen = 1;
3867        md->nl[0] = newline;
3868        }
3869      }
3870    
3871  /* Partial matching is supported only for a restricted set of regexes at the  /* Partial matching is supported only for a restricted set of regexes at the
3872  moment. */  moment. */
3873    
3874  if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)  if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3875    return PCRE_ERROR_BADPARTIAL;    return PCRE_ERROR_BADPARTIAL;
3876    
3877  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3878  back the character offset. */  back the character offset. */
3879    
3880  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3881  if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3882    {    {
3883    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3884      return PCRE_ERROR_BADUTF8;      return PCRE_ERROR_BADUTF8;
# Line 3563  ocount = offsetcount - (offsetcount % 3) Line 3910  ocount = offsetcount - (offsetcount % 3)
3910  if (re->top_backref > 0 && re->top_backref >= ocount/3)  if (re->top_backref > 0 && re->top_backref >= ocount/3)
3911    {    {
3912    ocount = re->top_backref * 3 + 3;    ocount = re->top_backref * 3 + 3;
3913    match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));    md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3914    if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;    if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3915    using_temporary_offsets = TRUE;    using_temporary_offsets = TRUE;
3916    DPRINTF(("Got memory to hold back references\n"));    DPRINTF(("Got memory to hold back references\n"));
3917    }    }
3918  else match_block.offset_vector = offsets;  else md->offset_vector = offsets;
3919    
3920  match_block.offset_end = ocount;  md->offset_end = ocount;
3921  match_block.offset_max = (2*ocount)/3;  md->offset_max = (2*ocount)/3;
3922  match_block.offset_overflow = FALSE;  md->offset_overflow = FALSE;
3923  match_block.capture_last = -1;  md->capture_last = -1;
3924    
3925  /* Compute the minimum number of offsets that we need to reset each time. Doing  /* Compute the minimum number of offsets that we need to reset each time. Doing
3926  this makes a huge difference to execution time when there aren't many brackets  this makes a huge difference to execution time when there aren't many brackets
# Line 3586  if (resetcount > offsetcount) resetcount Line 3933  if (resetcount > offsetcount) resetcount
3933  never be used unless previously set, but they get saved and restored, and so we  never be used unless previously set, but they get saved and restored, and so we
3934  initialize them to avoid reading uninitialized locations. */  initialize them to avoid reading uninitialized locations. */
3935    
3936  if (match_block.offset_vector != NULL)  if (md->offset_vector != NULL)
3937    {    {
3938    register int *iptr = match_block.offset_vector + ocount;    register int *iptr = md->offset_vector + ocount;
3939    register int *iend = iptr - resetcount/2 + 1;    register int *iend = iptr - resetcount/2 + 1;
3940    while (--iptr >= iend) *iptr = -1;    while (--iptr >= iend) *iptr = -1;
3941    }    }
# Line 3605  if (!anchored) Line 3952  if (!anchored)
3952      {      {
3953      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
3954      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3955        first_byte = match_block.lcc[first_byte];        first_byte = md->lcc[first_byte];
3956      }      }
3957    else    else
3958      if (!startline && study != NULL &&      if (!startline && study != NULL &&
# Line 3623  if ((re->options & PCRE_REQCHSET) != 0) Line 3970  if ((re->options & PCRE_REQCHSET) != 0)
3970    req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */    req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */
3971    }    }
3972    
3973    
3974    /* ==========================================================================*/
3975    
3976  /* Loop for handling unanchored repeated matching attempts; for anchored regexs  /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3977  the loop runs just once. */  the loop runs just once. */
3978    
3979  do  for(;;)
3980    {    {
3981    USPTR save_end_subject = end_subject;    USPTR save_end_subject = end_subject;
3982    
3983    /* Reset the maximum number of extractions we might see. */    /* Reset the maximum number of extractions we might see. */
3984    
3985    if (match_block.offset_vector != NULL)    if (md->offset_vector != NULL)
3986      {      {
3987      register int *iptr = match_block.offset_vector;      register int *iptr = md->offset_vector;
3988      register int *iend = iptr + resetcount;      register int *iend = iptr + resetcount;
3989      while (iptr < iend) *iptr++ = -1;      while (iptr < iend) *iptr++ = -1;
3990      }      }
3991    
3992    /* Advance to a unique first char if possible. If firstline is TRUE, the    /* Advance to a unique first char if possible. If firstline is TRUE, the
3993    start of the match is constrained to the first line of a multiline string.    start of the match is constrained to the first line of a multiline string.
3994    Implement this by temporarily adjusting end_subject so that we stop scanning    That is, the match must be before or at the first newline. Implement this by
3995    at a newline. If the match fails at the newline, later code breaks this loop.    temporarily adjusting end_subject so that we stop scanning at a newline. If
3996    */    the match fails at the newline, later code breaks this loop. */
3997    
3998    if (firstline)    if (firstline)
3999      {      {
4000      USPTR t = start_match;      USPTR t = start_match;
4001      while (t < save_end_subject && *t != '\n') t++;      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4002      end_subject = t;      end_subject = t;
4003      }      }
4004    
# Line 3658  do Line 4008  do
4008      {      {
4009      if (first_byte_caseless)      if (first_byte_caseless)
4010        while (start_match < end_subject &&        while (start_match < end_subject &&
4011               match_block.lcc[*start_match] != first_byte)               md->lcc[*start_match] != first_byte)
4012          start_match++;          start_match++;
4013      else      else
4014        while (start_match < end_subject && *start_match != first_byte)        while (start_match < end_subject && *start_match != first_byte)
4015          start_match++;          start_match++;
4016      }      }
4017    
4018    /* Or to just after \n for a multiline match if possible */    /* Or to just after a linebreak for a multiline match if possible */
4019    
4020    else if (startline)    else if (startline)
4021      {      {
4022      if (start_match > match_block.start_subject + start_offset)      if (start_match > md->start_subject + start_offset)
4023        {        {
4024        while (start_match < end_subject && start_match[-1] != NEWLINE)        while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4025            start_match++;
4026    
4027          /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4028          and we are now at a LF, advance the match position by one more character.
4029          */
4030    
4031          if (start_match[-1] == '\r' &&
4032               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4033               start_match < end_subject &&
4034               *start_match == '\n')
4035          start_match++;          start_match++;
4036        }        }
4037      }      }
# Line 3693  do Line 4053  do
4053    
4054  #ifdef DEBUG  /* Sigh. Some compilers never learn. */  #ifdef DEBUG  /* Sigh. Some compilers never learn. */
4055    printf(">>>> Match against: ");    printf(">>>> Match against: ");
4056    pchars(start_match, end_subject - start_match, TRUE, &match_block);    pchars(start_match, end_subject - start_match, TRUE, md);
4057    printf("\n");    printf("\n");
4058  #endif  #endif
4059    
# Line 3707  do Line 4067  do
4067    
4068    HOWEVER: when the subject string is very, very long, searching to its end can    HOWEVER: when the subject string is very, very long, searching to its end can
4069    take a long time, and give bad performance on quite ordinary patterns. This    take a long time, and give bad performance on quite ordinary patterns. This
4070    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4071    don't do this when the string is sufficiently long.    string... so we don't do this when the string is sufficiently long.
4072    
4073    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested.
4074    */    */
4075    
4076    if (req_byte >= 0 &&    if (req_byte >= 0 &&
4077        end_subject - start_match < REQ_BYTE_MAX &&        end_subject - start_match < REQ_BYTE_MAX &&
4078        !match_block.partial)        !md->partial)
4079      {      {
4080      register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);      register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4081    
# Line 3740  do Line 4100  do
4100            }            }
4101          }          }
4102    
4103        /* If we can't find the required character, break the matching loop */        /* If we can't find the required character, break the matching loop,
4104          forcing a match failure. */
4105    
4106        if (p >= end_subject) break;        if (p >= end_subject)
4107            {
4108            rc = MATCH_NOMATCH;
4109            break;
4110            }
4111    
4112        /* If we have found the required character, save the point where we        /* If we have found the required character, save the point where we
4113        found it, so that we don't search again next time round the loop if        found it, so that we don't search again next time round the loop if
# Line 3752  do Line 4117  do
4117        }        }
4118      }      }
4119    
4120    /* When a match occurs, substrings will be set for all internal extractions;    /* OK, we can now run the match. */
   we just need to set up the whole thing as substring 0 before returning. If  
   there were too many extractions, set the return code to zero. In the case  
   where we had to get some local store to hold offsets for backreferences, copy  
   those back references that we can. In this case there need not be overflow  
   if certain parts of the pattern were not used. */  
   
   match_block.start_match = start_match;  
   match_block.match_call_count = 0;  
   
   rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,  
     match_isgroup, 0);  
   
   /* When the result is no match, if the subject's first character was a  
   newline and the PCRE_FIRSTLINE option is set, break (which will return  
   PCRE_ERROR_NOMATCH). The option requests that a match occur before the first  
   newline in the subject. Otherwise, advance the pointer to the next character  
   and continue - but the continuation will actually happen only when the  
   pattern is not anchored. */  
4121    
4122    if (rc == MATCH_NOMATCH)    md->start_match = start_match;
4123      {    md->match_call_count = 0;
4124      if (firstline && *start_match == NEWLINE) break;    md->eptrn = 0;                          /* Next free eptrchain slot */
4125      start_match++;    rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
4126    
4127      /* Any return other than MATCH_NOMATCH breaks the loop. */
4128    
4129      if (rc != MATCH_NOMATCH) break;
4130    
4131      /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4132      newline in the subject (though it may continue over the newline). Therefore,
4133      if we have just failed to match, starting at a newline, do not continue. */
4134    
4135      if (firstline && IS_NEWLINE(start_match)) break;
4136    
4137      /* Advance the match position by one character. */
4138    
4139      start_match++;
4140  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4141      if (match_block.utf8)    if (utf8)
4142        while(start_match < end_subject && (*start_match & 0xc0) == 0x80)      while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4143          start_match++;        start_match++;
4144  #endif  #endif
4145      continue;  
4146      }    /* Break the loop if the pattern is anchored or if we have passed the end of
4147      the subject. */
4148    
4149      if (anchored || start_match > end_subject) break;
4150    
4151      /* If we have just passed a CR and the newline option is CRLF or ANY or
4152      ANYCRLF, and we are now at a LF, advance the match position by one more
4153      character. */
4154    
4155      if (start_match[-1] == '\r' &&
4156           (md->nltype == NLTYPE_ANY ||
4157            md->nltype == NLTYPE_ANYCRLF ||
4158            md->nllen == 2) &&
4159           start_match < end_subject &&
4160           *start_match == '\n')
4161        start_match++;
4162    
4163    if (rc != MATCH_MATCH)    }   /* End of for(;;) "bumpalong" loop */
4164      {  
4165      DPRINTF((">>>> error: returning %d\n", rc));  /* ==========================================================================*/
4166      return rc;  
4167      }  /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4168    conditions is true:
4169    
4170    /* We have a match! Copy the offset information from temporary store if  (1) The pattern is anchored;
   necessary */  
4171    
4172    (2) We are past the end of the subject;
4173    
4174    (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4175        this option requests that a match occur at or before the first newline in
4176        the subject.
4177    
4178    When we have a match and the offset vector is big enough to deal with any
4179    backreferences, captured substring offsets will already be set up. In the case
4180    where we had to get some local store to hold offsets for backreference
4181    processing, copy those that we can. In this case there need not be overflow if
4182    certain parts of the pattern were not used, even though there are more
4183    capturing parentheses than vector slots. */
4184    
4185    if (rc == MATCH_MATCH)
4186      {
4187    if (using_temporary_offsets)    if (using_temporary_offsets)
4188      {      {
4189      if (offsetcount >= 4)      if (offsetcount >= 4)
4190        {        {
4191        memcpy(offsets + 2, match_block.offset_vector + 2,        memcpy(offsets + 2, md->offset_vector + 2,
4192          (offsetcount - 2) * sizeof(int));          (offsetcount - 2) * sizeof(int));
4193        DPRINTF(("Copied offsets from temporary memory\n"));        DPRINTF(("Copied offsets from temporary memory\n"));
4194        }        }
4195      if (match_block.end_offset_top > offsetcount)      if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
       match_block.offset_overflow = TRUE;  
   
4196      DPRINTF(("Freeing temporary memory\n"));      DPRINTF(("Freeing temporary memory\n"));
4197      (pcre_free)(match_block.offset_vector);      (pcre_free)(md->offset_vector);
4198      }      }
4199    
4200    rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;    /* Set the return code to the number of captured strings, or 0 if there are
4201      too many to fit into the vector. */
4202    
4203      rc = md->offset_overflow? 0 : md->end_offset_top/2;
4204    
4205      /* If there is space, set up the whole thing as substring 0. */
4206    
4207    if (offsetcount < 2) rc = 0; else    if (offsetcount < 2) rc = 0; else
4208      {      {
4209      offsets[0] = start_match - match_block.start_subject;      offsets[0] = start_match - md->start_subject;
4210      offsets[1] = match_block.end_match_ptr - match_block.start_subject;      offsets[1] = md->end_match_ptr - md->start_subject;
4211      }      }
4212    
4213    DPRINTF((">>>> returning %d\n", rc));    DPRINTF((">>>> returning %d\n", rc));
4214    return rc;    return rc;
4215    }    }
4216    
4217  /* This "while" is the end of the "do" above */  /* Control gets here if there has been an error, or if the overall match
4218    attempt has failed at all permitted starting positions. */
 while (!anchored && start_match <= end_subject);  
4219    
4220  if (using_temporary_offsets)  if (using_temporary_offsets)
4221    {    {
4222    DPRINTF(("Freeing temporary memory\n"));    DPRINTF(("Freeing temporary memory\n"));
4223    (pcre_free)(match_block.offset_vector);    (pcre_free)(md->offset_vector);
4224    }    }
4225    
4226  if (match_block.partial && match_block.hitend)  if (rc != MATCH_NOMATCH)
4227      {
4228      DPRINTF((">>>> error: returning %d\n", rc));
4229      return rc;
4230      }
4231    else if (md->partial && md->hitend)
4232    {    {
4233    DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));    DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4234    return PCRE_ERROR_PARTIAL;    return PCRE_ERROR_PARTIAL;

Legend:
Removed from v.87  
changed lines
  Added in v.149

  ViewVC Help
Powered by ViewVC 1.1.5