/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 182 by ph10, Wed Jun 13 15:09:54 2007 UTC revision 200 by ph10, Wed Aug 1 09:10:40 2007 UTC
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  pattern matching using an NFA algorithm, trying to mimic Perl as closely as  pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43  possible. There are also some static supporting functions. */  possible. There are also some static supporting functions. */
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
50  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
51  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 53  possible. There are also some static sup Line 57  possible. There are also some static sup
57  #undef min  #undef min
58  #undef max  #undef max
59    
 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,  
 obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */  
   
 #define EPTR_WORK_SIZE (1000)  
   
60  /* Flag bits for the match() function */  /* Flag bits for the match() function */
61    
62  #define match_condassert     0x01  /* Called to check a condition assertion */  #define match_condassert     0x01  /* Called to check a condition assertion */
63  #define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */  #define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */
 #define match_tail_recursed  0x04  /* Tail recursive call */  
64    
65  /* Non-error returns from the match() function. Error returns are externally  /* Non-error returns from the match() function. Error returns are externally
66  defined PCRE_ERROR_xxx codes, which are all negative. */  defined PCRE_ERROR_xxx codes, which are all negative. */
# Line 212  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM Line 210  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM
210         RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,         RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
211         RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,         RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
212         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
213         RM41,  RM42, RM43, RM44, RM45, RM46, RM47 };         RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50 };
214    
215    
216  /* These versions of the macros use the stack, as normal. There are debugging  /* These versions of the macros use the stack, as normal. There are debugging
# Line 384  Arguments: Line 382  Arguments:
382                   match_condassert - this is an assertion condition                   match_condassert - this is an assertion condition
383                   match_cbegroup - this is the start of an unlimited repeat                   match_cbegroup - this is the start of an unlimited repeat
384                     group that can match an empty string                     group that can match an empty string
                  match_tail_recursed - this is a tail_recursed group  
385     rdepth      the recursion depth     rdepth      the recursion depth
386    
387  Returns:       MATCH_MATCH if matched            )  these values are >= 0  Returns:       MATCH_MATCH if matched            )  these values are >= 0
# Line 586  original_ims = ims;    /* Save for reset Line 583  original_ims = ims;    /* Save for reset
583  string, the match_cbegroup flag is set. When this is the case, add the current  string, the match_cbegroup flag is set. When this is the case, add the current
584  subject pointer to the chain of such remembered pointers, to be checked when we  subject pointer to the chain of such remembered pointers, to be checked when we
585  hit the closing ket, in order to break infinite loops that match no characters.  hit the closing ket, in order to break infinite loops that match no characters.
586  When match() is called in other circumstances, don't add to the chain. If this  When match() is called in other circumstances, don't add to the chain. The
587  is a tail recursion, use a block from the workspace, as the one on the stack is  match_cbegroup flag must NOT be used with tail recursion, because the memory
588  already used. */  block that is used is on the stack, so a new one may be required for each
589    match(). */
590    
591  if ((flags & match_cbegroup) != 0)  if ((flags & match_cbegroup) != 0)
592    {    {
593    eptrblock *p;    newptrb.epb_saved_eptr = eptr;
594    if ((flags & match_tail_recursed) != 0)    newptrb.epb_prev = eptrb;
595      {    eptrb = &newptrb;
     if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);  
     p = md->eptrchain + md->eptrn++;  
     }  
   else p = &newptrb;  
   p->epb_saved_eptr = eptr;  
   p->epb_prev = eptrb;  
   eptrb = p;  
596    }    }
597    
598  /* Now start processing the opcodes. */  /* Now start processing the opcodes. */
# Line 677  for (;;) Line 668  for (;;)
668        RRETURN(MATCH_NOMATCH);        RRETURN(MATCH_NOMATCH);
669        }        }
670    
671      /* Insufficient room for saving captured contents. Treat as a non-capturing      /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
672      bracket. */      as a non-capturing bracket. */
673    
674        /* VVVVVVVVVVVVVVVVVVVVVVVVV */
675        /* VVVVVVVVVVVVVVVVVVVVVVVVV */
676    
677      DPRINTF(("insufficient capture room: treat as non-capturing\n"));      DPRINTF(("insufficient capture room: treat as non-capturing\n"));
678    
679        /* VVVVVVVVVVVVVVVVVVVVVVVVV */
680        /* VVVVVVVVVVVVVVVVVVVVVVVVV */
681    
682      /* Non-capturing bracket. Loop for all the alternatives. When we get to the      /* Non-capturing bracket. Loop for all the alternatives. When we get to the
683      final alternative within the brackets, we would return the result of a      final alternative within the brackets, we would return the result of a
684      recursive call to match() whatever happened. We can reduce stack usage by      recursive call to match() whatever happened. We can reduce stack usage by
685      turning this into a tail recursion. */      turning this into a tail recursion, except in the case when match_cbegroup
686        is set.*/
687    
688      case OP_BRA:      case OP_BRA:
689      case OP_SBRA:      case OP_SBRA:
# Line 693  for (;;) Line 691  for (;;)
691      flags = (op >= OP_SBRA)? match_cbegroup : 0;      flags = (op >= OP_SBRA)? match_cbegroup : 0;
692      for (;;)      for (;;)
693        {        {
694        if (ecode[GET(ecode, 1)] != OP_ALT)        if (ecode[GET(ecode, 1)] != OP_ALT)   /* Final alternative */
695          {          {
696          ecode += _pcre_OP_lengths[*ecode];          if (flags == 0)    /* Not a possibly empty group */
697          flags |= match_tail_recursed;            {
698          DPRINTF(("bracket 0 tail recursion\n"));            ecode += _pcre_OP_lengths[*ecode];
699          goto TAIL_RECURSE;            DPRINTF(("bracket 0 tail recursion\n"));
700              goto TAIL_RECURSE;
701              }
702    
703            /* Possibly empty group; can't use tail recursion. */
704    
705            RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
706              eptrb, flags, RM48);
707            RRETURN(rrc);
708          }          }
709    
710        /* For non-final alternatives, continue the loop for a NOMATCH result;        /* For non-final alternatives, continue the loop for a NOMATCH result;
# Line 766  for (;;) Line 772  for (;;)
772        }        }
773    
774      /* We are now at the branch that is to be obeyed. As there is only one,      /* We are now at the branch that is to be obeyed. As there is only one,
775      we can use tail recursion to avoid using another stack frame. If the second      we can use tail recursion to avoid using another stack frame, except when
776      alternative doesn't exist, we can just plough on. */      match_cbegroup is required for an unlimited repeat of a possibly empty
777        group. If the second alternative doesn't exist, we can just plough on. */
778    
779      if (condition || *ecode == OP_ALT)      if (condition || *ecode == OP_ALT)
780        {        {
781        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
782        flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);        if (op == OP_SCOND)        /* Possibly empty group */
783        goto TAIL_RECURSE;          {
784            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
785            RRETURN(rrc);
786            }
787          else                       /* Group must match something */
788            {
789            flags = 0;
790            goto TAIL_RECURSE;
791            }
792        }        }
793      else      else                         /* Condition false & no 2nd alternative */
794        {        {
795        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
796        }        }
# Line 1027  for (;;) Line 1042  for (;;)
1042    
1043      do      do
1044        {        {
1045        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
         eptrb, 0, RM7);  
1046        if (rrc == MATCH_MATCH) break;        if (rrc == MATCH_MATCH) break;
1047        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1048        ecode += GET(ecode,1);        ecode += GET(ecode,1);
# Line 1073  for (;;) Line 1087  for (;;)
1087    
1088      if (*ecode == OP_KETRMIN)      if (*ecode == OP_KETRMIN)
1089        {        {
1090        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0,        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
         RM8);  
1091        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1092        ecode = prev;        ecode = prev;
1093        flags = match_tail_recursed;        flags = 0;
1094        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1095        }        }
1096      else  /* OP_KETRMAX */      else  /* OP_KETRMAX */
# Line 1085  for (;;) Line 1098  for (;;)
1098        RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);        RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1099        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1100        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1101        flags = match_tail_recursed;        flags = 0;
1102        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1103        }        }
1104      /* Control never gets here */      /* Control never gets here */
# Line 1216  for (;;) Line 1229  for (;;)
1229    
1230      /* The repeating kets try the rest of the pattern or restart from the      /* The repeating kets try the rest of the pattern or restart from the
1231      preceding bracket, in the appropriate order. In the second case, we can use      preceding bracket, in the appropriate order. In the second case, we can use
1232      tail recursion to avoid using another stack frame. */      tail recursion to avoid using another stack frame, unless we have an
1233        unlimited repeat of a group that can match an empty string. */
1234    
1235      flags = (*prev >= OP_SBRA)? match_cbegroup : 0;      flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1236    
1237      if (*ecode == OP_KETRMIN)      if (*ecode == OP_KETRMIN)
1238        {        {
1239        RMATCH(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0,        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
         RM12);  
1240        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1241          if (flags != 0)    /* Could match an empty string */
1242            {
1243            RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1244            RRETURN(rrc);
1245            }
1246        ecode = prev;        ecode = prev;
       flags |= match_tail_recursed;  
1247        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1248        }        }
1249      else  /* OP_KETRMAX */      else  /* OP_KETRMAX */
# Line 1234  for (;;) Line 1251  for (;;)
1251        RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);        RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1252        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1253        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1254        flags = match_tail_recursed;        flags = 0;
1255        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1256        }        }
1257      /* Control never gets here */      /* Control never gets here */
# Line 2786  for (;;) Line 2803  for (;;)
2803            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2804              {              {
2805              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2806              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2807              }              }
2808            break;            break;
2809    
# Line 2794  for (;;) Line 2811  for (;;)
2811            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2812              {              {
2813              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2814              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2815              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2816              if ((prop_chartype == ucp_Lu ||              if ((prop_chartype == ucp_Lu ||
2817                   prop_chartype == ucp_Ll ||                   prop_chartype == ucp_Ll ||
# Line 2807  for (;;) Line 2824  for (;;)
2824            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2825              {              {
2826              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2827              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2828              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2829              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
2830                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 2818  for (;;) Line 2835  for (;;)
2835            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2836              {              {
2837              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2838              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2839              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2840              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
2841                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 2829  for (;;) Line 2846  for (;;)
2846            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2847              {              {
2848              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2849              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2850              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2851              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
2852                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 3764  for (;;) Line 3781  for (;;)
3781          switch(ctype)          switch(ctype)
3782            {            {
3783            case OP_ANY:            case OP_ANY:
   
           /* Special code is required for UTF8, but when the maximum is  
           unlimited we don't need it, so we repeat the non-UTF8 code. This is  
           probably worth it, because .* is quite a common idiom. */  
   
3784            if (max < INT_MAX)            if (max < INT_MAX)
3785              {              {
3786              if ((ims & PCRE_DOTALL) == 0)              if ((ims & PCRE_DOTALL) == 0)
# Line 3801  for (;;) Line 3813  for (;;)
3813                  {                  {
3814                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3815                  eptr++;                  eptr++;
3816                    while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3817                  }                  }
               break;  
3818                }                }
3819              else              else
3820                {                {
3821                c = max - min;                eptr = md->end_subject;
               if (c > (unsigned int)(md->end_subject - eptr))  
                 c = md->end_subject - eptr;  
               eptr += c;  
3822                }                }
3823              }              }
3824            break;            break;
# Line 4298  const uschar *start_bits = NULL; Line 4307  const uschar *start_bits = NULL;
4307  USPTR start_match = (USPTR)subject + start_offset;  USPTR start_match = (USPTR)subject + start_offset;
4308  USPTR end_subject;  USPTR end_subject;
4309  USPTR req_byte_ptr = start_match - 1;  USPTR req_byte_ptr = start_match - 1;
 eptrblock eptrchain[EPTR_WORK_SIZE];  
4310    
4311  pcre_study_data internal_study;  pcre_study_data internal_study;
4312  const pcre_study_data *study;  const pcre_study_data *study;
# Line 4384  md->partial = (options & PCRE_PARTIAL) ! Line 4392  md->partial = (options & PCRE_PARTIAL) !
4392  md->hitend = FALSE;  md->hitend = FALSE;
4393    
4394  md->recursive = NULL;                   /* No recursion at top level */  md->recursive = NULL;                   /* No recursion at top level */
 md->eptrchain = eptrchain;              /* Make workspace generally available */  
4395    
4396  md->lcc = tables + lcc_offset;  md->lcc = tables + lcc_offset;
4397  md->ctypes = tables + ctypes_offset;  md->ctypes = tables + ctypes_offset;
# Line 4682  for(;;) Line 4689  for(;;)
4689    
4690    md->start_match_ptr = start_match;      /* Insurance */    md->start_match_ptr = start_match;      /* Insurance */
4691    md->match_call_count = 0;    md->match_call_count = 0;
4692    md->eptrn = 0;                          /* Next free eptrchain slot */    rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
   rc = match(start_match, md->start_code, start_match, 2, md,  
     ims, NULL, 0, 0);  
4693    
4694    /* Any return other than MATCH_NOMATCH breaks the loop. */    /* Any return other than MATCH_NOMATCH breaks the loop. */
4695    

Legend:
Removed from v.182  
changed lines
  Added in v.200

  ViewVC Help
Powered by ViewVC 1.1.5