/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 168 by ph10, Tue May 29 15:18:18 2007 UTC | revision 197 by ph10, Tue Jul 31 10:50:18 2007 UTC
# Line 53  possible. There are also some static sup Line 53  possible. There are also some static sup
53  #undef min  #undef min
54  #undef max  #undef max
55    
 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,  
 obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */  
   
 #define EPTR_WORK_SIZE (1000)  
   
56  /* Flag bits for the match() function */  /* Flag bits for the match() function */
57    
58  #define match_condassert     0x01  /* Called to check a condition assertion */  #define match_condassert     0x01  /* Called to check a condition assertion */
59  #define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */  #define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */
 #define match_tail_recursed  0x04  /* Tail recursive call */  
60    
61  /* Non-error returns from the match() function. Error returns are externally  /* Non-error returns from the match() function. Error returns are externally
62  defined PCRE_ERROR_xxx codes, which are all negative. */  defined PCRE_ERROR_xxx codes, which are all negative. */
# Line 212  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM Line 206  enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM
206         RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,         RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
207         RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,         RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
208         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
209         RM41,  RM42, RM43, RM44, RM45, RM46, RM47 };         RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50 };
210    
211    
212  /* These versions of the macros use the stack, as normal. There are debugging  /* These versions of the macros use the stack, as normal. There are debugging
# Line 292  typedef struct heapframe { Line 286  typedef struct heapframe {
286    
287    const uschar *Xeptr;    const uschar *Xeptr;
288    const uschar *Xecode;    const uschar *Xecode;
289    const uschar *Xmstart;    const uschar *Xmstart;
290    int Xoffset_top;    int Xoffset_top;
291    long int Xims;    long int Xims;
292    eptrblock *Xeptrb;    eptrblock *Xeptrb;
# Line 374  Arguments: Line 368  Arguments:
368     eptr        pointer to current character in subject     eptr        pointer to current character in subject
369     ecode       pointer to current position in compiled code     ecode       pointer to current position in compiled code
370     mstart      pointer to the current match start position (can be modified     mstart      pointer to the current match start position (can be modified
371                   by encountering \K)                   by encountering \K)
372     offset_top  current top pointer     offset_top  current top pointer
373     md          pointer to "static" info for the match     md          pointer to "static" info for the match
374     ims         current /i, /m, and /s options     ims         current /i, /m, and /s options
# Line 384  Arguments: Line 378  Arguments:
378                   match_condassert - this is an assertion condition                   match_condassert - this is an assertion condition
379                   match_cbegroup - this is the start of an unlimited repeat                   match_cbegroup - this is the start of an unlimited repeat
380                     group that can match an empty string                     group that can match an empty string
                  match_tail_recursed - this is a tail_recursed group  
381     rdepth      the recursion depth     rdepth      the recursion depth
382    
383  Returns:       MATCH_MATCH if matched            )  these values are >= 0  Returns:       MATCH_MATCH if matched            )  these values are >= 0
# Line 394  Returns:       MATCH_MATCH if matched Line 387  Returns:       MATCH_MATCH if matched
387  */  */
388    
389  static int  static int
390  match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,  match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
391    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
392    int flags, unsigned int rdepth)    int flags, unsigned int rdepth)
393  {  {
# Line 586  original_ims = ims;    /* Save for reset Line 579  original_ims = ims;    /* Save for reset
579  string, the match_cbegroup flag is set. When this is the case, add the current  string, the match_cbegroup flag is set. When this is the case, add the current
580  subject pointer to the chain of such remembered pointers, to be checked when we  subject pointer to the chain of such remembered pointers, to be checked when we
581  hit the closing ket, in order to break infinite loops that match no characters.  hit the closing ket, in order to break infinite loops that match no characters.
582  When match() is called in other circumstances, don't add to the chain. If this  When match() is called in other circumstances, don't add to the chain. The
583  is a tail recursion, use a block from the workspace, as the one on the stack is  match_cbegroup flag must NOT be used with tail recursion, because the memory
584  already used. */  block that is used is on the stack, so a new one may be required for each
585    match(). */
586    
587  if ((flags & match_cbegroup) != 0)  if ((flags & match_cbegroup) != 0)
588    {    {
589    eptrblock *p;    newptrb.epb_saved_eptr = eptr;
590    if ((flags & match_tail_recursed) != 0)    newptrb.epb_prev = eptrb;
591      {    eptrb = &newptrb;
     if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);  
     p = md->eptrchain + md->eptrn++;  
     }  
   else p = &newptrb;  
   p->epb_saved_eptr = eptr;  
   p->epb_prev = eptrb;  
   eptrb = p;  
592    }    }
593    
594  /* Now start processing the opcodes. */  /* Now start processing the opcodes. */
# Line 677  for (;;) Line 664  for (;;)
664        RRETURN(MATCH_NOMATCH);        RRETURN(MATCH_NOMATCH);
665        }        }
666    
667      /* Insufficient room for saving captured contents. Treat as a non-capturing      /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
668      bracket. */      as a non-capturing bracket. */
669    
670        /* VVVVVVVVVVVVVVVVVVVVVVVVV */
671        /* VVVVVVVVVVVVVVVVVVVVVVVVV */
672    
673      DPRINTF(("insufficient capture room: treat as non-capturing\n"));      DPRINTF(("insufficient capture room: treat as non-capturing\n"));
674    
675        /* VVVVVVVVVVVVVVVVVVVVVVVVV */
676        /* VVVVVVVVVVVVVVVVVVVVVVVVV */
677    
678      /* Non-capturing bracket. Loop for all the alternatives. When we get to the      /* Non-capturing bracket. Loop for all the alternatives. When we get to the
679      final alternative within the brackets, we would return the result of a      final alternative within the brackets, we would return the result of a
680      recursive call to match() whatever happened. We can reduce stack usage by      recursive call to match() whatever happened. We can reduce stack usage by
681      turning this into a tail recursion. */      turning this into a tail recursion, except in the case when match_cbegroup
682        is set. */
683    
684      case OP_BRA:      case OP_BRA:
685      case OP_SBRA:      case OP_SBRA:
# Line 693  for (;;) Line 687  for (;;)
687      flags = (op >= OP_SBRA)? match_cbegroup : 0;      flags = (op >= OP_SBRA)? match_cbegroup : 0;
688      for (;;)      for (;;)
689        {        {
690        if (ecode[GET(ecode, 1)] != OP_ALT)        if (ecode[GET(ecode, 1)] != OP_ALT)   /* Final alternative */
691          {          {
692          ecode += _pcre_OP_lengths[*ecode];          if (flags == 0)    /* Not a possibly empty group */
693          flags |= match_tail_recursed;            {
694          DPRINTF(("bracket 0 tail recursion\n"));            ecode += _pcre_OP_lengths[*ecode];
695          goto TAIL_RECURSE;            DPRINTF(("bracket 0 tail recursion\n"));
696              goto TAIL_RECURSE;
697              }
698    
699            /* Possibly empty group; can't use tail recursion. */
700    
701            RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
702              eptrb, flags, RM48);
703            RRETURN(rrc);
704          }          }
705    
706        /* For non-final alternatives, continue the loop for a NOMATCH result;        /* For non-final alternatives, continue the loop for a NOMATCH result;
# Line 766  for (;;) Line 768  for (;;)
768        }        }
769    
770      /* We are now at the branch that is to be obeyed. As there is only one,      /* We are now at the branch that is to be obeyed. As there is only one,
771      we can use tail recursion to avoid using another stack frame. If the second      we can use tail recursion to avoid using another stack frame, except when
772      alternative doesn't exist, we can just plough on. */      match_cbegroup is required for an unlimited repeat of a possibly empty
773        group. If the second alternative doesn't exist, we can just plough on. */
774    
775      if (condition || *ecode == OP_ALT)      if (condition || *ecode == OP_ALT)
776        {        {
777        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
778        flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);        if (op == OP_SCOND)        /* Possibly empty group */
779        goto TAIL_RECURSE;          {
780            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
781            RRETURN(rrc);
782            }
783          else                       /* Group must match something */
784            {
785            flags = 0;
786            goto TAIL_RECURSE;
787            }
788        }        }
789      else      else                         /* Condition false & no 2nd alternative */
790        {        {
791        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
792        }        }
# Line 1027  for (;;) Line 1038  for (;;)
1038    
1039      do      do
1040        {        {
1041        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
         eptrb, 0, RM7);  
1042        if (rrc == MATCH_MATCH) break;        if (rrc == MATCH_MATCH) break;
1043        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1044        ecode += GET(ecode,1);        ecode += GET(ecode,1);
# Line 1073  for (;;) Line 1083  for (;;)
1083    
1084      if (*ecode == OP_KETRMIN)      if (*ecode == OP_KETRMIN)
1085        {        {
1086        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0,        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
         RM8);  
1087        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1088        ecode = prev;        ecode = prev;
1089        flags = match_tail_recursed;        flags = 0;
1090        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1091        }        }
1092      else  /* OP_KETRMAX */      else  /* OP_KETRMAX */
# Line 1085  for (;;) Line 1094  for (;;)
1094        RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);        RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1095        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1096        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1097        flags = match_tail_recursed;        flags = 0;
1098        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1099        }        }
1100      /* Control never gets here */      /* Control never gets here */
# Line 1216  for (;;) Line 1225  for (;;)
1225    
1226      /* The repeating kets try the rest of the pattern or restart from the      /* The repeating kets try the rest of the pattern or restart from the
1227      preceding bracket, in the appropriate order. In the second case, we can use      preceding bracket, in the appropriate order. In the second case, we can use
1228      tail recursion to avoid using another stack frame. */      tail recursion to avoid using another stack frame, unless we have an
1229        unlimited repeat of a group that can match an empty string. */
1230    
1231      flags = (*prev >= OP_SBRA)? match_cbegroup : 0;      flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1232    
1233      if (*ecode == OP_KETRMIN)      if (*ecode == OP_KETRMIN)
1234        {        {
1235        RMATCH(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0,        RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
         RM12);  
1236        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1237          if (flags != 0)    /* Could match an empty string */
1238            {
1239            RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1240            RRETURN(rrc);
1241            }
1242        ecode = prev;        ecode = prev;
       flags |= match_tail_recursed;  
1243        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1244        }        }
1245      else  /* OP_KETRMAX */      else  /* OP_KETRMAX */
# Line 1234  for (;;) Line 1247  for (;;)
1247        RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);        RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1248        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1249        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1250        flags = match_tail_recursed;        flags = 0;
1251        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1252        }        }
1253      /* Control never gets here */      /* Control never gets here */
# Line 1266  for (;;) Line 1279  for (;;)
1279      if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);      if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1280      ecode++;      ecode++;
1281      break;      break;
1282    
1283      /* Reset the start of match point */      /* Reset the start of match point */
1284    
1285      case OP_SET_SOM:      case OP_SET_SOM:
1286      mstart = eptr;      mstart = eptr;
1287      ecode++;      ecode++;
1288      break;      break;
1289    
1290      /* Assert before internal newline if multiline, or before a terminating      /* Assert before internal newline if multiline, or before a terminating
1291      newline unless endonly is set, else end of subject unless noteol is set. */      newline unless endonly is set, else end of subject unless noteol is set. */
# Line 1482  for (;;) Line 1495  for (;;)
1495      ecode++;      ecode++;
1496      break;      break;
1497    
1498        case OP_NOT_HSPACE:
1499        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1500        GETCHARINCTEST(c, eptr);
1501        switch(c)
1502          {
1503          default: break;
1504          case 0x09:      /* HT */
1505          case 0x20:      /* SPACE */
1506          case 0xa0:      /* NBSP */
1507          case 0x1680:    /* OGHAM SPACE MARK */
1508          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1509          case 0x2000:    /* EN QUAD */
1510          case 0x2001:    /* EM QUAD */
1511          case 0x2002:    /* EN SPACE */
1512          case 0x2003:    /* EM SPACE */
1513          case 0x2004:    /* THREE-PER-EM SPACE */
1514          case 0x2005:    /* FOUR-PER-EM SPACE */
1515          case 0x2006:    /* SIX-PER-EM SPACE */
1516          case 0x2007:    /* FIGURE SPACE */
1517          case 0x2008:    /* PUNCTUATION SPACE */
1518          case 0x2009:    /* THIN SPACE */
1519          case 0x200A:    /* HAIR SPACE */
1520          case 0x202f:    /* NARROW NO-BREAK SPACE */
1521          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1522          case 0x3000:    /* IDEOGRAPHIC SPACE */
1523          RRETURN(MATCH_NOMATCH);
1524          }
1525        ecode++;
1526        break;
1527    
1528        case OP_HSPACE:
1529        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1530        GETCHARINCTEST(c, eptr);
1531        switch(c)
1532          {
1533          default: RRETURN(MATCH_NOMATCH);
1534          case 0x09:      /* HT */
1535          case 0x20:      /* SPACE */
1536          case 0xa0:      /* NBSP */
1537          case 0x1680:    /* OGHAM SPACE MARK */
1538          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1539          case 0x2000:    /* EN QUAD */
1540          case 0x2001:    /* EM QUAD */
1541          case 0x2002:    /* EN SPACE */
1542          case 0x2003:    /* EM SPACE */
1543          case 0x2004:    /* THREE-PER-EM SPACE */
1544          case 0x2005:    /* FOUR-PER-EM SPACE */
1545          case 0x2006:    /* SIX-PER-EM SPACE */
1546          case 0x2007:    /* FIGURE SPACE */
1547          case 0x2008:    /* PUNCTUATION SPACE */
1548          case 0x2009:    /* THIN SPACE */
1549          case 0x200A:    /* HAIR SPACE */
1550          case 0x202f:    /* NARROW NO-BREAK SPACE */
1551          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1552          case 0x3000:    /* IDEOGRAPHIC SPACE */
1553          break;
1554          }
1555        ecode++;
1556        break;
1557    
1558        case OP_NOT_VSPACE:
1559        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1560        GETCHARINCTEST(c, eptr);
1561        switch(c)
1562          {
1563          default: break;
1564          case 0x0a:      /* LF */
1565          case 0x0b:      /* VT */
1566          case 0x0c:      /* FF */
1567          case 0x0d:      /* CR */
1568          case 0x85:      /* NEL */
1569          case 0x2028:    /* LINE SEPARATOR */
1570          case 0x2029:    /* PARAGRAPH SEPARATOR */
1571          RRETURN(MATCH_NOMATCH);
1572          }
1573        ecode++;
1574        break;
1575    
1576        case OP_VSPACE:
1577        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1578        GETCHARINCTEST(c, eptr);
1579        switch(c)
1580          {
1581          default: RRETURN(MATCH_NOMATCH);
1582          case 0x0a:      /* LF */
1583          case 0x0b:      /* VT */
1584          case 0x0c:      /* FF */
1585          case 0x0d:      /* CR */
1586          case 0x85:      /* NEL */
1587          case 0x2028:    /* LINE SEPARATOR */
1588          case 0x2029:    /* PARAGRAPH SEPARATOR */
1589          break;
1590          }
1591        ecode++;
1592        break;
1593    
1594  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1595      /* Check the next character by Unicode property. We will get here only      /* Check the next character by Unicode property. We will get here only
1596      if the support is in the binary; otherwise a compile-time error occurs. */      if the support is in the binary; otherwise a compile-time error occurs. */
# Line 2690  for (;;) Line 2799  for (;;)
2799            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2800              {              {
2801              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2802              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2803              }              }
2804            break;            break;
2805    
# Line 2698  for (;;) Line 2807  for (;;)
2807            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2808              {              {
2809              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2810              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2811              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2812              if ((prop_chartype == ucp_Lu ||              if ((prop_chartype == ucp_Lu ||
2813                   prop_chartype == ucp_Ll ||                   prop_chartype == ucp_Ll ||
# Line 2711  for (;;) Line 2820  for (;;)
2820            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2821              {              {
2822              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2823              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2824              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2825              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
2826                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 2722  for (;;) Line 2831  for (;;)
2831            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2832              {              {
2833              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2834              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2835              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2836              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
2837                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 2733  for (;;) Line 2842  for (;;)
2842            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2843              {              {
2844              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2845              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2846              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2847              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
2848                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 2814  for (;;) Line 2923  for (;;)
2923            }            }
2924          break;          break;
2925    
2926            case OP_NOT_HSPACE:
2927            for (i = 1; i <= min; i++)
2928              {
2929              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2930              GETCHARINC(c, eptr);
2931              switch(c)
2932                {
2933                default: break;
2934                case 0x09:      /* HT */
2935                case 0x20:      /* SPACE */
2936                case 0xa0:      /* NBSP */
2937                case 0x1680:    /* OGHAM SPACE MARK */
2938                case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2939                case 0x2000:    /* EN QUAD */
2940                case 0x2001:    /* EM QUAD */
2941                case 0x2002:    /* EN SPACE */
2942                case 0x2003:    /* EM SPACE */
2943                case 0x2004:    /* THREE-PER-EM SPACE */
2944                case 0x2005:    /* FOUR-PER-EM SPACE */
2945                case 0x2006:    /* SIX-PER-EM SPACE */
2946                case 0x2007:    /* FIGURE SPACE */
2947                case 0x2008:    /* PUNCTUATION SPACE */
2948                case 0x2009:    /* THIN SPACE */
2949                case 0x200A:    /* HAIR SPACE */
2950                case 0x202f:    /* NARROW NO-BREAK SPACE */
2951                case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2952                case 0x3000:    /* IDEOGRAPHIC SPACE */
2953                RRETURN(MATCH_NOMATCH);
2954                }
2955              }
2956            break;
2957    
2958            case OP_HSPACE:
2959            for (i = 1; i <= min; i++)
2960              {
2961              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2962              GETCHARINC(c, eptr);
2963              switch(c)
2964                {
2965                default: RRETURN(MATCH_NOMATCH);
2966                case 0x09:      /* HT */
2967                case 0x20:      /* SPACE */
2968                case 0xa0:      /* NBSP */
2969                case 0x1680:    /* OGHAM SPACE MARK */
2970                case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2971                case 0x2000:    /* EN QUAD */
2972                case 0x2001:    /* EM QUAD */
2973                case 0x2002:    /* EN SPACE */
2974                case 0x2003:    /* EM SPACE */
2975                case 0x2004:    /* THREE-PER-EM SPACE */
2976                case 0x2005:    /* FOUR-PER-EM SPACE */
2977                case 0x2006:    /* SIX-PER-EM SPACE */
2978                case 0x2007:    /* FIGURE SPACE */
2979                case 0x2008:    /* PUNCTUATION SPACE */
2980                case 0x2009:    /* THIN SPACE */
2981                case 0x200A:    /* HAIR SPACE */
2982                case 0x202f:    /* NARROW NO-BREAK SPACE */
2983                case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2984                case 0x3000:    /* IDEOGRAPHIC SPACE */
2985                break;
2986                }
2987              }
2988            break;
2989    
2990            case OP_NOT_VSPACE:
2991            for (i = 1; i <= min; i++)
2992              {
2993              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2994              GETCHARINC(c, eptr);
2995              switch(c)
2996                {
2997                default: break;
2998                case 0x0a:      /* LF */
2999                case 0x0b:      /* VT */
3000                case 0x0c:      /* FF */
3001                case 0x0d:      /* CR */
3002                case 0x85:      /* NEL */
3003                case 0x2028:    /* LINE SEPARATOR */
3004                case 0x2029:    /* PARAGRAPH SEPARATOR */
3005                RRETURN(MATCH_NOMATCH);
3006                }
3007              }
3008            break;
3009    
3010            case OP_VSPACE:
3011            for (i = 1; i <= min; i++)
3012              {
3013              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3014              GETCHARINC(c, eptr);
3015              switch(c)
3016                {
3017                default: RRETURN(MATCH_NOMATCH);
3018                case 0x0a:      /* LF */
3019                case 0x0b:      /* VT */
3020                case 0x0c:      /* FF */
3021                case 0x0d:      /* CR */
3022                case 0x85:      /* NEL */
3023                case 0x2028:    /* LINE SEPARATOR */
3024                case 0x2029:    /* PARAGRAPH SEPARATOR */
3025                break;
3026                }
3027              }
3028            break;
3029    
3030          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
3031          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
3032            {            {
# Line 2925  for (;;) Line 3138  for (;;)
3138            }            }
3139          break;          break;
3140    
3141            case OP_NOT_HSPACE:
3142            for (i = 1; i <= min; i++)
3143              {
3144              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3145              switch(*eptr++)
3146                {
3147                default: break;
3148                case 0x09:      /* HT */
3149                case 0x20:      /* SPACE */
3150                case 0xa0:      /* NBSP */
3151                RRETURN(MATCH_NOMATCH);
3152                }
3153              }
3154            break;
3155    
3156            case OP_HSPACE:
3157            for (i = 1; i <= min; i++)
3158              {
3159              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3160              switch(*eptr++)
3161                {
3162                default: RRETURN(MATCH_NOMATCH);
3163                case 0x09:      /* HT */
3164                case 0x20:      /* SPACE */
3165                case 0xa0:      /* NBSP */
3166                break;
3167                }
3168              }
3169            break;
3170    
3171            case OP_NOT_VSPACE:
3172            for (i = 1; i <= min; i++)
3173              {
3174              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3175              switch(*eptr++)
3176                {
3177                default: break;
3178                case 0x0a:      /* LF */
3179                case 0x0b:      /* VT */
3180                case 0x0c:      /* FF */
3181                case 0x0d:      /* CR */
3182                case 0x85:      /* NEL */
3183                RRETURN(MATCH_NOMATCH);
3184                }
3185              }
3186            break;
3187    
3188            case OP_VSPACE:
3189            for (i = 1; i <= min; i++)
3190              {
3191              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3192              switch(*eptr++)
3193                {
3194                default: RRETURN(MATCH_NOMATCH);
3195                case 0x0a:      /* LF */
3196                case 0x0b:      /* VT */
3197                case 0x0c:      /* FF */
3198                case 0x0d:      /* CR */
3199                case 0x85:      /* NEL */
3200                break;
3201                }
3202              }
3203            break;
3204    
3205          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
3206          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
3207            if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);            if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
# Line 3116  for (;;) Line 3393  for (;;)
3393                }                }
3394              break;              break;
3395    
3396                case OP_NOT_HSPACE:
3397                switch(c)
3398                  {
3399                  default: break;
3400                  case 0x09:      /* HT */
3401                  case 0x20:      /* SPACE */
3402                  case 0xa0:      /* NBSP */
3403                  case 0x1680:    /* OGHAM SPACE MARK */
3404                  case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
3405                  case 0x2000:    /* EN QUAD */
3406                  case 0x2001:    /* EM QUAD */
3407                  case 0x2002:    /* EN SPACE */
3408                  case 0x2003:    /* EM SPACE */
3409                  case 0x2004:    /* THREE-PER-EM SPACE */
3410                  case 0x2005:    /* FOUR-PER-EM SPACE */
3411                  case 0x2006:    /* SIX-PER-EM SPACE */
3412                  case 0x2007:    /* FIGURE SPACE */
3413                  case 0x2008:    /* PUNCTUATION SPACE */
3414                  case 0x2009:    /* THIN SPACE */
3415                  case 0x200A:    /* HAIR SPACE */
3416                  case 0x202f:    /* NARROW NO-BREAK SPACE */
3417                  case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
3418                  case 0x3000:    /* IDEOGRAPHIC SPACE */
3419                  RRETURN(MATCH_NOMATCH);
3420                  }
3421                break;
3422    
3423                case OP_HSPACE:
3424                switch(c)
3425                  {
3426                  default: RRETURN(MATCH_NOMATCH);
3427                  case 0x09:      /* HT */
3428                  case 0x20:      /* SPACE */
3429                  case 0xa0:      /* NBSP */
3430                  case 0x1680:    /* OGHAM SPACE MARK */
3431                  case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
3432                  case 0x2000:    /* EN QUAD */
3433                  case 0x2001:    /* EM QUAD */
3434                  case 0x2002:    /* EN SPACE */
3435                  case 0x2003:    /* EM SPACE */
3436                  case 0x2004:    /* THREE-PER-EM SPACE */
3437                  case 0x2005:    /* FOUR-PER-EM SPACE */
3438                  case 0x2006:    /* SIX-PER-EM SPACE */
3439                  case 0x2007:    /* FIGURE SPACE */
3440                  case 0x2008:    /* PUNCTUATION SPACE */
3441                  case 0x2009:    /* THIN SPACE */
3442                  case 0x200A:    /* HAIR SPACE */
3443                  case 0x202f:    /* NARROW NO-BREAK SPACE */
3444                  case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
3445                  case 0x3000:    /* IDEOGRAPHIC SPACE */
3446                  break;
3447                  }
3448                break;
3449    
3450                case OP_NOT_VSPACE:
3451                switch(c)
3452                  {
3453                  default: break;
3454                  case 0x0a:      /* LF */
3455                  case 0x0b:      /* VT */
3456                  case 0x0c:      /* FF */
3457                  case 0x0d:      /* CR */
3458                  case 0x85:      /* NEL */
3459                  case 0x2028:    /* LINE SEPARATOR */
3460                  case 0x2029:    /* PARAGRAPH SEPARATOR */
3461                  RRETURN(MATCH_NOMATCH);
3462                  }
3463                break;
3464    
3465                case OP_VSPACE:
3466                switch(c)
3467                  {
3468                  default: RRETURN(MATCH_NOMATCH);
3469                  case 0x0a:      /* LF */
3470                  case 0x0b:      /* VT */
3471                  case 0x0c:      /* FF */
3472                  case 0x0d:      /* CR */
3473                  case 0x85:      /* NEL */
3474                  case 0x2028:    /* LINE SEPARATOR */
3475                  case 0x2029:    /* PARAGRAPH SEPARATOR */
3476                  break;
3477                  }
3478                break;
3479    
3480              case OP_NOT_DIGIT:              case OP_NOT_DIGIT:
3481              if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)              if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3482                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 3187  for (;;) Line 3548  for (;;)
3548                }                }
3549              break;              break;
3550    
3551                case OP_NOT_HSPACE:
3552                switch(c)
3553                  {
3554                  default: break;
3555                  case 0x09:      /* HT */
3556                  case 0x20:      /* SPACE */
3557                  case 0xa0:      /* NBSP */
3558                  RRETURN(MATCH_NOMATCH);
3559                  }
3560                break;
3561    
3562                case OP_HSPACE:
3563                switch(c)
3564                  {
3565                  default: RRETURN(MATCH_NOMATCH);
3566                  case 0x09:      /* HT */
3567                  case 0x20:      /* SPACE */
3568                  case 0xa0:      /* NBSP */
3569                  break;
3570                  }
3571                break;
3572    
3573                case OP_NOT_VSPACE:
3574                switch(c)
3575                  {
3576                  default: break;
3577                  case 0x0a:      /* LF */
3578                  case 0x0b:      /* VT */
3579                  case 0x0c:      /* FF */
3580                  case 0x0d:      /* CR */
3581                  case 0x85:      /* NEL */
3582                  RRETURN(MATCH_NOMATCH);
3583                  }
3584                break;
3585    
3586                case OP_VSPACE:
3587                switch(c)
3588                  {
3589                  default: RRETURN(MATCH_NOMATCH);
3590                  case 0x0a:      /* LF */
3591                  case 0x0b:      /* VT */
3592                  case 0x0c:      /* FF */
3593                  case 0x0d:      /* CR */
3594                  case 0x85:      /* NEL */
3595                  break;
3596                  }
3597                break;
3598    
3599              case OP_NOT_DIGIT:              case OP_NOT_DIGIT:
3600              if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);              if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3601              break;              break;
# Line 3368  for (;;) Line 3777  for (;;)
3777          switch(ctype)          switch(ctype)
3778            {            {
3779            case OP_ANY:            case OP_ANY:
   
           /* Special code is required for UTF8, but when the maximum is  
           unlimited we don't need it, so we repeat the non-UTF8 code. This is  
           probably worth it, because .* is quite a common idiom. */  
   
3780            if (max < INT_MAX)            if (max < INT_MAX)
3781              {              {
3782              if ((ims & PCRE_DOTALL) == 0)              if ((ims & PCRE_DOTALL) == 0)
# Line 3405  for (;;) Line 3809  for (;;)
3809                  {                  {
3810                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3811                  eptr++;                  eptr++;
3812                    while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3813                  }                  }
               break;  
3814                }                }
3815              else              else
3816                {                {
3817                c = max - min;                eptr = md->end_subject;
               if (c > (unsigned int)(md->end_subject - eptr))  
                 c = md->end_subject - eptr;  
               eptr += c;  
3818                }                }
3819              }              }
3820            break;            break;
# Line 3448  for (;;) Line 3849  for (;;)
3849              }              }
3850            break;            break;
3851    
3852              case OP_NOT_HSPACE:
3853              case OP_HSPACE:
3854              for (i = min; i < max; i++)
3855                {
3856                BOOL gotspace;
3857                int len = 1;
3858                if (eptr >= md->end_subject) break;
3859                GETCHARLEN(c, eptr, len);
3860                switch(c)
3861                  {
3862                  default: gotspace = FALSE; break;
3863                  case 0x09:      /* HT */
3864                  case 0x20:      /* SPACE */
3865                  case 0xa0:      /* NBSP */
3866                  case 0x1680:    /* OGHAM SPACE MARK */
3867                  case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
3868                  case 0x2000:    /* EN QUAD */
3869                  case 0x2001:    /* EM QUAD */
3870                  case 0x2002:    /* EN SPACE */
3871                  case 0x2003:    /* EM SPACE */
3872                  case 0x2004:    /* THREE-PER-EM SPACE */
3873                  case 0x2005:    /* FOUR-PER-EM SPACE */
3874                  case 0x2006:    /* SIX-PER-EM SPACE */
3875                  case 0x2007:    /* FIGURE SPACE */
3876                  case 0x2008:    /* PUNCTUATION SPACE */
3877                  case 0x2009:    /* THIN SPACE */
3878                  case 0x200A:    /* HAIR SPACE */
3879                  case 0x202f:    /* NARROW NO-BREAK SPACE */
3880                  case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
3881                  case 0x3000:    /* IDEOGRAPHIC SPACE */
3882                  gotspace = TRUE;
3883                  break;
3884                  }
3885                if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3886                eptr += len;
3887                }
3888              break;
3889    
3890              case OP_NOT_VSPACE:
3891              case OP_VSPACE:
3892              for (i = min; i < max; i++)
3893                {
3894                BOOL gotspace;
3895                int len = 1;
3896                if (eptr >= md->end_subject) break;
3897                GETCHARLEN(c, eptr, len);
3898                switch(c)
3899                  {
3900                  default: gotspace = FALSE; break;
3901                  case 0x0a:      /* LF */
3902                  case 0x0b:      /* VT */
3903                  case 0x0c:      /* FF */
3904                  case 0x0d:      /* CR */
3905                  case 0x85:      /* NEL */
3906                  case 0x2028:    /* LINE SEPARATOR */
3907                  case 0x2029:    /* PARAGRAPH SEPARATOR */
3908                  gotspace = TRUE;
3909                  break;
3910                  }
3911                if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3912                eptr += len;
3913                }
3914              break;
3915    
3916            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
3917            for (i = min; i < max; i++)            for (i = min; i < max; i++)
3918              {              {
# Line 3574  for (;;) Line 4039  for (;;)
4039              }              }
4040            break;            break;
4041    
4042              case OP_NOT_HSPACE:
4043              for (i = min; i < max; i++)
4044                {
4045                if (eptr >= md->end_subject) break;
4046                c = *eptr;
4047                if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4048                eptr++;
4049                }
4050              break;
4051    
4052              case OP_HSPACE:
4053              for (i = min; i < max; i++)
4054                {
4055                if (eptr >= md->end_subject) break;
4056                c = *eptr;
4057                if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4058                eptr++;
4059                }
4060              break;
4061    
4062              case OP_NOT_VSPACE:
4063              for (i = min; i < max; i++)
4064                {
4065                if (eptr >= md->end_subject) break;
4066                c = *eptr;
4067                if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4068                  break;
4069                eptr++;
4070                }
4071              break;
4072    
4073              case OP_VSPACE:
4074              for (i = min; i < max; i++)
4075                {
4076                if (eptr >= md->end_subject) break;
4077                c = *eptr;
4078                if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4079                  break;
4080                eptr++;
4081                }
4082              break;
4083    
4084            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
4085            for (i = min; i < max; i++)            for (i = min; i < max; i++)
4086              {              {
# Line 3796  const uschar *start_bits = NULL; Line 4303  const uschar *start_bits = NULL;
4303  USPTR start_match = (USPTR)subject + start_offset;  USPTR start_match = (USPTR)subject + start_offset;
4304  USPTR end_subject;  USPTR end_subject;
4305  USPTR req_byte_ptr = start_match - 1;  USPTR req_byte_ptr = start_match - 1;
 eptrblock eptrchain[EPTR_WORK_SIZE];  
4306    
4307  pcre_study_data internal_study;  pcre_study_data internal_study;
4308  const pcre_study_data *study;  const pcre_study_data *study;
# Line 3882  md->partial = (options & PCRE_PARTIAL) ! Line 4388  md->partial = (options & PCRE_PARTIAL) !
4388  md->hitend = FALSE;  md->hitend = FALSE;
4389    
4390  md->recursive = NULL;                   /* No recursion at top level */  md->recursive = NULL;                   /* No recursion at top level */
 md->eptrchain = eptrchain;              /* Make workspace generally available */  
4391    
4392  md->lcc = tables + lcc_offset;  md->lcc = tables + lcc_offset;
4393  md->ctypes = tables + ctypes_offset;  md->ctypes = tables + ctypes_offset;
# Line 4180  for(;;) Line 4685  for(;;)
4685    
4686    md->start_match_ptr = start_match;      /* Insurance */    md->start_match_ptr = start_match;      /* Insurance */
4687    md->match_call_count = 0;    md->match_call_count = 0;
4688    md->eptrn = 0;                          /* Next free eptrchain slot */    rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
   rc = match(start_match, md->start_code, start_match, 2, md,  
     ims, NULL, 0, 0);  
4689    
4690    /* Any return other than MATCH_NOMATCH breaks the loop. */    /* Any return other than MATCH_NOMATCH breaks the loop. */
4691    
# Line 4263  if (rc == MATCH_MATCH) Line 4766  if (rc == MATCH_MATCH)
4766    rc = md->offset_overflow? 0 : md->end_offset_top/2;    rc = md->offset_overflow? 0 : md->end_offset_top/2;
4767    
4768    /* If there is space, set up the whole thing as substring 0. The value of    /* If there is space, set up the whole thing as substring 0. The value of
4769    md->start_match_ptr might be modified if \K was encountered on the success    md->start_match_ptr might be modified if \K was encountered on the success
4770    matching path. */    matching path. */
4771    
4772    if (offsetcount < 2) rc = 0; else    if (offsetcount < 2) rc = 0; else

Legend:
Removed from v.168  
changed lines
  Added in v.197

  ViewVC Help
Powered by ViewVC 1.1.5