/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 569 by ph10, Sun Nov 7 16:14:50 2010 UTC | revision 595 by ph10, Mon May 2 10:33:29 2011 UTC
# Line 132  while (length-- > 0) Line 132  while (length-- > 0)
132  *          Match a back-reference                *  *          Match a back-reference                *
133  *************************************************/  *************************************************/
134    
135  /* If a back reference hasn't been set, the length that is passed is greater  /* Normally, if a back reference hasn't been set, the length that is passed is
136  than the number of characters left in the string, so the match fails.  negative, so the match always fails. However, in JavaScript compatibility mode,
137    the length passed is zero. Note that in caseless UTF-8 mode, the number of
138    subject bytes matched may be different to the number of reference bytes.
139    
140  Arguments:  Arguments:
141    offset      index into the offset vector    offset      index into the offset vector
142    eptr        points into the subject    eptr        pointer into the subject
143    length      length to be matched    length      length of reference to be matched (number of bytes)
144    md          points to match data block    md          points to match data block
145    ims         the ims flags    ims         the ims flags
146    
147  Returns:      TRUE if matched  Returns:      < 0 if not matched, otherwise the number of subject bytes matched
148  */  */
149    
150  static BOOL  static int
151  match_ref(int offset, register USPTR eptr, int length, match_data *md,  match_ref(int offset, register USPTR eptr, int length, match_data *md,
152    unsigned long int ims)    unsigned long int ims)
153  {  {
154  USPTR p = md->start_subject + md->offset_vector[offset];  USPTR eptr_start = eptr;
155    register USPTR p = md->start_subject + md->offset_vector[offset];
156    
157  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
158  if (eptr >= md->end_subject)  if (eptr >= md->end_subject)
# Line 164  pchars(p, length, FALSE, md); Line 167  pchars(p, length, FALSE, md);
167  printf("\n");  printf("\n");
168  #endif  #endif
169    
170  /* Always fail if not enough characters left */  /* Always fail if reference not set (and not JavaScript compatible). */
171    
172  if (length > md->end_subject - eptr) return FALSE;  if (length < 0) return -1;
173    
174  /* Separate the caseless case for speed. In UTF-8 mode we can only do this  /* Separate the caseless case for speed. In UTF-8 mode we can only do this
175  properly if Unicode properties are supported. Otherwise, we can check only  properly if Unicode properties are supported. Otherwise, we can check only
# Line 178  if ((ims & PCRE_CASELESS) != 0) Line 181  if ((ims & PCRE_CASELESS) != 0)
181  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
182    if (md->utf8)    if (md->utf8)
183      {      {
184      USPTR endptr = eptr + length;      /* Match characters up to the end of the reference. NOTE: the number of
185      while (eptr < endptr)      bytes matched may differ, because there are some characters whose upper and
186        lower case versions code as different numbers of bytes. For example, U+023A
187        (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
188        a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
189        the latter. It is important, therefore, to check the length along the
190        reference, not along the subject (earlier code did this wrong). */
191    
192        USPTR endptr = p + length;
193        while (p < endptr)
194        {        {
195        int c, d;        int c, d;
196        GETCHARINC(c, eptr);        GETCHARINC(c, eptr);
197        GETCHARINC(d, p);        GETCHARINC(d, p);
198        if (c != d && c != UCD_OTHERCASE(d)) return FALSE;        if (c != d && c != UCD_OTHERCASE(d)) return -1;
199        }        }
200      }      }
201    else    else
# Line 195  if ((ims & PCRE_CASELESS) != 0) Line 206  if ((ims & PCRE_CASELESS) != 0)
206    is no UCP support. */    is no UCP support. */
207    
208    while (length-- > 0)    while (length-- > 0)
209      { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }      { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
210    }    }
211    
212  /* In the caseful case, we can just compare the bytes, whether or not we  /* In the caseful case, we can just compare the bytes, whether or not we
213  are in UTF-8 mode. */  are in UTF-8 mode. */
214    
215  else  else
216    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return -1; }
217    
218  return TRUE;  return eptr - eptr_start;
219  }  }
220    
221    
# Line 1705  for (;;) Line 1716  for (;;)
1716        if (eptr < md->end_subject)        if (eptr < md->end_subject)
1717          { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }          { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1718        else        else
1719          {          {
1720          if (md->noteol) MRRETURN(MATCH_NOMATCH);          if (md->noteol) MRRETURN(MATCH_NOMATCH);
1721          SCHECK_PARTIAL();          SCHECK_PARTIAL();
1722          }          }
1723        ecode++;        ecode++;
# Line 1717  for (;;) Line 1728  for (;;)
1728        if (md->noteol) MRRETURN(MATCH_NOMATCH);        if (md->noteol) MRRETURN(MATCH_NOMATCH);
1729        if (!md->endonly) goto ASSERT_NL_OR_EOS;        if (!md->endonly) goto ASSERT_NL_OR_EOS;
1730        }        }
1731    
1732      /* ... else fall through for endonly */      /* ... else fall through for endonly */
1733    
1734      /* End of subject assertion (\z) */      /* End of subject assertion (\z) */
# Line 1735  for (;;) Line 1746  for (;;)
1746      if (eptr < md->end_subject &&      if (eptr < md->end_subject &&
1747          (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))          (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1748        MRRETURN(MATCH_NOMATCH);        MRRETURN(MATCH_NOMATCH);
1749    
1750      /* Either at end of string or \n before end. */      /* Either at end of string or \n before end. */
1751    
1752      SCHECK_PARTIAL();      SCHECK_PARTIAL();
1753      ecode++;      ecode++;
1754      break;      break;
# Line 2252  for (;;) Line 2263  for (;;)
2263      loops). */      loops). */
2264    
2265      case OP_REF:      case OP_REF:
2266        {      offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
2267        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */      ecode += 3;
       ecode += 3;  
2268    
2269        /* If the reference is unset, there are two possibilities:      /* If the reference is unset, there are two possibilities:
2270    
2271        (a) In the default, Perl-compatible state, set the length to be longer      (a) In the default, Perl-compatible state, set the length negative;
2272        than the amount of subject left; this ensures that every attempt at a      this ensures that every attempt at a match fails. We can't just fail
2273        match fails. We can't just fail here, because of the possibility of      here, because of the possibility of quantifiers with zero minima.
       quantifiers with zero minima.  
2274    
2275        (b) If the JavaScript compatibility flag is set, set the length to zero      (b) If the JavaScript compatibility flag is set, set the length to zero
2276        so that the back reference matches an empty string.      so that the back reference matches an empty string.
2277    
2278        Otherwise, set the length to the length of what was matched by the      Otherwise, set the length to the length of what was matched by the
2279        referenced subpattern. */      referenced subpattern. */
2280    
2281        if (offset >= offset_top || md->offset_vector[offset] < 0)      if (offset >= offset_top || md->offset_vector[offset] < 0)
2282          length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);        length = (md->jscript_compat)? 0 : -1;
2283        else      else
2284          length = md->offset_vector[offset+1] - md->offset_vector[offset];        length = md->offset_vector[offset+1] - md->offset_vector[offset];
2285    
2286        /* Set up for repetition, or handle the non-repeated case */      /* Set up for repetition, or handle the non-repeated case */
2287    
2288        switch (*ecode)      switch (*ecode)
2289          {        {
2290          case OP_CRSTAR:        case OP_CRSTAR:
2291          case OP_CRMINSTAR:        case OP_CRMINSTAR:
2292          case OP_CRPLUS:        case OP_CRPLUS:
2293          case OP_CRMINPLUS:        case OP_CRMINPLUS:
2294          case OP_CRQUERY:        case OP_CRQUERY:
2295          case OP_CRMINQUERY:        case OP_CRMINQUERY:
2296          c = *ecode++ - OP_CRSTAR;        c = *ecode++ - OP_CRSTAR;
2297          minimize = (c & 1) != 0;        minimize = (c & 1) != 0;
2298          min = rep_min[c];                 /* Pick up values from tables; */        min = rep_min[c];                 /* Pick up values from tables; */
2299          max = rep_max[c];                 /* zero for max => infinity */        max = rep_max[c];                 /* zero for max => infinity */
2300          if (max == 0) max = INT_MAX;        if (max == 0) max = INT_MAX;
2301          break;        break;
2302    
2303          case OP_CRRANGE:        case OP_CRRANGE:
2304          case OP_CRMINRANGE:        case OP_CRMINRANGE:
2305          minimize = (*ecode == OP_CRMINRANGE);        minimize = (*ecode == OP_CRMINRANGE);
2306          min = GET2(ecode, 1);        min = GET2(ecode, 1);
2307          max = GET2(ecode, 3);        max = GET2(ecode, 3);
2308          if (max == 0) max = INT_MAX;        if (max == 0) max = INT_MAX;
2309          ecode += 5;        ecode += 5;
2310          break;        break;
2311    
2312          default:               /* No repeat follows */        default:               /* No repeat follows */
2313          if (!match_ref(offset, eptr, length, md, ims))        if ((length = match_ref(offset, eptr, length, md, ims)) < 0)
2314            {          {
2315            CHECK_PARTIAL();          CHECK_PARTIAL();
2316            MRRETURN(MATCH_NOMATCH);          MRRETURN(MATCH_NOMATCH);
           }  
         eptr += length;  
         continue;              /* With the main loop */  
2317          }          }
2318          eptr += length;
2319          continue;              /* With the main loop */
2320          }
2321    
2322        /* If the length of the reference is zero, just continue with the      /* Handle repeated back references. If the length of the reference is
2323        main loop. */      zero, just continue with the main loop. */
2324    
2325        if (length == 0) continue;      if (length == 0) continue;
2326    
2327        /* First, ensure the minimum number of matches are present. We get back      /* First, ensure the minimum number of matches are present. We get back
2328        the length of the reference string explicitly rather than passing the      the length of the reference string explicitly rather than passing the
2329        address of eptr, so that eptr can be a register variable. */      address of eptr, so that eptr can be a register variable. */
2330    
2331        for (i = 1; i <= min; i++)      for (i = 1; i <= min; i++)
2332          {
2333          int slength;
2334          if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
2335          {          {
2336          if (!match_ref(offset, eptr, length, md, ims))          CHECK_PARTIAL();
2337            {          MRRETURN(MATCH_NOMATCH);
           CHECK_PARTIAL();  
           MRRETURN(MATCH_NOMATCH);  
           }  
         eptr += length;  
2338          }          }
2339          eptr += slength;
2340          }
2341    
2342        /* If min = max, continue at the same level without recursion.      /* If min = max, continue at the same level without recursion.
2343        They are not both allowed to be zero. */      They are not both allowed to be zero. */
2344    
2345        if (min == max) continue;      if (min == max) continue;
2346    
2347        /* If minimizing, keep trying and advancing the pointer */      /* If minimizing, keep trying and advancing the pointer */
2348    
2349        if (minimize)      if (minimize)
2350          {
2351          for (fi = min;; fi++)
2352          {          {
2353          for (fi = min;; fi++)          int slength;
2354            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2355            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2356            if (fi >= max) MRRETURN(MATCH_NOMATCH);
2357            if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
2358            {            {
2359            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);            CHECK_PARTIAL();
2360            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            MRRETURN(MATCH_NOMATCH);
           if (fi >= max) MRRETURN(MATCH_NOMATCH);  
           if (!match_ref(offset, eptr, length, md, ims))  
             {  
             CHECK_PARTIAL();  
             MRRETURN(MATCH_NOMATCH);  
             }  
           eptr += length;  
2361            }            }
2362          /* Control never gets here */          eptr += slength;
2363          }          }
2364          /* Control never gets here */
2365          }
2366    
2367        /* If maximizing, find the longest string and work backwards */      /* If maximizing, find the longest string and work backwards */
2368    
2369        else      else
2370          {
2371          pp = eptr;
2372          for (i = min; i < max; i++)
2373          {          {
2374          pp = eptr;          int slength;
2375          for (i = min; i < max; i++)          if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
           {  
           if (!match_ref(offset, eptr, length, md, ims))  
             {  
             CHECK_PARTIAL();  
             break;  
             }  
           eptr += length;  
           }  
         while (eptr >= pp)  
2376            {            {
2377            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);            CHECK_PARTIAL();
2378            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            break;
           eptr -= length;  
2379            }            }
2380          MRRETURN(MATCH_NOMATCH);          eptr += slength;
2381          }          }
2382          while (eptr >= pp)
2383            {
2384            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2385            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2386            eptr -= length;
2387            }
2388          MRRETURN(MATCH_NOMATCH);
2389        }        }
2390      /* Control never gets here */      /* Control never gets here */
2391    
# Line 5801  back the character offset. */ Line 5812  back the character offset. */
5812  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5813  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5814    {    {
5815    int tb;    int tb;
5816    if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)    if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)
5817      return (tb == length && md->partial > 1)?      return (tb == length && md->partial > 1)?
5818        PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;        PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5819    if (start_offset > 0 && start_offset < length)    if (start_offset > 0 && start_offset < length)
5820      {      {
# Line 5936  for(;;) Line 5947  for(;;)
5947    /* There are some optimizations that avoid running the match if a known    /* There are some optimizations that avoid running the match if a known
5948    starting point is not found, or if a known later character is not present.    starting point is not found, or if a known later character is not present.
5949    However, there is an option that disables these, for testing and for ensuring    However, there is an option that disables these, for testing and for ensuring
5950    that all callouts do actually occur. */    that all callouts do actually occur. The option can be set in the regex by
5951      (*NO_START_OPT) or passed in match-time options. */
5952    
5953    if ((options & PCRE_NO_START_OPTIMIZE) == 0)    if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
5954      {      {
5955      /* Advance to a unique first byte if there is one. */      /* Advance to a unique first byte if there is one. */
5956    

Legend:
Removed from v.569  
changed lines
  Added in v.595

  ViewVC Help
Powered by ViewVC 1.1.5