/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 579 by ph10, Wed Nov 24 17:39:25 2010 UTC | revision 600 by ph10, Mon May 9 08:54:11 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2011 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 132  while (length-- > 0) Line 132  while (length-- > 0)
132  *          Match a back-reference                *  *          Match a back-reference                *
133  *************************************************/  *************************************************/
134    
135  /* If a back reference hasn't been set, the length that is passed is greater  /* Normally, if a back reference hasn't been set, the length that is passed is
136  than the number of characters left in the string, so the match fails.  negative, so the match always fails. However, in JavaScript compatibility mode,
137    the length passed is zero. Note that in caseless UTF-8 mode, the number of
138    subject bytes matched may be different to the number of reference bytes.
139    
140  Arguments:  Arguments:
141    offset      index into the offset vector    offset      index into the offset vector
142    eptr        points into the subject    eptr        pointer into the subject
143    length      length to be matched    length      length of reference to be matched (number of bytes)
144    md          points to match data block    md          points to match data block
145    ims         the ims flags    ims         the ims flags
146    
147  Returns:      TRUE if matched  Returns:      < 0 if not matched, otherwise the number of subject bytes matched
148  */  */
149    
150  static BOOL  static int
151  match_ref(int offset, register USPTR eptr, int length, match_data *md,  match_ref(int offset, register USPTR eptr, int length, match_data *md,
152    unsigned long int ims)    unsigned long int ims)
153  {  {
154  USPTR p = md->start_subject + md->offset_vector[offset];  USPTR eptr_start = eptr;
155    register USPTR p = md->start_subject + md->offset_vector[offset];
156    
157  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
158  if (eptr >= md->end_subject)  if (eptr >= md->end_subject)
# Line 164  pchars(p, length, FALSE, md); Line 167  pchars(p, length, FALSE, md);
167  printf("\n");  printf("\n");
168  #endif  #endif
169    
170  /* Always fail if not enough characters left */  /* Always fail if reference not set (and not JavaScript compatible). */
171    
172  if (length > md->end_subject - eptr) return FALSE;  if (length < 0) return -1;
173    
174  /* Separate the caseless case for speed. In UTF-8 mode we can only do this  /* Separate the caseless case for speed. In UTF-8 mode we can only do this
175  properly if Unicode properties are supported. Otherwise, we can check only  properly if Unicode properties are supported. Otherwise, we can check only
# Line 178  if ((ims & PCRE_CASELESS) != 0) Line 181  if ((ims & PCRE_CASELESS) != 0)
181  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
182    if (md->utf8)    if (md->utf8)
183      {      {
184      USPTR endptr = eptr + length;      /* Match characters up to the end of the reference. NOTE: the number of
185      while (eptr < endptr)      bytes matched may differ, because there are some characters whose upper and
186        lower case versions code as different numbers of bytes. For example, U+023A
187        (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
188        a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
189        the latter. It is important, therefore, to check the length along the
190        reference, not along the subject (earlier code did this wrong). */
191    
192        USPTR endptr = p + length;
193        while (p < endptr)
194        {        {
195        int c, d;        int c, d;
196          if (eptr >= md->end_subject) return -1;
197        GETCHARINC(c, eptr);        GETCHARINC(c, eptr);
198        GETCHARINC(d, p);        GETCHARINC(d, p);
199        if (c != d && c != UCD_OTHERCASE(d)) return FALSE;        if (c != d && c != UCD_OTHERCASE(d)) return -1;
200        }        }
201      }      }
202    else    else
# Line 193  if ((ims & PCRE_CASELESS) != 0) Line 205  if ((ims & PCRE_CASELESS) != 0)
205    
206    /* The same code works when not in UTF-8 mode and in UTF-8 mode when there    /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
207    is no UCP support. */    is no UCP support. */
208        {
209    while (length-- > 0)      if (eptr + length > md->end_subject) return -1;
210      { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }      while (length-- > 0)
211          { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
212        }
213    }    }
214    
215  /* In the caseful case, we can just compare the bytes, whether or not we  /* In the caseful case, we can just compare the bytes, whether or not we
216  are in UTF-8 mode. */  are in UTF-8 mode. */
217    
218  else  else
219    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    {
220      if (eptr + length > md->end_subject) return -1;
221      while (length-- > 0) if (*p++ != *eptr++) return -1;
222      }
223    
224  return TRUE;  return eptr - eptr_start;
225  }  }
226    
227    
# Line 2000  for (;;) Line 2017  for (;;)
2017      switch(c)      switch(c)
2018        {        {
2019        default: MRRETURN(MATCH_NOMATCH);        default: MRRETURN(MATCH_NOMATCH);
2020    
2021        case 0x000d:        case 0x000d:
2022        if (eptr < md->end_subject && *eptr == 0x0a) eptr++;        if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2023        break;        break;
# Line 2252  for (;;) Line 2270  for (;;)
2270      loops). */      loops). */
2271    
2272      case OP_REF:      case OP_REF:
2273        {      offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
2274        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */      ecode += 3;
       ecode += 3;  
2275    
2276        /* If the reference is unset, there are two possibilities:      /* If the reference is unset, there are two possibilities:
2277    
2278        (a) In the default, Perl-compatible state, set the length to be longer      (a) In the default, Perl-compatible state, set the length negative;
2279        than the amount of subject left; this ensures that every attempt at a      this ensures that every attempt at a match fails. We can't just fail
2280        match fails. We can't just fail here, because of the possibility of      here, because of the possibility of quantifiers with zero minima.
       quantifiers with zero minima.  
2281    
2282        (b) If the JavaScript compatibility flag is set, set the length to zero      (b) If the JavaScript compatibility flag is set, set the length to zero
2283        so that the back reference matches an empty string.      so that the back reference matches an empty string.
2284    
2285        Otherwise, set the length to the length of what was matched by the      Otherwise, set the length to the length of what was matched by the
2286        referenced subpattern. */      referenced subpattern. */
2287    
2288        if (offset >= offset_top || md->offset_vector[offset] < 0)      if (offset >= offset_top || md->offset_vector[offset] < 0)
2289          length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);        length = (md->jscript_compat)? 0 : -1;
2290        else      else
2291          length = md->offset_vector[offset+1] - md->offset_vector[offset];        length = md->offset_vector[offset+1] - md->offset_vector[offset];
2292    
2293        /* Set up for repetition, or handle the non-repeated case */      /* Set up for repetition, or handle the non-repeated case */
2294    
2295        switch (*ecode)      switch (*ecode)
2296          {        {
2297          case OP_CRSTAR:        case OP_CRSTAR:
2298          case OP_CRMINSTAR:        case OP_CRMINSTAR:
2299          case OP_CRPLUS:        case OP_CRPLUS:
2300          case OP_CRMINPLUS:        case OP_CRMINPLUS:
2301          case OP_CRQUERY:        case OP_CRQUERY:
2302          case OP_CRMINQUERY:        case OP_CRMINQUERY:
2303          c = *ecode++ - OP_CRSTAR;        c = *ecode++ - OP_CRSTAR;
2304          minimize = (c & 1) != 0;        minimize = (c & 1) != 0;
2305          min = rep_min[c];                 /* Pick up values from tables; */        min = rep_min[c];                 /* Pick up values from tables; */
2306          max = rep_max[c];                 /* zero for max => infinity */        max = rep_max[c];                 /* zero for max => infinity */
2307          if (max == 0) max = INT_MAX;        if (max == 0) max = INT_MAX;
2308          break;        break;
2309    
2310          case OP_CRRANGE:        case OP_CRRANGE:
2311          case OP_CRMINRANGE:        case OP_CRMINRANGE:
2312          minimize = (*ecode == OP_CRMINRANGE);        minimize = (*ecode == OP_CRMINRANGE);
2313          min = GET2(ecode, 1);        min = GET2(ecode, 1);
2314          max = GET2(ecode, 3);        max = GET2(ecode, 3);
2315          if (max == 0) max = INT_MAX;        if (max == 0) max = INT_MAX;
2316          ecode += 5;        ecode += 5;
2317          break;        break;
2318    
2319          default:               /* No repeat follows */        default:               /* No repeat follows */
2320          if (!match_ref(offset, eptr, length, md, ims))        if ((length = match_ref(offset, eptr, length, md, ims)) < 0)
2321            {          {
2322            CHECK_PARTIAL();          CHECK_PARTIAL();
2323            MRRETURN(MATCH_NOMATCH);          MRRETURN(MATCH_NOMATCH);
           }  
         eptr += length;  
         continue;              /* With the main loop */  
2324          }          }
2325          eptr += length;
2326          continue;              /* With the main loop */
2327          }
2328    
2329        /* If the length of the reference is zero, just continue with the      /* Handle repeated back references. If the length of the reference is
2330        main loop. */      zero, just continue with the main loop. */
2331    
2332        if (length == 0) continue;      if (length == 0) continue;
2333    
2334        /* First, ensure the minimum number of matches are present. We get back      /* First, ensure the minimum number of matches are present. We get back
2335        the length of the reference string explicitly rather than passing the      the length of the reference string explicitly rather than passing the
2336        address of eptr, so that eptr can be a register variable. */      address of eptr, so that eptr can be a register variable. */
2337    
2338        for (i = 1; i <= min; i++)      for (i = 1; i <= min; i++)
2339          {
2340          int slength;
2341          if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
2342          {          {
2343          if (!match_ref(offset, eptr, length, md, ims))          CHECK_PARTIAL();
2344            {          MRRETURN(MATCH_NOMATCH);
           CHECK_PARTIAL();  
           MRRETURN(MATCH_NOMATCH);  
           }  
         eptr += length;  
2345          }          }
2346          eptr += slength;
2347          }
2348    
2349        /* If min = max, continue at the same level without recursion.      /* If min = max, continue at the same level without recursion.
2350        They are not both allowed to be zero. */      They are not both allowed to be zero. */
2351    
2352        if (min == max) continue;      if (min == max) continue;
2353    
2354        /* If minimizing, keep trying and advancing the pointer */      /* If minimizing, keep trying and advancing the pointer */
2355    
2356        if (minimize)      if (minimize)
2357          {
2358          for (fi = min;; fi++)
2359          {          {
2360          for (fi = min;; fi++)          int slength;
2361            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2362            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2363            if (fi >= max) MRRETURN(MATCH_NOMATCH);
2364            if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
2365            {            {
2366            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);            CHECK_PARTIAL();
2367            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            MRRETURN(MATCH_NOMATCH);
           if (fi >= max) MRRETURN(MATCH_NOMATCH);  
           if (!match_ref(offset, eptr, length, md, ims))  
             {  
             CHECK_PARTIAL();  
             MRRETURN(MATCH_NOMATCH);  
             }  
           eptr += length;  
2368            }            }
2369          /* Control never gets here */          eptr += slength;
2370          }          }
2371          /* Control never gets here */
2372          }
2373    
2374        /* If maximizing, find the longest string and work backwards */      /* If maximizing, find the longest string and work backwards */
2375    
2376        else      else
2377          {
2378          pp = eptr;
2379          for (i = min; i < max; i++)
2380          {          {
2381          pp = eptr;          int slength;
2382          for (i = min; i < max; i++)          if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
           {  
           if (!match_ref(offset, eptr, length, md, ims))  
             {  
             CHECK_PARTIAL();  
             break;  
             }  
           eptr += length;  
           }  
         while (eptr >= pp)  
2383            {            {
2384            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);            CHECK_PARTIAL();
2385            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            break;
           eptr -= length;  
2386            }            }
2387          MRRETURN(MATCH_NOMATCH);          eptr += slength;
2388            }
2389          while (eptr >= pp)
2390            {
2391            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2392            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2393            eptr -= length;
2394          }          }
2395          MRRETURN(MATCH_NOMATCH);
2396        }        }
2397      /* Control never gets here */      /* Control never gets here */
2398    
# Line 3774  for (;;) Line 3792  for (;;)
3792            switch(c)            switch(c)
3793              {              {
3794              default: MRRETURN(MATCH_NOMATCH);              default: MRRETURN(MATCH_NOMATCH);
3795    
3796              case 0x000d:              case 0x000d:
3797              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3798              break;              break;
# Line 4050  for (;;) Line 4069  for (;;)
4069            switch(*eptr++)            switch(*eptr++)
4070              {              {
4071              default: MRRETURN(MATCH_NOMATCH);              default: MRRETURN(MATCH_NOMATCH);
4072    
4073              case 0x000d:              case 0x000d:
4074              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4075              break;              break;
4076    
4077              case 0x000a:              case 0x000a:
4078              break;              break;
4079    
# Line 5241  for (;;) Line 5262  for (;;)
5262            RRETURN(PCRE_ERROR_INTERNAL);            RRETURN(PCRE_ERROR_INTERNAL);
5263            }            }
5264    
5265          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run. If possessive, we are
5266            done (no backing up). Otherwise, match at this position; anything other
5267            than no match is immediately returned. For nomatch, back up one
5268            character, unless we are matching \R and the last thing matched was
5269            \r\n, in which case, back up two bytes. */
5270    
5271          if (possessive) continue;          if (possessive) continue;
5272          for(;;)          for(;;)
# Line 5250  for (;;) Line 5275  for (;;)
5275            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5276            if (eptr-- == pp) break;        /* Stop if tried at original pos */            if (eptr-- == pp) break;        /* Stop if tried at original pos */
5277            BACKCHAR(eptr);            BACKCHAR(eptr);
5278              if (ctype == OP_ANYNL && eptr > pp  && *eptr == '\n' &&
5279                  eptr[-1] == '\r') eptr--;
5280            }            }
5281          }          }
5282        else        else
# Line 5448  for (;;) Line 5475  for (;;)
5475            RRETURN(PCRE_ERROR_INTERNAL);            RRETURN(PCRE_ERROR_INTERNAL);
5476            }            }
5477    
5478          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run. If possessive, we are
5479            done (no backing up). Otherwise, match at this position; anything other
5480            than no match is immediately returned. For nomatch, back up one
5481            character (byte), unless we are matching \R and the last thing matched
5482            was \r\n, in which case, back up two bytes. */
5483    
5484          if (possessive) continue;          if (possessive) continue;
5485          while (eptr >= pp)          while (eptr >= pp)
5486            {            {
5487            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
           eptr--;  
5488            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5489              eptr--;
5490              if (ctype == OP_ANYNL && eptr > pp  && *eptr == '\n' &&
5491                  eptr[-1] == '\r') eptr--;
5492            }            }
5493          }          }
5494    
# Line 5795  defined (though never set). So there's n Line 5828  defined (though never set). So there's n
5828  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5829    return PCRE_ERROR_BADPARTIAL;    return PCRE_ERROR_BADPARTIAL;
5830    
5831  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Pass back the character offset and error
5832  back the character offset. */  code if a results vector is available. */
5833    
5834  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5835  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5836    {    {
5837    int tb;    int errorcode;
5838    if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)    int tb = _pcre_valid_utf8((USPTR)subject, length, &errorcode);
5839      return (tb == length && md->partial > 1)?    if (tb >= 0)
5840        {
5841        if (offsetcount >= 2)
5842          {
5843          offsets[0] = tb;
5844          offsets[1] = errorcode;
5845          }
5846        return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5847        PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;        PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5848        }
5849    if (start_offset > 0 && start_offset < length)    if (start_offset > 0 && start_offset < length)
5850      {      {
5851      tb = ((USPTR)subject)[start_offset] & 0xc0;      tb = ((USPTR)subject)[start_offset] & 0xc0;

Legend:
Removed from v.579
Changed lines
Added in v.600

  ViewVC Help
Powered by ViewVC 1.1.5