/[pcre]/code/trunk/pcrecpp.cc
ViewVC logotype

Diff of /code/trunk/pcrecpp.cc

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 322 by ph10, Wed Mar 5 17:14:08 2008 UTC revision 486 by ph10, Tue Jan 5 17:44:57 2010 UTC
# Line 1  Line 1 
1  // Copyright (c) 2005, Google Inc.  // Copyright (c) 2010, Google Inc.
2  // All rights reserved.  // All rights reserved.
3  //  //
4  // Redistribution and use in source and binary forms, with or without  // Redistribution and use in source and binary forms, with or without
# Line 59  Arg RE::no_arg((void*)NULL); Line 59  Arg RE::no_arg((void*)NULL);
59    
60  // This is for ABI compatibility with old versions of pcre (pre-7.6),  // This is for ABI compatibility with old versions of pcre (pre-7.6),
61  // which defined a global no_arg variable instead of putting it in the  // which defined a global no_arg variable instead of putting it in the
62  // RE class.  This works on GCC >= 3, at least.  We could probably have  // RE class.  This works on GCC >= 3, at least.  It definitely works
63  // a more inclusive test if we ever needed it.  // for ELF, but may not for other object formats (Mach-O, for
64  #if defined(__GNUC__) && __GNUC__ >= 3  // instance, does not support aliases.)  We could probably have a more
65  extern Arg no_arg __attribute__((alias("_ZN7pcrecpp2RE6no_argE")));  // inclusive test if we ever needed it.  (Note that not only the
66    // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
67    // gnu-specific.)
68    #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__)
69    # define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
70    # define ULP_AS_STRING_INTERNAL(x)   #x
71    # define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
72    extern Arg no_arg
73      __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
74  #endif  #endif
75    
76  // If a regular expression has no error, its error_ field points here  // If a regular expression has no error, its error_ field points here
# Line 323  bool RE::FindAndConsume(StringPiece* inp Line 331  bool RE::FindAndConsume(StringPiece* inp
331  bool RE::Replace(const StringPiece& rewrite,  bool RE::Replace(const StringPiece& rewrite,
332                   string *str) const {                   string *str) const {
333    int vec[kVecSize];    int vec[kVecSize];
334    int matches = TryMatch(*str, 0, UNANCHORED, vec, kVecSize);    int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
335    if (matches == 0)    if (matches == 0)
336      return false;      return false;
337    
# Line 364  static int NewlineMode(int pcre_options) Line 372  static int NewlineMode(int pcre_options)
372      else if (newline == -2)      else if (newline == -2)
373        newline_mode = PCRE_NEWLINE_ANYCRLF;        newline_mode = PCRE_NEWLINE_ANYCRLF;
374      else      else
375        assert("" == "Unexpected return value from pcre_config(NEWLINE)");        assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
376    }    }
377    return newline_mode;    return newline_mode;
378  }  }
# Line 376  int RE::GlobalReplace(const StringPiece& Line 384  int RE::GlobalReplace(const StringPiece&
384    string out;    string out;
385    int start = 0;    int start = 0;
386    int lastend = -1;    int lastend = -1;
387      bool last_match_was_empty_string = false;
388    
389    while (start <= static_cast<int>(str->length())) {    while (start <= static_cast<int>(str->length())) {
390      int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);      // If the previous match was for the empty string, we shouldn't
391      if (matches <= 0)      // just match again: we'll match in the same way and get an
392        break;      // infinite loop.  Instead, we do the match in a special way:
393      int matchstart = vec[0], matchend = vec[1];      // anchored -- to force another try at the same position --
394      assert(matchstart >= start);      // and with a flag saying that this time, ignore empty matches.
395      assert(matchend >= matchstart);      // If this special match returns, that means there's a non-empty
396      if (matchstart == matchend && matchstart == lastend) {      // match at this position as well, and we can continue.  If not,
397        // advance one character if we matched an empty string at the same      // we do what perl does, and just advance by one.
398        // place as the last match occurred      // Notice that perl prints '@@@' for this;
399        matchend = start + 1;      //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
400        // If the current char is CR and we're in CRLF mode, skip LF too.      int matches;
401        // Note it's better to call pcre_fullinfo() than to examine      if (last_match_was_empty_string) {
402        // all_options(), since options_ could have changed between        matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
403        // compile-time and now, but this is simpler and safe enough.        if (matches <= 0) {
404        // Modified by PH to add ANY and ANYCRLF.          int matchend = start + 1;     // advance one character.
405        if (start+1 < static_cast<int>(str->length()) &&          // If the current char is CR and we're in CRLF mode, skip LF too.
406            (*str)[start] == '\r' && (*str)[start+1] == '\n' &&          // Note it's better to call pcre_fullinfo() than to examine
407            (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||          // all_options(), since options_ could have changed between
408             NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||          // compile-time and now, but this is simpler and safe enough.
409             NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)          // Modified by PH to add ANY and ANYCRLF.
410            ) {          if (matchend < static_cast<int>(str->length()) &&
411          matchend++;              (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
412        }              (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
413        // We also need to advance more than one char if we're in utf8 mode.               NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
414  #ifdef SUPPORT_UTF8               NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
       if (options_.utf8()) {  
         while (matchend < static_cast<int>(str->length()) &&  
                ((*str)[matchend] & 0xc0) == 0x80)  
415            matchend++;            matchend++;
416        }          }
417            // We also need to advance more than one char if we're in utf8 mode.
418    #ifdef SUPPORT_UTF8
419            if (options_.utf8()) {
420              while (matchend < static_cast<int>(str->length()) &&
421                     ((*str)[matchend] & 0xc0) == 0x80)
422                matchend++;
423            }
424  #endif  #endif
425        if (matchend <= static_cast<int>(str->length()))          if (start < static_cast<int>(str->length()))
426          out.append(*str, start, matchend - start);            out.append(*str, start, matchend - start);
427        start = matchend;          start = matchend;
428            last_match_was_empty_string = false;
429            continue;
430          }
431      } else {      } else {
432        out.append(*str, start, matchstart - start);        matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
433        Rewrite(&out, rewrite, *str, vec, matches);        if (matches <= 0)
434        start = matchend;          break;
       lastend = matchend;  
       count++;  
435      }      }
436        int matchstart = vec[0], matchend = vec[1];
437        assert(matchstart >= start);
438        assert(matchend >= matchstart);
439        out.append(*str, start, matchstart - start);
440        Rewrite(&out, rewrite, *str, vec, matches);
441        start = matchend;
442        lastend = matchend;
443        count++;
444        last_match_was_empty_string = (matchstart == matchend);
445    }    }
446    
447    if (count == 0)    if (count == 0)
# Line 434  bool RE::Extract(const StringPiece& rewr Line 457  bool RE::Extract(const StringPiece& rewr
457                   const StringPiece& text,                   const StringPiece& text,
458                   string *out) const {                   string *out) const {
459    int vec[kVecSize];    int vec[kVecSize];
460    int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize);    int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
461    if (matches == 0)    if (matches == 0)
462      return false;      return false;
463    out->erase();    out->erase();
# Line 449  bool RE::Extract(const StringPiece& rewr Line 472  bool RE::Extract(const StringPiece& rewr
472    // Note that it's legal to escape a character even if it has no    // Note that it's legal to escape a character even if it has no
473    // special meaning in a regular expression -- so this function does    // special meaning in a regular expression -- so this function does
474    // that.  (This also makes it identical to the perl function of the    // that.  (This also makes it identical to the perl function of the
475    // same name; see `perldoc -f quotemeta`.)    // same name; see `perldoc -f quotemeta`.)  The one exception is
476      // escaping NUL: rather than doing backslash + NUL, like perl does,
477      // we do '\0', because pcre itself doesn't take embedded NUL chars.
478    for (int ii = 0; ii < unquoted.size(); ++ii) {    for (int ii = 0; ii < unquoted.size(); ++ii) {
479      // Note that using 'isalnum' here raises the benchmark time from      // Note that using 'isalnum' here raises the benchmark time from
480      // 32ns to 58ns:      // 32ns to 58ns:
481      if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&      if (unquoted[ii] == '\0') {
482          (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&        result += "\\0";
483          (unquoted[ii] < '0' || unquoted[ii] > '9') &&      } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
484          unquoted[ii] != '_' &&                 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
485          // If this is the part of a UTF8 or Latin1 character, we need                 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
486          // to copy this byte without escaping.  Experimentally this is                 unquoted[ii] != '_' &&
487          // what works correctly with the regexp library.                 // If this is the part of a UTF8 or Latin1 character, we need
488          !(unquoted[ii] & 128)) {                 // to copy this byte without escaping.  Experimentally this is
489                   // what works correctly with the regexp library.
490                   !(unquoted[ii] & 128)) {
491        result += '\\';        result += '\\';
492          result += unquoted[ii];
493        } else {
494          result += unquoted[ii];
495      }      }
     result += unquoted[ii];  
496    }    }
497    
498    return result;    return result;
# Line 474  bool RE::Extract(const StringPiece& rewr Line 503  bool RE::Extract(const StringPiece& rewr
503  int RE::TryMatch(const StringPiece& text,  int RE::TryMatch(const StringPiece& text,
504                   int startpos,                   int startpos,
505                   Anchor anchor,                   Anchor anchor,
506                     bool empty_ok,
507                   int *vec,                   int *vec,
508                   int vecsize) const {                   int vecsize) const {
509    pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;    pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
# Line 491  int RE::TryMatch(const StringPiece& text Line 521  int RE::TryMatch(const StringPiece& text
521      extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;      extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
522      extra.match_limit_recursion = options_.match_limit_recursion();      extra.match_limit_recursion = options_.match_limit_recursion();
523    }    }
524    
525      int options = 0;
526      if (anchor != UNANCHORED)
527        options |= PCRE_ANCHORED;
528      if (!empty_ok)
529        options |= PCRE_NOTEMPTY;
530    
531    int rc = pcre_exec(re,              // The regular expression object    int rc = pcre_exec(re,              // The regular expression object
532                       &extra,                       &extra,
533                       (text.data() == NULL) ? "" : text.data(),                       (text.data() == NULL) ? "" : text.data(),
534                       text.size(),                       text.size(),
535                       startpos,                       startpos,
536                       (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED,                       options,
537                       vec,                       vec,
538                       vecsize);                       vecsize);
539    
# Line 526  bool RE::DoMatchImpl(const StringPiece& Line 563  bool RE::DoMatchImpl(const StringPiece&
563                       int* vec,                       int* vec,
564                       int vecsize) const {                       int vecsize) const {
565    assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace    assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
566    int matches = TryMatch(text, 0, anchor, vec, vecsize);    int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
567    assert(matches >= 0);  // TryMatch never returns negatives    assert(matches >= 0);  // TryMatch never returns negatives
568    if (matches == 0)    if (matches == 0)
569      return false;      return false;
# Line 591  bool RE::Rewrite(string *out, const Stri Line 628  bool RE::Rewrite(string *out, const Stri
628          if (start >= 0)          if (start >= 0)
629            out->append(text.data() + start, vec[2 * n + 1] - start);            out->append(text.data() + start, vec[2 * n + 1] - start);
630        } else if (c == '\\') {        } else if (c == '\\') {
631          out->push_back('\\');          *out += '\\';
632        } else {        } else {
633          //fprintf(stderr, "invalid rewrite pattern: %.*s\n",          //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
634          //        rewrite.size(), rewrite.data());          //        rewrite.size(), rewrite.data());
635          return false;          return false;
636        }        }
637      } else {      } else {
638        out->push_back(c);        *out += c;
639      }      }
640    }    }
641    return true;    return true;
# Line 784  bool Arg::parse_longlong_radix(const cha Line 821  bool Arg::parse_longlong_radix(const cha
821    long long r = strtoll(str, &end, radix);    long long r = strtoll(str, &end, radix);
822  #elif defined HAVE__STRTOI64  #elif defined HAVE__STRTOI64
823    long long r = _strtoi64(str, &end, radix);    long long r = _strtoi64(str, &end, radix);
824    #elif defined HAVE_STRTOIMAX
825      long long r = strtoimax(str, &end, radix);
826  #else  #else
827  #error parse_longlong_radix: cannot convert input to a long-long  #error parse_longlong_radix: cannot convert input to a long-long
828  #endif  #endif
# Line 814  bool Arg::parse_ulonglong_radix(const ch Line 853  bool Arg::parse_ulonglong_radix(const ch
853    unsigned long long r = strtoull(str, &end, radix);    unsigned long long r = strtoull(str, &end, radix);
854  #elif defined HAVE__STRTOI64  #elif defined HAVE__STRTOI64
855    unsigned long long r = _strtoui64(str, &end, radix);    unsigned long long r = _strtoui64(str, &end, radix);
856    #elif defined HAVE_STRTOIMAX
857      unsigned long long r = strtoumax(str, &end, radix);
858  #else  #else
859  #error parse_ulonglong_radix: cannot convert input to a long-long  #error parse_ulonglong_radix: cannot convert input to a long-long
860  #endif  #endif

Legend:
Removed from v.322  
changed lines
  Added in v.486

  ViewVC Help
Powered by ViewVC 1.1.5