/[pcre]/code/trunk/pcrecpp.cc
ViewVC logotype

Diff of /code/trunk/pcrecpp.cc

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 322 by ph10, Wed Mar 5 17:14:08 2008 UTC
# Line 29  Line 29 
29  //  //
30  // Author: Sanjay Ghemawat  // Author: Sanjay Ghemawat
31    
32    #ifdef HAVE_CONFIG_H
33    #include "config.h"
34    #endif
35    
36  #include <stdlib.h>  #include <stdlib.h>
37  #include <stdio.h>  #include <stdio.h>
38  #include <ctype.h>  #include <ctype.h>
# Line 37  Line 41 
41  #include <errno.h>  #include <errno.h>
42  #include <string>  #include <string>
43  #include <algorithm>  #include <algorithm>
44  #include "config.h"  
45  // We need this to compile the proper dll on windows/msys.  This is copied  #include "pcrecpp_internal.h"
 // from pcre_internal.h.  It would probably be better just to include that.  
 #define PCRE_DEFINITION  /* Win32 __declspec(export) trigger for .dll */  
46  #include "pcre.h"  #include "pcre.h"
 #include "pcre_stringpiece.h"  
47  #include "pcrecpp.h"  #include "pcrecpp.h"
48    #include "pcre_stringpiece.h"
49    
50    
51  namespace pcrecpp {  namespace pcrecpp {
# Line 53  static const int kMaxArgs = 16; Line 55  static const int kMaxArgs = 16;
55  static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace  static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
56    
57  // Special object that stands-in for no argument  // Special object that stands-in for no argument
58  Arg no_arg((void*)NULL);  Arg RE::no_arg((void*)NULL);
59    
60    // This is for ABI compatibility with old versions of pcre (pre-7.6),
61    // which defined a global no_arg variable instead of putting it in the
62    // RE class.  This works on GCC >= 3, at least.  We could probably have
63    // a more inclusive test if we ever needed it.
64    #if defined(__GNUC__) && __GNUC__ >= 3
65    extern Arg no_arg __attribute__((alias("_ZN7pcrecpp2RE6no_argE")));
66    #endif
67    
68  // If a regular expression has no error, its error_ field points here  // If a regular expression has no error, its error_ field points here
69  static const string empty_string;  static const string empty_string;
# Line 61  static const string empty_string; Line 71  static const string empty_string;
71  // If the user doesn't ask for any options, we just use this one  // If the user doesn't ask for any options, we just use this one
72  static RE_Options default_options;  static RE_Options default_options;
73    
74  void RE::Init(const char* pat, const RE_Options* options) {  void RE::Init(const string& pat, const RE_Options* options) {
75    pattern_ = pat;    pattern_ = pat;
76    if (options == NULL) {    if (options == NULL) {
77      options_ = default_options;      options_ = default_options;
# Line 74  void RE::Init(const char* pat, const RE_ Line 84  void RE::Init(const char* pat, const RE_
84    
85    re_partial_ = Compile(UNANCHORED);    re_partial_ = Compile(UNANCHORED);
86    if (re_partial_ != NULL) {    if (re_partial_ != NULL) {
87      // Check for complicated patterns.  The following change is      re_full_ = Compile(ANCHOR_BOTH);
     // conservative in that it may treat some "simple" patterns  
     // as "complex" (e.g., if the vertical bar is in a character  
     // class or is escaped).  But it seems good enough.  
     if (strchr(pat, '|') == NULL) {  
       // Simple pattern: we can use position-based checks to perform  
       // fully anchored matches  
       re_full_ = re_partial_;  
     } else {  
       // We need a special pattern for anchored matches  
       re_full_ = Compile(ANCHOR_BOTH);  
     }  
88    }    }
89  }  }
90    
91    void RE::Cleanup() {
92      if (re_full_ != NULL)         (*pcre_free)(re_full_);
93      if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
94      if (error_ != &empty_string)  delete error_;
95    }
96    
97    
98  RE::~RE() {  RE::~RE() {
99    if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_);    Cleanup();
   if (re_partial_ != NULL)                         (*pcre_free)(re_partial_);  
   if (error_ != &empty_string)                     delete error_;  
100  }  }
101    
102    
103  pcre* RE::Compile(Anchor anchor) {  pcre* RE::Compile(Anchor anchor) {
104    // First, convert RE_Options into pcre options    // First, convert RE_Options into pcre options
105    int pcre_options = 0;    int pcre_options = 0;
# Line 332  bool RE::Replace(const StringPiece& rewr Line 337  bool RE::Replace(const StringPiece& rewr
337    return true;    return true;
338  }  }
339    
340    // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
341    // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
342    // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
343    
344    static int NewlineMode(int pcre_options) {
345      // TODO: if we can make it threadsafe, cache this var
346      int newline_mode = 0;
347      /* if (newline_mode) return newline_mode; */  // do this once it's cached
348      if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
349                          PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
350        newline_mode = (pcre_options &
351                        (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
352                         PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
353      } else {
354        int newline;
355        pcre_config(PCRE_CONFIG_NEWLINE, &newline);
356        if (newline == 10)
357          newline_mode = PCRE_NEWLINE_LF;
358        else if (newline == 13)
359          newline_mode = PCRE_NEWLINE_CR;
360        else if (newline == 3338)
361          newline_mode = PCRE_NEWLINE_CRLF;
362        else if (newline == -1)
363          newline_mode = PCRE_NEWLINE_ANY;
364        else if (newline == -2)
365          newline_mode = PCRE_NEWLINE_ANYCRLF;
366        else
367          assert("" == "Unexpected return value from pcre_config(NEWLINE)");
368      }
369      return newline_mode;
370    }
371    
372  int RE::GlobalReplace(const StringPiece& rewrite,  int RE::GlobalReplace(const StringPiece& rewrite,
373                        string *str) const {                        string *str) const {
374    int count = 0;    int count = 0;
# Line 340  int RE::GlobalReplace(const StringPiece& Line 377  int RE::GlobalReplace(const StringPiece&
377    int start = 0;    int start = 0;
378    int lastend = -1;    int lastend = -1;
379    
380    for (; start <= static_cast<int>(str->length()); count++) {    while (start <= static_cast<int>(str->length())) {
381      int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);      int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);
382      if (matches <= 0)      if (matches <= 0)
383        break;        break;
# Line 350  int RE::GlobalReplace(const StringPiece& Line 387  int RE::GlobalReplace(const StringPiece&
387      if (matchstart == matchend && matchstart == lastend) {      if (matchstart == matchend && matchstart == lastend) {
388        // advance one character if we matched an empty string at the same        // advance one character if we matched an empty string at the same
389        // place as the last match occurred        // place as the last match occurred
390        if (start < static_cast<int>(str->length()))        matchend = start + 1;
391          out.push_back((*str)[start]);        // If the current char is CR and we're in CRLF mode, skip LF too.
392        start++;        // Note it's better to call pcre_fullinfo() than to examine
393          // all_options(), since options_ could have changed between
394          // compile-time and now, but this is simpler and safe enough.
395          // Modified by PH to add ANY and ANYCRLF.
396          if (start+1 < static_cast<int>(str->length()) &&
397              (*str)[start] == '\r' && (*str)[start+1] == '\n' &&
398              (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
399               NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
400               NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)
401              ) {
402            matchend++;
403          }
404          // We also need to advance more than one char if we're in utf8 mode.
405    #ifdef SUPPORT_UTF8
406          if (options_.utf8()) {
407            while (matchend < static_cast<int>(str->length()) &&
408                   ((*str)[matchend] & 0xc0) == 0x80)
409              matchend++;
410          }
411    #endif
412          if (matchend <= static_cast<int>(str->length()))
413            out.append(*str, start, matchend - start);
414          start = matchend;
415      } else {      } else {
416        out.append(*str, start, matchstart - start);        out.append(*str, start, matchstart - start);
417        Rewrite(&out, rewrite, *str, vec, matches);        Rewrite(&out, rewrite, *str, vec, matches);
# Line 382  bool RE::Extract(const StringPiece& rewr Line 441  bool RE::Extract(const StringPiece& rewr
441    return Rewrite(out, rewrite, text, vec, matches);    return Rewrite(out, rewrite, text, vec, matches);
442  }  }
443    
444    /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
445      string result;
446    
447      // Escape any ascii character not in [A-Za-z_0-9].
448      //
449      // Note that it's legal to escape a character even if it has no
450      // special meaning in a regular expression -- so this function does
451      // that.  (This also makes it identical to the perl function of the
452      // same name; see `perldoc -f quotemeta`.)
453      for (int ii = 0; ii < unquoted.size(); ++ii) {
454        // Note that using 'isalnum' here raises the benchmark time from
455        // 32ns to 58ns:
456        if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
457            (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
458            (unquoted[ii] < '0' || unquoted[ii] > '9') &&
459            unquoted[ii] != '_' &&
460            // If this is part of a UTF8 or Latin1 character, we need
461            // to copy this byte without escaping.  Experimentally this is
462            // what works correctly with the regexp library.
463            !(unquoted[ii] & 128)) {
464          result += '\\';
465        }
466        result += unquoted[ii];
467      }
468    
469      return result;
470    }
471    
472  /***** Actual matching and rewriting code *****/  /***** Actual matching and rewriting code *****/
473    
474  int RE::TryMatch(const StringPiece& text,  int RE::TryMatch(const StringPiece& text,
# Line 395  int RE::TryMatch(const StringPiece& text Line 482  int RE::TryMatch(const StringPiece& text
482      return 0;      return 0;
483    }    }
484    
485    pcre_extra extra = { 0 };    pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
486    if (options_.match_limit() > 0) {    if (options_.match_limit() > 0) {
487      extra.flags |= PCRE_EXTRA_MATCH_LIMIT;      extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
488      extra.match_limit = options_.match_limit();      extra.match_limit = options_.match_limit();
# Line 428  int RE::TryMatch(const StringPiece& text Line 515  int RE::TryMatch(const StringPiece& text
515      rc = vecsize / 2;      rc = vecsize / 2;
516    }    }
517    
   if ((anchor == ANCHOR_BOTH) && (re_full_ == re_partial_)) {  
     // We need an extra check to make sure that the match extended  
     // to the end of the input string  
     assert(vec[0] == 0);                 // PCRE_ANCHORED forces starting match  
     if (vec[1] != text.size()) return 0; // Did not get ending match  
   }  
   
518    return rc;    return rc;
519  }  }
520    
# Line 546  bool Arg::parse_null(const char* str, in Line 626  bool Arg::parse_null(const char* str, in
626  }  }
627    
628  bool Arg::parse_string(const char* str, int n, void* dest) {  bool Arg::parse_string(const char* str, int n, void* dest) {
629      if (dest == NULL) return true;
630    reinterpret_cast<string*>(dest)->assign(str, n);    reinterpret_cast<string*>(dest)->assign(str, n);
631    return true;    return true;
632  }  }
633    
634  bool Arg::parse_stringpiece(const char* str, int n, void* dest) {  bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
635      if (dest == NULL) return true;
636    reinterpret_cast<StringPiece*>(dest)->set(str, n);    reinterpret_cast<StringPiece*>(dest)->set(str, n);
637    return true;    return true;
638  }  }
639    
640  bool Arg::parse_char(const char* str, int n, void* dest) {  bool Arg::parse_char(const char* str, int n, void* dest) {
641    if (n != 1) return false;    if (n != 1) return false;
642      if (dest == NULL) return true;
643    *(reinterpret_cast<char*>(dest)) = str[0];    *(reinterpret_cast<char*>(dest)) = str[0];
644    return true;    return true;
645  }  }
646    
647  bool Arg::parse_uchar(const char* str, int n, void* dest) {  bool Arg::parse_uchar(const char* str, int n, void* dest) {
648    if (n != 1) return false;    if (n != 1) return false;
649      if (dest == NULL) return true;
650    *(reinterpret_cast<unsigned char*>(dest)) = str[0];    *(reinterpret_cast<unsigned char*>(dest)) = str[0];
651    return true;    return true;
652  }  }
# Line 611  bool Arg::parse_long_radix(const char* s Line 695  bool Arg::parse_long_radix(const char* s
695    long r = strtol(str, &end, radix);    long r = strtol(str, &end, radix);
696    if (end != str + n) return false;   // Leftover junk    if (end != str + n) return false;   // Leftover junk
697    if (errno) return false;    if (errno) return false;
698      if (dest == NULL) return true;
699    *(reinterpret_cast<long*>(dest)) = r;    *(reinterpret_cast<long*>(dest)) = r;
700    return true;    return true;
701  }  }
# Line 628  bool Arg::parse_ulong_radix(const char* Line 713  bool Arg::parse_ulong_radix(const char*
713    unsigned long r = strtoul(str, &end, radix);    unsigned long r = strtoul(str, &end, radix);
714    if (end != str + n) return false;   // Leftover junk    if (end != str + n) return false;   // Leftover junk
715    if (errno) return false;    if (errno) return false;
716      if (dest == NULL) return true;
717    *(reinterpret_cast<unsigned long*>(dest)) = r;    *(reinterpret_cast<unsigned long*>(dest)) = r;
718    return true;    return true;
719  }  }
# Line 639  bool Arg::parse_short_radix(const char* Line 725  bool Arg::parse_short_radix(const char*
725    long r;    long r;
726    if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse    if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
727    if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range    if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
728    *(reinterpret_cast<short*>(dest)) = r;    if (dest == NULL) return true;
729      *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
730    return true;    return true;
731  }  }
732    
# Line 650  bool Arg::parse_ushort_radix(const char* Line 737  bool Arg::parse_ushort_radix(const char*
737    unsigned long r;    unsigned long r;
738    if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse    if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
739    if (r > USHRT_MAX) return false;                      // Out of range    if (r > USHRT_MAX) return false;                      // Out of range
740    *(reinterpret_cast<unsigned short*>(dest)) = r;    if (dest == NULL) return true;
741      *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
742    return true;    return true;
743  }  }
744    
# Line 661  bool Arg::parse_int_radix(const char* st Line 749  bool Arg::parse_int_radix(const char* st
749    long r;    long r;
750    if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse    if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
751    if (r < INT_MIN || r > INT_MAX) return false;         // Out of range    if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
752      if (dest == NULL) return true;
753    *(reinterpret_cast<int*>(dest)) = r;    *(reinterpret_cast<int*>(dest)) = r;
754    return true;    return true;
755  }  }
# Line 672  bool Arg::parse_uint_radix(const char* s Line 761  bool Arg::parse_uint_radix(const char* s
761    unsigned long r;    unsigned long r;
762    if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse    if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
763    if (r > UINT_MAX) return false;                       // Out of range    if (r > UINT_MAX) return false;                       // Out of range
764      if (dest == NULL) return true;
765    *(reinterpret_cast<unsigned int*>(dest)) = r;    *(reinterpret_cast<unsigned int*>(dest)) = r;
766    return true;    return true;
767  }  }
# Line 692  bool Arg::parse_longlong_radix(const cha Line 782  bool Arg::parse_longlong_radix(const cha
782    long long r = strtoq(str, &end, radix);    long long r = strtoq(str, &end, radix);
783  #elif defined HAVE_STRTOLL  #elif defined HAVE_STRTOLL
784    long long r = strtoll(str, &end, radix);    long long r = strtoll(str, &end, radix);
785    #elif defined HAVE__STRTOI64
786      long long r = _strtoi64(str, &end, radix);
787  #else  #else
788  #error parse_longlong_radix: cannot convert input to a long-long  #error parse_longlong_radix: cannot convert input to a long-long
789  #endif  #endif
790    if (end != str + n) return false;   // Leftover junk    if (end != str + n) return false;   // Leftover junk
791    if (errno) return false;    if (errno) return false;
792      if (dest == NULL) return true;
793    *(reinterpret_cast<long long*>(dest)) = r;    *(reinterpret_cast<long long*>(dest)) = r;
794    return true;    return true;
795  #endif   /* HAVE_LONG_LONG */  #endif   /* HAVE_LONG_LONG */
# Line 719  bool Arg::parse_ulonglong_radix(const ch Line 812  bool Arg::parse_ulonglong_radix(const ch
812    unsigned long long r = strtouq(str, &end, radix);    unsigned long long r = strtouq(str, &end, radix);
813  #elif defined HAVE_STRTOLL  #elif defined HAVE_STRTOLL
814    unsigned long long r = strtoull(str, &end, radix);    unsigned long long r = strtoull(str, &end, radix);
815    #elif defined HAVE__STRTOI64
816      unsigned long long r = _strtoui64(str, &end, radix);
817  #else  #else
818  #error parse_ulonglong_radix: cannot convert input to a long-long  #error parse_ulonglong_radix: cannot convert input to a long-long
819  #endif  #endif
820    if (end != str + n) return false;   // Leftover junk    if (end != str + n) return false;   // Leftover junk
821    if (errno) return false;    if (errno) return false;
822      if (dest == NULL) return true;
823    *(reinterpret_cast<unsigned long long*>(dest)) = r;    *(reinterpret_cast<unsigned long long*>(dest)) = r;
824    return true;    return true;
825  #endif   /* HAVE_UNSIGNED_LONG_LONG */  #endif   /* HAVE_UNSIGNED_LONG_LONG */
# Line 741  bool Arg::parse_double(const char* str, Line 837  bool Arg::parse_double(const char* str,
837    double r = strtod(buf, &end);    double r = strtod(buf, &end);
838    if (end != buf + n) return false;   // Leftover junk    if (end != buf + n) return false;   // Leftover junk
839    if (errno) return false;    if (errno) return false;
840      if (dest == NULL) return true;
841    *(reinterpret_cast<double*>(dest)) = r;    *(reinterpret_cast<double*>(dest)) = r;
842    return true;    return true;
843  }  }
# Line 748  bool Arg::parse_double(const char* str, Line 845  bool Arg::parse_double(const char* str,
845  bool Arg::parse_float(const char* str, int n, void* dest) {  bool Arg::parse_float(const char* str, int n, void* dest) {
846    double r;    double r;
847    if (!parse_double(str, n, &r)) return false;    if (!parse_double(str, n, &r)) return false;
848      if (dest == NULL) return true;
849    *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);    *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
850    return true;    return true;
851  }  }
# Line 767  bool Arg::parse_float(const char* str, i Line 865  bool Arg::parse_float(const char* str, i
865      return parse_##name##_radix(str, n, dest, 0);                       \      return parse_##name##_radix(str, n, dest, 0);                       \
866    }    }
867    
868  DEFINE_INTEGER_PARSERS(short);  DEFINE_INTEGER_PARSERS(short)      /*                                   */
869  DEFINE_INTEGER_PARSERS(ushort);  DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
870  DEFINE_INTEGER_PARSERS(int);  DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
871  DEFINE_INTEGER_PARSERS(uint);  DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
872  DEFINE_INTEGER_PARSERS(long);  DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
873  DEFINE_INTEGER_PARSERS(ulong);  DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
874  DEFINE_INTEGER_PARSERS(longlong);  DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
875  DEFINE_INTEGER_PARSERS(ulonglong);  DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
876    
877  #undef DEFINE_INTEGER_PARSERS  #undef DEFINE_INTEGER_PARSERS
878    

Legend:
Removed from v.87  
changed lines
  Added in v.322

  ViewVC Help
Powered by ViewVC 1.1.5