/[pcre]/code/trunk/pcrecpp.cc
ViewVC logotype

Diff of /code/trunk/pcrecpp.cc

Parent Directory | Revision Log | View Patch

revision 77 by nigel, Sat Feb 24 21:40:45 2007 UTC | revision 253 by ph10, Mon Sep 17 10:51:30 2007 UTC
# Line 29  Line 29 
29  //  //
30  // Author: Sanjay Ghemawat  // Author: Sanjay Ghemawat
31    
32    #ifdef HAVE_CONFIG_H
33    #include "config.h"
34    #endif
35    
36    #ifdef HAVE_WINDOWS_H
37    #define HAVE_STRTOQ 1
38    #define strtoll     _strtoui64
39    #define strtoull    _strtoi64
40    #endif
41    
42  #include <stdlib.h>  #include <stdlib.h>
43  #include <stdio.h>  #include <stdio.h>
44  #include <ctype.h>  #include <ctype.h>
# Line 36  Line 46 
46  #include <assert.h>  #include <assert.h>
47  #include <errno.h>  #include <errno.h>
48  #include <string>  #include <string>
49  #include "config.h"  #include <algorithm>
50  // We need this to compile the proper dll on windows/msys.  This is copied  
51  // from pcre_internal.h.  It would probably be better just to include that.  #include "pcrecpp_internal.h"
 #define PCRE_DEFINITION  /* Win32 __declspec(export) trigger for .dll */  
52  #include "pcre.h"  #include "pcre.h"
 #include "pcre_stringpiece.h"  
53  #include "pcrecpp.h"  #include "pcrecpp.h"
54    #include "pcre_stringpiece.h"
55    
56    
57  namespace pcrecpp {  namespace pcrecpp {
# Line 52  static const int kMaxArgs = 16; Line 61  static const int kMaxArgs = 16;
61  static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace  static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
62    
63  // Special object that stands-in for no argument  // Special object that stands-in for no argument
64  Arg no_arg((void*)NULL);  PCRECPP_EXP_DEFN Arg no_arg((void*)NULL);
65    
66  // If a regular expression has no error, its error_ field points here  // If a regular expression has no error, its error_ field points here
67  static const string empty_string;  static const string empty_string;
# Line 60  static const string empty_string; Line 69  static const string empty_string;
69  // If the user doesn't ask for any options, we just use this one  // If the user doesn't ask for any options, we just use this one
70  static RE_Options default_options;  static RE_Options default_options;
71    
72  void RE::Init(const char* pat, const RE_Options* options) {  void RE::Init(const string& pat, const RE_Options* options) {
73    pattern_ = pat;    pattern_ = pat;
74    if (options == NULL) {    if (options == NULL) {
75      options_ = default_options;      options_ = default_options;
# Line 73  void RE::Init(const char* pat, const RE_ Line 82  void RE::Init(const char* pat, const RE_
82    
83    re_partial_ = Compile(UNANCHORED);    re_partial_ = Compile(UNANCHORED);
84    if (re_partial_ != NULL) {    if (re_partial_ != NULL) {
85      // Check for complicated patterns.  The following change is      re_full_ = Compile(ANCHOR_BOTH);
     // conservative in that it may treat some "simple" patterns  
     // as "complex" (e.g., if the vertical bar is in a character  
     // class or is escaped).  But it seems good enough.  
     if (strchr(pat, '|') == NULL) {  
       // Simple pattern: we can use position-based checks to perform  
       // fully anchored matches  
       re_full_ = re_partial_;  
     } else {  
       // We need a special pattern for anchored matches  
       re_full_ = Compile(ANCHOR_BOTH);  
     }  
86    }    }
87  }  }
88    
89    void RE::Cleanup() {
90      if (re_full_ != NULL)         (*pcre_free)(re_full_);
91      if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
92      if (error_ != &empty_string)  delete error_;
93    }
94    
95    
96  RE::~RE() {  RE::~RE() {
97    if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_);    Cleanup();
   if (re_partial_ != NULL)                         (*pcre_free)(re_partial_);  
   if (error_ != &empty_string)                     delete error_;  
98  }  }
99    
100    
101  pcre* RE::Compile(Anchor anchor) {  pcre* RE::Compile(Anchor anchor) {
102    // First, convert RE_Options into pcre options    // First, convert RE_Options into pcre options
103    int pcre_options = 0;    int pcre_options = 0;
104    if (options_.utf8())    pcre_options = options_.all_options();
     pcre_options |= PCRE_UTF8;  
105    
106    // Special treatment for anchoring.  This is needed because at    // Special treatment for anchoring.  This is needed because at
107    // runtime pcre only provides an option for anchoring at the    // runtime pcre only provides an option for anchoring at the
# Line 332  bool RE::Replace(const StringPiece& rewr Line 335  bool RE::Replace(const StringPiece& rewr
335    return true;    return true;
336  }  }
337    
338    // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
339    // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
340    // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
341    
342    static int NewlineMode(int pcre_options) {
343      // TODO: if we can make it threadsafe, cache this var
344      int newline_mode = 0;
345      /* if (newline_mode) return newline_mode; */  // do this once it's cached
346      if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
347                          PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
348        newline_mode = (pcre_options &
349                        (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
350                         PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
351      } else {
352        int newline;
353        pcre_config(PCRE_CONFIG_NEWLINE, &newline);
354        if (newline == 10)
355          newline_mode = PCRE_NEWLINE_LF;
356        else if (newline == 13)
357          newline_mode = PCRE_NEWLINE_CR;
358        else if (newline == 3338)
359          newline_mode = PCRE_NEWLINE_CRLF;
360        else if (newline == -1)
361          newline_mode = PCRE_NEWLINE_ANY;
362        else if (newline == -2)
363          newline_mode = PCRE_NEWLINE_ANYCRLF;
364        else
365          assert("" == "Unexpected return value from pcre_config(NEWLINE)");
366      }
367      return newline_mode;
368    }
369    
370  int RE::GlobalReplace(const StringPiece& rewrite,  int RE::GlobalReplace(const StringPiece& rewrite,
371                        string *str) const {                        string *str) const {
372    int count = 0;    int count = 0;
# Line 350  int RE::GlobalReplace(const StringPiece& Line 385  int RE::GlobalReplace(const StringPiece&
385      if (matchstart == matchend && matchstart == lastend) {      if (matchstart == matchend && matchstart == lastend) {
386        // advance one character if we matched an empty string at the same        // advance one character if we matched an empty string at the same
387        // place as the last match occurred        // place as the last match occurred
388        if (start < static_cast<int>(str->length()))        matchend = start + 1;
389          out.push_back((*str)[start]);        // If the current char is CR and we're in CRLF mode, skip LF too.
390        start++;        // Note it's better to call pcre_fullinfo() than to examine
391          // all_options(), since options_ could have changed between
392          // compile-time and now, but this is simpler and safe enough.
393          // Modified by PH to add ANY and ANYCRLF.
394          if (start+1 < static_cast<int>(str->length()) &&
395              (*str)[start] == '\r' && (*str)[start+1] == '\n' &&
396              (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
397               NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
398               NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)
399              ) {
400            matchend++;
401          }
402          // We also need to advance more than one char if we're in utf8 mode.
403    #ifdef SUPPORT_UTF8
404          if (options_.utf8()) {
405            while (matchend < static_cast<int>(str->length()) &&
406                   ((*str)[matchend] & 0xc0) == 0x80)
407              matchend++;
408          }
409    #endif
410          if (matchend <= static_cast<int>(str->length()))
411            out.append(*str, start, matchend - start);
412          start = matchend;
413      } else {      } else {
414        out.append(*str, start, matchstart - start);        out.append(*str, start, matchstart - start);
415        Rewrite(&out, rewrite, *str, vec, matches);        Rewrite(&out, rewrite, *str, vec, matches);
# Line 378  bool RE::Extract(const StringPiece& rewr Line 435  bool RE::Extract(const StringPiece& rewr
435    int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize);    int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize);
436    if (matches == 0)    if (matches == 0)
437      return false;      return false;
438    out->clear();    out->erase();
439    return Rewrite(out, rewrite, text, vec, matches);    return Rewrite(out, rewrite, text, vec, matches);
440  }  }
441    
442    /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
443      string result;
444    
445      // Escape any ascii character not in [A-Za-z_0-9].
446      //
447      // Note that it's legal to escape a character even if it has no
448      // special meaning in a regular expression -- so this function does
449      // that.  (This also makes it identical to the perl function of the
450      // same name; see `perldoc -f quotemeta`.)
451      for (int ii = 0; ii < unquoted.size(); ++ii) {
452        // Note that using 'isalnum' here raises the benchmark time from
453        // 32ns to 58ns:
454        if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
455            (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
456            (unquoted[ii] < '0' || unquoted[ii] > '9') &&
457            unquoted[ii] != '_' &&
458            // If this is the part of a UTF8 or Latin1 character, we need
459            // to copy this byte without escaping.  Experimentally this is
460            // what works correctly with the regexp library.
461            !(unquoted[ii] & 128)) {
462          result += '\\';
463        }
464        result += unquoted[ii];
465      }
466    
467      return result;
468    }
469    
470  /***** Actual matching and rewriting code *****/  /***** Actual matching and rewriting code *****/
471    
472  int RE::TryMatch(const StringPiece& text,  int RE::TryMatch(const StringPiece& text,
# Line 395  int RE::TryMatch(const StringPiece& text Line 480  int RE::TryMatch(const StringPiece& text
480      return 0;      return 0;
481    }    }
482    
483    pcre_extra extra = { 0 };    pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
484    if (options_.match_limit() > 0) {    if (options_.match_limit() > 0) {
485      extra.flags = PCRE_EXTRA_MATCH_LIMIT;      extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
486      extra.match_limit = options_.match_limit();      extra.match_limit = options_.match_limit();
487    }    }
488      if (options_.match_limit_recursion() > 0) {
489        extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
490        extra.match_limit_recursion = options_.match_limit_recursion();
491      }
492    int rc = pcre_exec(re,              // The regular expression object    int rc = pcre_exec(re,              // The regular expression object
493                       &extra,                       &extra,
494                       text.data(),                       (text.data() == NULL) ? "" : text.data(),
495                       text.size(),                       text.size(),
496                       startpos,                       startpos,
497                       (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED,                       (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED,
# Line 424  int RE::TryMatch(const StringPiece& text Line 513  int RE::TryMatch(const StringPiece& text
513      rc = vecsize / 2;      rc = vecsize / 2;
514    }    }
515    
   if ((anchor == ANCHOR_BOTH) && (re_full_ == re_partial_)) {  
     // We need an extra check to make sure that the match extended  
     // to the end of the input string  
     assert(vec[0] == 0);                 // PCRE_ANCHORED forces starting match  
     if (vec[1] != text.size()) return 0; // Did not get ending match  
   }  
   
516    return rc;    return rc;
517  }  }
518    
# Line 449  bool RE::DoMatchImpl(const StringPiece& Line 531  bool RE::DoMatchImpl(const StringPiece&
531    
532    *consumed = vec[1];    *consumed = vec[1];
533    
534    if (args == NULL) {    if (n == 0 || args == NULL) {
535      // We are not interested in results      // We are not interested in results
536      return true;      return true;
537    }    }
538    
539      if (NumberOfCapturingGroups() < n) {
540        // RE has fewer capturing groups than number of arg pointers passed in
541        return false;
542      }
543    
544    // If we got here, we must have matched the whole pattern.    // If we got here, we must have matched the whole pattern.
545    // We do not need (can not do) any more checks on the value of 'matches' here    // We do not need (can not do) any more checks on the value of 'matches' here
546    // -- see the comment for TryMatch.    // -- see the comment for TryMatch.
# Line 517  bool RE::Rewrite(string *out, const Stri Line 604  bool RE::Rewrite(string *out, const Stri
604    
605  // Return the number of capturing subpatterns, or -1 if the  // Return the number of capturing subpatterns, or -1 if the
606  // regexp wasn't valid on construction.  // regexp wasn't valid on construction.
607  int RE::NumberOfCapturingGroups() {  int RE::NumberOfCapturingGroups() const {
608    if (re_partial_ == NULL) return -1;    if (re_partial_ == NULL) return -1;
609    
610    int result;    int result;
# Line 613  bool Arg::parse_ulong_radix(const char* Line 700  bool Arg::parse_ulong_radix(const char*
700    if (n == 0) return false;    if (n == 0) return false;
701    char buf[kMaxNumberLength+1];    char buf[kMaxNumberLength+1];
702    str = TerminateNumber(buf, str, n);    str = TerminateNumber(buf, str, n);
703      if (str[0] == '-') return false;    // strtoul() on a negative number?!
704    char* end;    char* end;
705    errno = 0;    errno = 0;
706    unsigned long r = strtoul(str, &end, radix);    unsigned long r = strtoul(str, &end, radix);
# Line 702  bool Arg::parse_ulonglong_radix(const ch Line 790  bool Arg::parse_ulonglong_radix(const ch
790    if (n == 0) return false;    if (n == 0) return false;
791    char buf[kMaxNumberLength+1];    char buf[kMaxNumberLength+1];
792    str = TerminateNumber(buf, str, n);    str = TerminateNumber(buf, str, n);
793      if (str[0] == '-') return false;    // strtoull() on a negative number?!
794    char* end;    char* end;
795    errno = 0;    errno = 0;
796  #if defined HAVE_STRTOQ  #if defined HAVE_STRTOQ
# Line 756  bool Arg::parse_float(const char* str, i Line 845  bool Arg::parse_float(const char* str, i
845      return parse_##name##_radix(str, n, dest, 0);                       \      return parse_##name##_radix(str, n, dest, 0);                       \
846    }    }
847    
848  DEFINE_INTEGER_PARSERS(short);  DEFINE_INTEGER_PARSERS(short)      /*                                   */
849  DEFINE_INTEGER_PARSERS(ushort);  DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
850  DEFINE_INTEGER_PARSERS(int);  DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
851  DEFINE_INTEGER_PARSERS(uint);  DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
852  DEFINE_INTEGER_PARSERS(long);  DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
853  DEFINE_INTEGER_PARSERS(ulong);  DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
854  DEFINE_INTEGER_PARSERS(longlong);  DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
855  DEFINE_INTEGER_PARSERS(ulonglong);  DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
856    
857  #undef DEFINE_INTEGER_PARSERS  #undef DEFINE_INTEGER_PARSERS
858    

Legend:
Removed from v.77  
changed lines
  Added in v.253

  ViewVC Help
Powered by ViewVC 1.1.5