29 |
// |
// |
30 |
// Author: Sanjay Ghemawat |
// Author: Sanjay Ghemawat |
31 |
|
|
32 |
|
#ifdef HAVE_CONFIG_H |
33 |
|
#include "config.h" |
34 |
|
#endif |
35 |
|
|
36 |
#include <stdlib.h> |
#include <stdlib.h> |
37 |
#include <stdio.h> |
#include <stdio.h> |
38 |
#include <ctype.h> |
#include <ctype.h> |
40 |
#include <assert.h> |
#include <assert.h> |
41 |
#include <errno.h> |
#include <errno.h> |
42 |
#include <string> |
#include <string> |
43 |
#include "config.h" |
#include <algorithm> |
44 |
// We need this to compile the proper dll on windows/msys. This is copied |
|
45 |
// from pcre_internal.h. It would probably be better just to include that. |
#include "pcrecpp_internal.h" |
|
#define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */ |
|
46 |
#include "pcre.h" |
#include "pcre.h" |
|
#include "pcre_stringpiece.h" |
|
47 |
#include "pcrecpp.h" |
#include "pcrecpp.h" |
48 |
|
#include "pcre_stringpiece.h" |
49 |
|
|
50 |
|
|
51 |
namespace pcrecpp { |
namespace pcrecpp { |
55 |
static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace |
static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace |
56 |
|
|
57 |
// Special object that stands-in for no argument |
// Special object that stands-in for no argument |
58 |
Arg no_arg((void*)NULL); |
PCRECPP_EXP_DEFN Arg no_arg((void*)NULL); |
59 |
|
|
60 |
// If a regular expression has no error, its error_ field points here |
// If a regular expression has no error, its error_ field points here |
61 |
static const string empty_string; |
static const string empty_string; |
63 |
// If the user doesn't ask for any options, we just use this one |
// If the user doesn't ask for any options, we just use this one |
64 |
static RE_Options default_options; |
static RE_Options default_options; |
65 |
|
|
66 |
void RE::Init(const char* pat, const RE_Options* options) { |
void RE::Init(const string& pat, const RE_Options* options) { |
67 |
pattern_ = pat; |
pattern_ = pat; |
68 |
if (options == NULL) { |
if (options == NULL) { |
69 |
options_ = default_options; |
options_ = default_options; |
76 |
|
|
77 |
re_partial_ = Compile(UNANCHORED); |
re_partial_ = Compile(UNANCHORED); |
78 |
if (re_partial_ != NULL) { |
if (re_partial_ != NULL) { |
79 |
// Check for complicated patterns. The following change is |
re_full_ = Compile(ANCHOR_BOTH); |
|
// conservative in that it may treat some "simple" patterns |
|
|
// as "complex" (e.g., if the vertical bar is in a character |
|
|
// class or is escaped). But it seems good enough. |
|
|
if (strchr(pat, '|') == NULL) { |
|
|
// Simple pattern: we can use position-based checks to perform |
|
|
// fully anchored matches |
|
|
re_full_ = re_partial_; |
|
|
} else { |
|
|
// We need a special pattern for anchored matches |
|
|
re_full_ = Compile(ANCHOR_BOTH); |
|
|
} |
|
80 |
} |
} |
81 |
} |
} |
82 |
|
|
83 |
|
void RE::Cleanup() { |
84 |
|
if (re_full_ != NULL) (*pcre_free)(re_full_); |
85 |
|
if (re_partial_ != NULL) (*pcre_free)(re_partial_); |
86 |
|
if (error_ != &empty_string) delete error_; |
87 |
|
} |
88 |
|
|
89 |
|
|
90 |
RE::~RE() { |
RE::~RE() { |
91 |
if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_); |
Cleanup(); |
|
if (re_partial_ != NULL) (*pcre_free)(re_partial_); |
|
|
if (error_ != &empty_string) delete error_; |
|
92 |
} |
} |
93 |
|
|
94 |
|
|
95 |
pcre* RE::Compile(Anchor anchor) { |
pcre* RE::Compile(Anchor anchor) { |
96 |
// First, convert RE_Options into pcre options |
// First, convert RE_Options into pcre options |
97 |
int pcre_options = 0; |
int pcre_options = 0; |
98 |
if (options_.utf8()) |
pcre_options = options_.all_options(); |
|
pcre_options |= PCRE_UTF8; |
|
99 |
|
|
100 |
// Special treatment for anchoring. This is needed because at |
// Special treatment for anchoring. This is needed because at |
101 |
// runtime pcre only provides an option for anchoring at the |
// runtime pcre only provides an option for anchoring at the |
329 |
return true; |
return true; |
330 |
} |
} |
331 |
|
|
332 |
|
// Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF. |
333 |
|
// Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF. |
334 |
|
// Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF. |
335 |
|
|
336 |
|
static int NewlineMode(int pcre_options) { |
337 |
|
// TODO: if we can make it threadsafe, cache this var |
338 |
|
int newline_mode = 0; |
339 |
|
/* if (newline_mode) return newline_mode; */ // do this once it's cached |
340 |
|
if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| |
341 |
|
PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) { |
342 |
|
newline_mode = (pcre_options & |
343 |
|
(PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| |
344 |
|
PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)); |
345 |
|
} else { |
346 |
|
int newline; |
347 |
|
pcre_config(PCRE_CONFIG_NEWLINE, &newline); |
348 |
|
if (newline == 10) |
349 |
|
newline_mode = PCRE_NEWLINE_LF; |
350 |
|
else if (newline == 13) |
351 |
|
newline_mode = PCRE_NEWLINE_CR; |
352 |
|
else if (newline == 3338) |
353 |
|
newline_mode = PCRE_NEWLINE_CRLF; |
354 |
|
else if (newline == -1) |
355 |
|
newline_mode = PCRE_NEWLINE_ANY; |
356 |
|
else if (newline == -2) |
357 |
|
newline_mode = PCRE_NEWLINE_ANYCRLF; |
358 |
|
else |
359 |
|
assert("" == "Unexpected return value from pcre_config(NEWLINE)"); |
360 |
|
} |
361 |
|
return newline_mode; |
362 |
|
} |
363 |
|
|
364 |
int RE::GlobalReplace(const StringPiece& rewrite, |
int RE::GlobalReplace(const StringPiece& rewrite, |
365 |
string *str) const { |
string *str) const { |
366 |
int count = 0; |
int count = 0; |
379 |
if (matchstart == matchend && matchstart == lastend) { |
if (matchstart == matchend && matchstart == lastend) { |
380 |
// advance one character if we matched an empty string at the same |
// advance one character if we matched an empty string at the same |
381 |
// place as the last match occurred |
// place as the last match occurred |
382 |
if (start < static_cast<int>(str->length())) |
matchend = start + 1; |
383 |
out.push_back((*str)[start]); |
// If the current char is CR and we're in CRLF mode, skip LF too. |
384 |
start++; |
// Note it's better to call pcre_fullinfo() than to examine |
385 |
|
// all_options(), since options_ could have changed between |
386 |
|
// compile-time and now, but this is simpler and safe enough. |
387 |
|
// Modified by PH to add ANY and ANYCRLF. |
388 |
|
if (start+1 < static_cast<int>(str->length()) && |
389 |
|
(*str)[start] == '\r' && (*str)[start+1] == '\n' && |
390 |
|
(NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF || |
391 |
|
NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY || |
392 |
|
NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF) |
393 |
|
) { |
394 |
|
matchend++; |
395 |
|
} |
396 |
|
// We also need to advance more than one char if we're in utf8 mode. |
397 |
|
#ifdef SUPPORT_UTF8 |
398 |
|
if (options_.utf8()) { |
399 |
|
while (matchend < static_cast<int>(str->length()) && |
400 |
|
((*str)[matchend] & 0xc0) == 0x80) |
401 |
|
matchend++; |
402 |
|
} |
403 |
|
#endif |
404 |
|
if (matchend <= static_cast<int>(str->length())) |
405 |
|
out.append(*str, start, matchend - start); |
406 |
|
start = matchend; |
407 |
} else { |
} else { |
408 |
out.append(*str, start, matchstart - start); |
out.append(*str, start, matchstart - start); |
409 |
Rewrite(&out, rewrite, *str, vec, matches); |
Rewrite(&out, rewrite, *str, vec, matches); |
429 |
int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize); |
int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize); |
430 |
if (matches == 0) |
if (matches == 0) |
431 |
return false; |
return false; |
432 |
out->clear(); |
out->erase(); |
433 |
return Rewrite(out, rewrite, text, vec, matches); |
return Rewrite(out, rewrite, text, vec, matches); |
434 |
} |
} |
435 |
|
|
436 |
|
// Returns a copy of 'unquoted' in which every ASCII character outside
// [A-Za-z_0-9] is preceded by a backslash, so that the result can be used
// in a pattern to match 'unquoted' literally.
//
// Note that it's legal to escape a character even if it has no special
// meaning in a regular expression -- so this function does that.  (The
// original comment notes this mirrors the perl function of the same name;
// see `perldoc -f quotemeta`.)
/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
  string result;

  for (int ii = 0; ii < unquoted.size(); ++ii) {
    const char c = unquoted[ii];

    // A raw NUL cannot survive in a pattern that is handed to PCRE as a
    // C string: the pattern would end at the NUL and leave a dangling
    // backslash.  Emit a hex escape instead.  "\x00" is used rather than
    // "\0" because an octal escape would absorb any digits that follow.
    if (c == '\0') {
      result += "\\x00";
      continue;
    }

    // Note that using 'isalnum' here raises the benchmark time from
    // 32ns to 58ns:
    const bool is_word_char = (c >= 'a' && c <= 'z') ||
                              (c >= 'A' && c <= 'Z') ||
                              (c >= '0' && c <= '9') ||
                              c == '_';

    // If this is the part of a UTF8 or Latin1 character, we need to copy
    // this byte without escaping.  Experimentally this is what works
    // correctly with the regexp library.
    const bool is_high_byte = (c & 128) != 0;

    if (!is_word_char && !is_high_byte) {
      result += '\\';
    }
    result += c;
  }

  return result;
}
463 |
|
|
464 |
/***** Actual matching and rewriting code *****/ |
/***** Actual matching and rewriting code *****/ |
465 |
|
|
466 |
int RE::TryMatch(const StringPiece& text, |
int RE::TryMatch(const StringPiece& text, |
474 |
return 0; |
return 0; |
475 |
} |
} |
476 |
|
|
477 |
pcre_extra extra = { 0 }; |
pcre_extra extra = { 0, 0, 0, 0, 0, 0 }; |
478 |
if (options_.match_limit() > 0) { |
if (options_.match_limit() > 0) { |
479 |
extra.flags = PCRE_EXTRA_MATCH_LIMIT; |
extra.flags |= PCRE_EXTRA_MATCH_LIMIT; |
480 |
extra.match_limit = options_.match_limit(); |
extra.match_limit = options_.match_limit(); |
481 |
} |
} |
482 |
|
if (options_.match_limit_recursion() > 0) { |
483 |
|
extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; |
484 |
|
extra.match_limit_recursion = options_.match_limit_recursion(); |
485 |
|
} |
486 |
int rc = pcre_exec(re, // The regular expression object |
int rc = pcre_exec(re, // The regular expression object |
487 |
&extra, |
&extra, |
488 |
text.data(), |
(text.data() == NULL) ? "" : text.data(), |
489 |
text.size(), |
text.size(), |
490 |
startpos, |
startpos, |
491 |
(anchor == UNANCHORED) ? 0 : PCRE_ANCHORED, |
(anchor == UNANCHORED) ? 0 : PCRE_ANCHORED, |
507 |
rc = vecsize / 2; |
rc = vecsize / 2; |
508 |
} |
} |
509 |
|
|
|
if ((anchor == ANCHOR_BOTH) && (re_full_ == re_partial_)) { |
|
|
// We need an extra check to make sure that the match extended |
|
|
// to the end of the input string |
|
|
assert(vec[0] == 0); // PCRE_ANCHORED forces starting match |
|
|
if (vec[1] != text.size()) return 0; // Did not get ending match |
|
|
} |
|
|
|
|
510 |
return rc; |
return rc; |
511 |
} |
} |
512 |
|
|
525 |
|
|
526 |
*consumed = vec[1]; |
*consumed = vec[1]; |
527 |
|
|
528 |
if (args == NULL) { |
if (n == 0 || args == NULL) { |
529 |
// We are not interested in results |
// We are not interested in results |
530 |
return true; |
return true; |
531 |
} |
} |
532 |
|
|
533 |
|
if (NumberOfCapturingGroups() < n) { |
534 |
|
// RE has fewer capturing groups than number of arg pointers passed in |
535 |
|
return false; |
536 |
|
} |
537 |
|
|
538 |
// If we got here, we must have matched the whole pattern. |
// If we got here, we must have matched the whole pattern. |
539 |
// We do not need (can not do) any more checks on the value of 'matches' here |
// We do not need (can not do) any more checks on the value of 'matches' here |
540 |
// -- see the comment for TryMatch. |
// -- see the comment for TryMatch. |
598 |
|
|
599 |
// Return the number of capturing subpatterns, or -1 if the |
// Return the number of capturing subpatterns, or -1 if the |
600 |
// regexp wasn't valid on construction. |
// regexp wasn't valid on construction. |
601 |
int RE::NumberOfCapturingGroups() { |
int RE::NumberOfCapturingGroups() const { |
602 |
if (re_partial_ == NULL) return -1; |
if (re_partial_ == NULL) return -1; |
603 |
|
|
604 |
int result; |
int result; |
694 |
if (n == 0) return false; |
if (n == 0) return false; |
695 |
char buf[kMaxNumberLength+1]; |
char buf[kMaxNumberLength+1]; |
696 |
str = TerminateNumber(buf, str, n); |
str = TerminateNumber(buf, str, n); |
697 |
|
if (str[0] == '-') return false; // strtoul() on a negative number?! |
698 |
char* end; |
char* end; |
699 |
errno = 0; |
errno = 0; |
700 |
unsigned long r = strtoul(str, &end, radix); |
unsigned long r = strtoul(str, &end, radix); |
711 |
long r; |
long r; |
712 |
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
713 |
if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range |
if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range |
714 |
*(reinterpret_cast<short*>(dest)) = r; |
*(reinterpret_cast<short*>(dest)) = static_cast<short>(r); |
715 |
return true; |
return true; |
716 |
} |
} |
717 |
|
|
722 |
unsigned long r; |
unsigned long r; |
723 |
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
724 |
if (r > USHRT_MAX) return false; // Out of range |
if (r > USHRT_MAX) return false; // Out of range |
725 |
*(reinterpret_cast<unsigned short*>(dest)) = r; |
*(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r); |
726 |
return true; |
return true; |
727 |
} |
} |
728 |
|
|
764 |
long long r = strtoq(str, &end, radix); |
long long r = strtoq(str, &end, radix); |
765 |
#elif defined HAVE_STRTOLL |
#elif defined HAVE_STRTOLL |
766 |
long long r = strtoll(str, &end, radix); |
long long r = strtoll(str, &end, radix); |
767 |
|
#elif defined HAVE__STRTOI64 |
768 |
|
long long r = _strtoi64(str, &end, radix); |
769 |
#else |
#else |
770 |
#error parse_longlong_radix: cannot convert input to a long-long |
#error parse_longlong_radix: cannot convert input to a long-long |
771 |
#endif |
#endif |
786 |
if (n == 0) return false; |
if (n == 0) return false; |
787 |
char buf[kMaxNumberLength+1]; |
char buf[kMaxNumberLength+1]; |
788 |
str = TerminateNumber(buf, str, n); |
str = TerminateNumber(buf, str, n); |
789 |
|
if (str[0] == '-') return false; // strtoull() on a negative number?! |
790 |
char* end; |
char* end; |
791 |
errno = 0; |
errno = 0; |
792 |
#if defined HAVE_STRTOQ |
#if defined HAVE_STRTOQ |
793 |
unsigned long long r = strtouq(str, &end, radix); |
unsigned long long r = strtouq(str, &end, radix); |
794 |
#elif defined HAVE_STRTOLL |
#elif defined HAVE_STRTOLL |
795 |
unsigned long long r = strtoull(str, &end, radix); |
unsigned long long r = strtoull(str, &end, radix); |
796 |
|
#elif defined HAVE__STRTOI64 |
797 |
|
unsigned long long r = _strtoui64(str, &end, radix); |
798 |
#else |
#else |
799 |
#error parse_ulonglong_radix: cannot convert input to a long-long |
#error parse_ulonglong_radix: cannot convert input to a long-long |
800 |
#endif |
#endif |
843 |
return parse_##name##_radix(str, n, dest, 0); \ |
return parse_##name##_radix(str, n, dest, 0); \ |
844 |
} |
} |
845 |
|
|
846 |
DEFINE_INTEGER_PARSERS(short); |
DEFINE_INTEGER_PARSERS(short) /* */ |
847 |
DEFINE_INTEGER_PARSERS(ushort); |
DEFINE_INTEGER_PARSERS(ushort) /* */ |
848 |
DEFINE_INTEGER_PARSERS(int); |
DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */ |
849 |
DEFINE_INTEGER_PARSERS(uint); |
DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */ |
850 |
DEFINE_INTEGER_PARSERS(long); |
DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */ |
851 |
DEFINE_INTEGER_PARSERS(ulong); |
DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */ |
852 |
DEFINE_INTEGER_PARSERS(longlong); |
DEFINE_INTEGER_PARSERS(longlong) /* */ |
853 |
DEFINE_INTEGER_PARSERS(ulonglong); |
DEFINE_INTEGER_PARSERS(ulonglong) /* */ |
854 |
|
|
855 |
#undef DEFINE_INTEGER_PARSERS |
#undef DEFINE_INTEGER_PARSERS |
856 |
|
|