/[pcre]/code/trunk/pcrecpp.cc
ViewVC logotype

Contents of /code/trunk/pcrecpp.cc

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1735 - (show annotations)
Tue Jun 26 16:51:43 2018 UTC (2 years, 6 months ago) by ph10
File size: 34930 byte(s)
Fix two C++ wrapper bugs, unnoticed for years. 
1 // Copyright (c) 2010, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // Author: Sanjay Ghemawat
31
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
35
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <ctype.h>
39 #include <limits.h> /* for SHRT_MIN, USHRT_MAX, etc */
40 #include <string.h> /* for memcpy */
41 #include <assert.h>
42 #include <errno.h>
43 #include <string>
44 #include <algorithm>
45
46 #include "pcrecpp_internal.h"
47 #include "pcre.h"
48 #include "pcrecpp.h"
49 #include "pcre_stringpiece.h"
50
51
52 namespace pcrecpp {
53
54 // Maximum number of args we can set
55 static const int kMaxArgs = 16;
56 static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace
57
58 // Special object that stands-in for no argument
59 Arg RE::no_arg((void*)NULL);
60
61 // This is for ABI compatibility with old versions of pcre (pre-7.6),
62 // which defined a global no_arg variable instead of putting it in the
63 // RE class. This works on GCC >= 3, at least. It definitely works
64 // for ELF, but may not for other object formats (Mach-O, for
65 // instance, does not support aliases.) We could probably have a more
66 // inclusive test if we ever needed it. (Note that not only the
67 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
68 // gnu-specific.)
69 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) && !defined(__INTEL_COMPILER)
70 # define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x)
71 # define ULP_AS_STRING_INTERNAL(x) #x
72 # define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__)
73 extern Arg no_arg
74 __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
75 #endif
76
77 // If a regular expression has no error, its error_ field points here
78 static const string empty_string;
79
80 // If the user doesn't ask for any options, we just use this one
81 static RE_Options default_options;
82
83 // Specials for the start of patterns. See comments where start_options is used
84 // below. (PH June 2018)
85 static const char *start_options[] = {
86 "(*UTF8)",
87 "(*UTF)",
88 "(*UCP)",
89 "(*NO_START_OPT)",
90 "(*NO_AUTO_POSSESS)",
91 "(*LIMIT_RECURSION=",
92 "(*LIMIT_MATCH=",
93 "(*CRLF)",
94 "(*CR)",
95 "(*BSR_UNICODE)",
96 "(*BSR_ANYCRLF)",
97 "(*ANYCRLF)",
98 "(*ANY)",
99 "" };
100
101 void RE::Init(const string& pat, const RE_Options* options) {
102 pattern_ = pat;
103 if (options == NULL) {
104 options_ = default_options;
105 } else {
106 options_ = *options;
107 }
108 error_ = &empty_string;
109 re_full_ = NULL;
110 re_partial_ = NULL;
111
112 re_partial_ = Compile(UNANCHORED);
113 if (re_partial_ != NULL) {
114 re_full_ = Compile(ANCHOR_BOTH);
115 }
116 }
117
118 void RE::Cleanup() {
119 if (re_full_ != NULL) (*pcre_free)(re_full_);
120 if (re_partial_ != NULL) (*pcre_free)(re_partial_);
121 if (error_ != &empty_string) delete error_;
122 }
123
124
125 RE::~RE() {
126 Cleanup();
127 }
128
129
130 pcre* RE::Compile(Anchor anchor) {
131 // First, convert RE_Options into pcre options
132 int pcre_options = 0;
133 pcre_options = options_.all_options();
134
135 // Special treatment for anchoring. This is needed because at
136 // runtime pcre only provides an option for anchoring at the
137 // beginning of a string (unless you use offset).
138 //
139 // There are three types of anchoring we want:
140 // UNANCHORED Compile the original pattern, and use
141 // a pcre unanchored match.
142 // ANCHOR_START Compile the original pattern, and use
143 // a pcre anchored match.
144 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
145 // and use a pcre anchored match.
146
147 const char* compile_error;
148 int eoffset;
149 pcre* re;
150 if (anchor != ANCHOR_BOTH) {
151 re = pcre_compile(pattern_.c_str(), pcre_options,
152 &compile_error, &eoffset, NULL);
153 } else {
154 // Tack a '\z' at the end of RE. Parenthesize it first so that
155 // the '\z' applies to all top-level alternatives in the regexp.
156
157 /* When this code was written (for PCRE 6.0) it was enough just to
158 parenthesize the entire pattern. Unfortunately, when the feature of
159 starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
160 this code was never updated. This bug was not noticed till 2018, long after
161 PCRE became obsolescent and its maintainer no longer around. Since PCRE is
162 frozen, I have added a hack to check for all the existing "start of
163 pattern" specials - knowing that no new ones will ever be added. I am not a
164 C++ programmer, so the code style is no doubt crude. It is also
165 inefficient, but is only run when the pattern starts with "(*".
166 PH June 2018. */
167
168 string wrapped = "";
169
170 if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') {
171 int kk, klen, kmat;
172 for (;;) { // Loop for any number of leading items
173
174 for (kk = 0; start_options[kk][0] != 0; kk++) {
175 klen = strlen(start_options[kk]);
176 kmat = strncmp(pattern_.c_str(), start_options[kk], klen);
177 if (kmat >= 0) break;
178 }
179 if (kmat != 0) break; // Not found
180
181 // If the item ended in "=" we must copy digits up to ")".
182
183 if (start_options[kk][klen-1] == '=') {
184 while (isdigit(pattern_.c_str()[klen])) klen++;
185 if (pattern_.c_str()[klen] != ')') break; // Syntax error
186 klen++;
187 }
188
189 // Move the item from the pattern to the start of the wrapped string.
190
191 wrapped += pattern_.substr(0, klen);
192 pattern_.erase(0, klen);
193 }
194 }
195
196 // Wrap the rest of the pattern.
197
198 wrapped += "(?:"; // A non-counting grouping operator
199 wrapped += pattern_;
200 wrapped += ")\\z";
201 re = pcre_compile(wrapped.c_str(), pcre_options,
202 &compile_error, &eoffset, NULL);
203 }
204 if (re == NULL) {
205 if (error_ == &empty_string) error_ = new string(compile_error);
206 }
207 return re;
208 }
209
210 /***** Matching interfaces *****/
211
212 bool RE::FullMatch(const StringPiece& text,
213 const Arg& ptr1,
214 const Arg& ptr2,
215 const Arg& ptr3,
216 const Arg& ptr4,
217 const Arg& ptr5,
218 const Arg& ptr6,
219 const Arg& ptr7,
220 const Arg& ptr8,
221 const Arg& ptr9,
222 const Arg& ptr10,
223 const Arg& ptr11,
224 const Arg& ptr12,
225 const Arg& ptr13,
226 const Arg& ptr14,
227 const Arg& ptr15,
228 const Arg& ptr16) const {
229 const Arg* args[kMaxArgs];
230 int n = 0;
231 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
232 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
233 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
234 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
235 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
236 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
237 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
238 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
239 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
240 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
241 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
242 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
243 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
244 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
245 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
246 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
247 done:
248
249 int consumed;
250 int vec[kVecSize];
251 return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
252 }
253
254 bool RE::PartialMatch(const StringPiece& text,
255 const Arg& ptr1,
256 const Arg& ptr2,
257 const Arg& ptr3,
258 const Arg& ptr4,
259 const Arg& ptr5,
260 const Arg& ptr6,
261 const Arg& ptr7,
262 const Arg& ptr8,
263 const Arg& ptr9,
264 const Arg& ptr10,
265 const Arg& ptr11,
266 const Arg& ptr12,
267 const Arg& ptr13,
268 const Arg& ptr14,
269 const Arg& ptr15,
270 const Arg& ptr16) const {
271 const Arg* args[kMaxArgs];
272 int n = 0;
273 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
274 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
275 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
276 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
277 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
278 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
279 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
280 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
281 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
282 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
283 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
284 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
285 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
286 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
287 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
288 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
289 done:
290
291 int consumed;
292 int vec[kVecSize];
293 return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
294 }
295
296 bool RE::Consume(StringPiece* input,
297 const Arg& ptr1,
298 const Arg& ptr2,
299 const Arg& ptr3,
300 const Arg& ptr4,
301 const Arg& ptr5,
302 const Arg& ptr6,
303 const Arg& ptr7,
304 const Arg& ptr8,
305 const Arg& ptr9,
306 const Arg& ptr10,
307 const Arg& ptr11,
308 const Arg& ptr12,
309 const Arg& ptr13,
310 const Arg& ptr14,
311 const Arg& ptr15,
312 const Arg& ptr16) const {
313 const Arg* args[kMaxArgs];
314 int n = 0;
315 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
316 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
317 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
318 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
319 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
320 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
321 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
322 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
323 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
324 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
325 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
326 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
327 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
328 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
329 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
330 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
331 done:
332
333 int consumed;
334 int vec[kVecSize];
335 if (DoMatchImpl(*input, ANCHOR_START, &consumed,
336 args, n, vec, kVecSize)) {
337 input->remove_prefix(consumed);
338 return true;
339 } else {
340 return false;
341 }
342 }
343
344 bool RE::FindAndConsume(StringPiece* input,
345 const Arg& ptr1,
346 const Arg& ptr2,
347 const Arg& ptr3,
348 const Arg& ptr4,
349 const Arg& ptr5,
350 const Arg& ptr6,
351 const Arg& ptr7,
352 const Arg& ptr8,
353 const Arg& ptr9,
354 const Arg& ptr10,
355 const Arg& ptr11,
356 const Arg& ptr12,
357 const Arg& ptr13,
358 const Arg& ptr14,
359 const Arg& ptr15,
360 const Arg& ptr16) const {
361 const Arg* args[kMaxArgs];
362 int n = 0;
363 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
364 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
365 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
366 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
367 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
368 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
369 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
370 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
371 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
372 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
373 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
374 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
375 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
376 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
377 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
378 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
379 done:
380
381 int consumed;
382 int vec[kVecSize];
383 if (DoMatchImpl(*input, UNANCHORED, &consumed,
384 args, n, vec, kVecSize)) {
385 input->remove_prefix(consumed);
386 return true;
387 } else {
388 return false;
389 }
390 }
391
392 bool RE::Replace(const StringPiece& rewrite,
393 string *str) const {
394 int vec[kVecSize];
395 int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
396 if (matches == 0)
397 return false;
398
399 string s;
400 if (!Rewrite(&s, rewrite, *str, vec, matches))
401 return false;
402
403 assert(vec[0] >= 0);
404 assert(vec[1] >= 0);
405 str->replace(vec[0], vec[1] - vec[0], s);
406 return true;
407 }
408
409 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
410 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
411 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
412
413 static int NewlineMode(int pcre_options) {
414 // TODO: if we can make it threadsafe, cache this var
415 int newline_mode = 0;
416 /* if (newline_mode) return newline_mode; */ // do this once it's cached
417 if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
418 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
419 newline_mode = (pcre_options &
420 (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
421 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
422 } else {
423 int newline;
424 pcre_config(PCRE_CONFIG_NEWLINE, &newline);
425 if (newline == 10)
426 newline_mode = PCRE_NEWLINE_LF;
427 else if (newline == 13)
428 newline_mode = PCRE_NEWLINE_CR;
429 else if (newline == 3338)
430 newline_mode = PCRE_NEWLINE_CRLF;
431 else if (newline == -1)
432 newline_mode = PCRE_NEWLINE_ANY;
433 else if (newline == -2)
434 newline_mode = PCRE_NEWLINE_ANYCRLF;
435 else
436 assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
437 }
438 return newline_mode;
439 }
440
441 int RE::GlobalReplace(const StringPiece& rewrite,
442 string *str) const {
443 int count = 0;
444 int vec[kVecSize];
445 string out;
446 int start = 0;
447 bool last_match_was_empty_string = false;
448
449 while (start <= static_cast<int>(str->length())) {
450 // If the previous match was for the empty string, we shouldn't
451 // just match again: we'll match in the same way and get an
452 // infinite loop. Instead, we do the match in a special way:
453 // anchored -- to force another try at the same position --
454 // and with a flag saying that this time, ignore empty matches.
455 // If this special match returns, that means there's a non-empty
456 // match at this position as well, and we can continue. If not,
457 // we do what perl does, and just advance by one.
458 // Notice that perl prints '@@@' for this;
459 // perl -le '$_ = "aa"; s/b*|aa/@/g; print'
460 int matches;
461 if (last_match_was_empty_string) {
462 matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
463 if (matches <= 0) {
464 int matchend = start + 1; // advance one character.
465 // If the current char is CR and we're in CRLF mode, skip LF too.
466 // Note it's better to call pcre_fullinfo() than to examine
467 // all_options(), since options_ could have changed bewteen
468 // compile-time and now, but this is simpler and safe enough.
469 // Modified by PH to add ANY and ANYCRLF.
470 if (matchend < static_cast<int>(str->length()) &&
471 (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
472 (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
473 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
474 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
475 matchend++;
476 }
477 // We also need to advance more than one char if we're in utf8 mode.
478 #ifdef SUPPORT_UTF
479 if (options_.utf8()) {
480 while (matchend < static_cast<int>(str->length()) &&
481 ((*str)[matchend] & 0xc0) == 0x80)
482 matchend++;
483 }
484 #endif
485 if (start < static_cast<int>(str->length()))
486 out.append(*str, start, matchend - start);
487 start = matchend;
488 last_match_was_empty_string = false;
489 continue;
490 }
491 } else {
492 matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
493 if (matches <= 0)
494 break;
495 }
496 int matchstart = vec[0], matchend = vec[1];
497 assert(matchstart >= start);
498 assert(matchend >= matchstart);
499 out.append(*str, start, matchstart - start);
500 Rewrite(&out, rewrite, *str, vec, matches);
501 start = matchend;
502 count++;
503 last_match_was_empty_string = (matchstart == matchend);
504 }
505
506 if (count == 0)
507 return 0;
508
509 if (start < static_cast<int>(str->length()))
510 out.append(*str, start, str->length() - start);
511 swap(out, *str);
512 return count;
513 }
514
515 bool RE::Extract(const StringPiece& rewrite,
516 const StringPiece& text,
517 string *out) const {
518 int vec[kVecSize];
519 int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
520 if (matches == 0)
521 return false;
522 out->erase();
523 return Rewrite(out, rewrite, text, vec, matches);
524 }
525
526 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
527 string result;
528
529 // Escape any ascii character not in [A-Za-z_0-9].
530 //
531 // Note that it's legal to escape a character even if it has no
532 // special meaning in a regular expression -- so this function does
533 // that. (This also makes it identical to the perl function of the
534 // same name; see `perldoc -f quotemeta`.) The one exception is
535 // escaping NUL: rather than doing backslash + NUL, like perl does,
536 // we do '\0', because pcre itself doesn't take embedded NUL chars.
537 for (int ii = 0; ii < unquoted.size(); ++ii) {
538 // Note that using 'isalnum' here raises the benchmark time from
539 // 32ns to 58ns:
540 if (unquoted[ii] == '\0') {
541 result += "\\0";
542 } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
543 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
544 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
545 unquoted[ii] != '_' &&
546 // If this is the part of a UTF8 or Latin1 character, we need
547 // to copy this byte without escaping. Experimentally this is
548 // what works correctly with the regexp library.
549 !(unquoted[ii] & 128)) {
550 result += '\\';
551 result += unquoted[ii];
552 } else {
553 result += unquoted[ii];
554 }
555 }
556
557 return result;
558 }
559
560 /***** Actual matching and rewriting code *****/
561
562 int RE::TryMatch(const StringPiece& text,
563 int startpos,
564 Anchor anchor,
565 bool empty_ok,
566 int *vec,
567 int vecsize) const {
568 pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
569 if (re == NULL) {
570 //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
571 return 0;
572 }
573
574 pcre_extra extra = { 0, 0, 0, 0, 0, 0, 0, 0 };
575 if (options_.match_limit() > 0) {
576 extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
577 extra.match_limit = options_.match_limit();
578 }
579 if (options_.match_limit_recursion() > 0) {
580 extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
581 extra.match_limit_recursion = options_.match_limit_recursion();
582 }
583
584 // int options = 0;
585 // Changed by PH as a result of bugzilla #1288
586 int options = (options_.all_options() & PCRE_NO_UTF8_CHECK);
587
588 if (anchor != UNANCHORED)
589 options |= PCRE_ANCHORED;
590 if (!empty_ok)
591 options |= PCRE_NOTEMPTY;
592
593 int rc = pcre_exec(re, // The regular expression object
594 &extra,
595 (text.data() == NULL) ? "" : text.data(),
596 text.size(),
597 startpos,
598 options,
599 vec,
600 vecsize);
601
602 // Handle errors
603 if (rc == PCRE_ERROR_NOMATCH) {
604 return 0;
605 } else if (rc < 0) {
606 //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
607 // re, pattern_.c_str());
608 return 0;
609 } else if (rc == 0) {
610 // pcre_exec() returns 0 as a special case when the number of
611 // capturing subpatterns exceeds the size of the vector.
612 // When this happens, there is a match and the output vector
613 // is filled, but we miss out on the positions of the extra subpatterns.
614 rc = vecsize / 2;
615 }
616
617 return rc;
618 }
619
620 bool RE::DoMatchImpl(const StringPiece& text,
621 Anchor anchor,
622 int* consumed,
623 const Arg* const* args,
624 int n,
625 int* vec,
626 int vecsize) const {
627 assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
628 int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
629 assert(matches >= 0); // TryMatch never returns negatives
630 if (matches == 0)
631 return false;
632
633 *consumed = vec[1];
634
635 if (n == 0 || args == NULL) {
636 // We are not interested in results
637 return true;
638 }
639
640 if (NumberOfCapturingGroups() < n) {
641 // RE has fewer capturing groups than number of arg pointers passed in
642 return false;
643 }
644
645 // If we got here, we must have matched the whole pattern.
646 // We do not need (can not do) any more checks on the value of 'matches' here
647 // -- see the comment for TryMatch.
648 for (int i = 0; i < n; i++) {
649 const int start = vec[2*(i+1)];
650 const int limit = vec[2*(i+1)+1];
651 if (!args[i]->Parse(text.data() + start, limit-start)) {
652 // TODO: Should we indicate what the error was?
653 return false;
654 }
655 }
656
657 return true;
658 }
659
660 bool RE::DoMatch(const StringPiece& text,
661 Anchor anchor,
662 int* consumed,
663 const Arg* const args[],
664 int n) const {
665 assert(n >= 0);
666 size_t const vecsize = (1 + n) * 3; // results + PCRE workspace
667 // (as for kVecSize)
668 int space[21]; // use stack allocation for small vecsize (common case)
669 int* vec = vecsize <= 21 ? space : new int[vecsize];
670 bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
671 if (vec != space) delete [] vec;
672 return retval;
673 }
674
675 bool RE::Rewrite(string *out, const StringPiece &rewrite,
676 const StringPiece &text, int *vec, int veclen) const {
677 for (const char *s = rewrite.data(), *end = s + rewrite.size();
678 s < end; s++) {
679 int c = *s;
680 if (c == '\\') {
681 c = *++s;
682 if (isdigit(c)) {
683 int n = (c - '0');
684 if (n >= veclen) {
685 //fprintf(stderr, requested group %d in regexp %.*s\n",
686 // n, rewrite.size(), rewrite.data());
687 return false;
688 }
689 int start = vec[2 * n];
690 if (start >= 0)
691 out->append(text.data() + start, vec[2 * n + 1] - start);
692 } else if (c == '\\') {
693 *out += '\\';
694 } else {
695 //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
696 // rewrite.size(), rewrite.data());
697 return false;
698 }
699 } else {
700 *out += c;
701 }
702 }
703 return true;
704 }
705
706 // Return the number of capturing subpatterns, or -1 if the
707 // regexp wasn't valid on construction.
708 int RE::NumberOfCapturingGroups() const {
709 if (re_partial_ == NULL) return -1;
710
711 int result;
712 int pcre_retval = pcre_fullinfo(re_partial_, // The regular expression object
713 NULL, // We did not study the pattern
714 PCRE_INFO_CAPTURECOUNT,
715 &result);
716 assert(pcre_retval == 0);
717 return result;
718 }
719
720 /***** Parsers for various types *****/
721
722 bool Arg::parse_null(const char* str, int n, void* dest) {
723 (void)str;
724 (void)n;
725 // We fail if somebody asked us to store into a non-NULL void* pointer
726 return (dest == NULL);
727 }
728
729 bool Arg::parse_string(const char* str, int n, void* dest) {
730 if (dest == NULL) return true;
731 reinterpret_cast<string*>(dest)->assign(str, n);
732 return true;
733 }
734
735 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
736 if (dest == NULL) return true;
737 reinterpret_cast<StringPiece*>(dest)->set(str, n);
738 return true;
739 }
740
741 bool Arg::parse_char(const char* str, int n, void* dest) {
742 if (n != 1) return false;
743 if (dest == NULL) return true;
744 *(reinterpret_cast<char*>(dest)) = str[0];
745 return true;
746 }
747
748 bool Arg::parse_uchar(const char* str, int n, void* dest) {
749 if (n != 1) return false;
750 if (dest == NULL) return true;
751 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
752 return true;
753 }
754
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Prepares the `n` characters at `str` for one of the strtoxxx() routines,
// which need to stop exactly at the end of the matched text.
//
// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
// Note that the character right after the matched text, str[n], is read.
// Returns one of:
//      a. "str" if no termination is needed
//      b. "buf" if the string was copied and null-terminated
//      c. ""    if the input was invalid and has no hope of being parsed
static const char* TerminateNumber(char* buf, const char* str, int n) {
  // The <ctype.h> functions require values representable as unsigned char.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  // See if the character right after the input text may potentially
  // look like a digit.
  const unsigned char next = static_cast<unsigned char>(str[n]);
  const bool digit_like = isdigit(next) ||
                          ((next >= 'a') && (next <= 'f')) ||
                          ((next >= 'A') && (next <= 'F'));
  if (digit_like) {
    if (n > kMaxNumberLength) return "";  // Input too big to be a valid number
    memcpy(buf, str, n);
    buf[n] = '\0';
    return buf;
  }

  // An 'x' right after the text is a hazard too: when the text is "0",
  // strtoxxx() with radix 16 or 0 would read "0x<hex digits>" as a prefixed
  // number and run past the end of the text.  Terminate whenever the text
  // fits in buf; longer text cannot consist of just the "0" of a prefix.
  if ((next == 'x' || next == 'X') && n <= kMaxNumberLength) {
    memcpy(buf, str, n);
    buf[n] = '\0';
    return buf;
  }

  // We can parse right out of the supplied string, so return it.
  return str;
}
786
787 bool Arg::parse_long_radix(const char* str,
788 int n,
789 void* dest,
790 int radix) {
791 if (n == 0) return false;
792 char buf[kMaxNumberLength+1];
793 str = TerminateNumber(buf, str, n);
794 char* end;
795 errno = 0;
796 long r = strtol(str, &end, radix);
797 if (end != str + n) return false; // Leftover junk
798 if (errno) return false;
799 if (dest == NULL) return true;
800 *(reinterpret_cast<long*>(dest)) = r;
801 return true;
802 }
803
804 bool Arg::parse_ulong_radix(const char* str,
805 int n,
806 void* dest,
807 int radix) {
808 if (n == 0) return false;
809 char buf[kMaxNumberLength+1];
810 str = TerminateNumber(buf, str, n);
811 if (str[0] == '-') return false; // strtoul() on a negative number?!
812 char* end;
813 errno = 0;
814 unsigned long r = strtoul(str, &end, radix);
815 if (end != str + n) return false; // Leftover junk
816 if (errno) return false;
817 if (dest == NULL) return true;
818 *(reinterpret_cast<unsigned long*>(dest)) = r;
819 return true;
820 }
821
822 bool Arg::parse_short_radix(const char* str,
823 int n,
824 void* dest,
825 int radix) {
826 long r;
827 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
828 if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range
829 if (dest == NULL) return true;
830 *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
831 return true;
832 }
833
834 bool Arg::parse_ushort_radix(const char* str,
835 int n,
836 void* dest,
837 int radix) {
838 unsigned long r;
839 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
840 if (r > USHRT_MAX) return false; // Out of range
841 if (dest == NULL) return true;
842 *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
843 return true;
844 }
845
846 bool Arg::parse_int_radix(const char* str,
847 int n,
848 void* dest,
849 int radix) {
850 long r;
851 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
852 if (r < INT_MIN || r > INT_MAX) return false; // Out of range
853 if (dest == NULL) return true;
854 *(reinterpret_cast<int*>(dest)) = r;
855 return true;
856 }
857
858 bool Arg::parse_uint_radix(const char* str,
859 int n,
860 void* dest,
861 int radix) {
862 unsigned long r;
863 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
864 if (r > UINT_MAX) return false; // Out of range
865 if (dest == NULL) return true;
866 *(reinterpret_cast<unsigned int*>(dest)) = r;
867 return true;
868 }
869
870 bool Arg::parse_longlong_radix(const char* str,
871 int n,
872 void* dest,
873 int radix) {
874 #ifndef HAVE_LONG_LONG
875 return false;
876 #else
877 if (n == 0) return false;
878 char buf[kMaxNumberLength+1];
879 str = TerminateNumber(buf, str, n);
880 char* end;
881 errno = 0;
882 #if defined HAVE_STRTOQ
883 long long r = strtoq(str, &end, radix);
884 #elif defined HAVE_STRTOLL
885 long long r = strtoll(str, &end, radix);
886 #elif defined HAVE__STRTOI64
887 long long r = _strtoi64(str, &end, radix);
888 #elif defined HAVE_STRTOIMAX
889 long long r = strtoimax(str, &end, radix);
890 #else
891 #error parse_longlong_radix: cannot convert input to a long-long
892 #endif
893 if (end != str + n) return false; // Leftover junk
894 if (errno) return false;
895 if (dest == NULL) return true;
896 *(reinterpret_cast<long long*>(dest)) = r;
897 return true;
898 #endif /* HAVE_LONG_LONG */
899 }
900
901 bool Arg::parse_ulonglong_radix(const char* str,
902 int n,
903 void* dest,
904 int radix) {
905 #ifndef HAVE_UNSIGNED_LONG_LONG
906 return false;
907 #else
908 if (n == 0) return false;
909 char buf[kMaxNumberLength+1];
910 str = TerminateNumber(buf, str, n);
911 if (str[0] == '-') return false; // strtoull() on a negative number?!
912 char* end;
913 errno = 0;
914 #if defined HAVE_STRTOQ
915 unsigned long long r = strtouq(str, &end, radix);
916 #elif defined HAVE_STRTOLL
917 unsigned long long r = strtoull(str, &end, radix);
918 #elif defined HAVE__STRTOI64
919 unsigned long long r = _strtoui64(str, &end, radix);
920 #elif defined HAVE_STRTOIMAX
921 unsigned long long r = strtoumax(str, &end, radix);
922 #else
923 #error parse_ulonglong_radix: cannot convert input to a long-long
924 #endif
925 if (end != str + n) return false; // Leftover junk
926 if (errno) return false;
927 if (dest == NULL) return true;
928 *(reinterpret_cast<unsigned long long*>(dest)) = r;
929 return true;
930 #endif /* HAVE_UNSIGNED_LONG_LONG */
931 }
932
933 bool Arg::parse_double(const char* str, int n, void* dest) {
934 if (n == 0) return false;
935 static const int kMaxLength = 200;
936 char buf[kMaxLength];
937 if (n >= kMaxLength) return false;
938 memcpy(buf, str, n);
939 buf[n] = '\0';
940 errno = 0;
941 char* end;
942 double r = strtod(buf, &end);
943 if (end != buf + n) return false; // Leftover junk
944 if (errno) return false;
945 if (dest == NULL) return true;
946 *(reinterpret_cast<double*>(dest)) = r;
947 return true;
948 }
949
950 bool Arg::parse_float(const char* str, int n, void* dest) {
951 double r;
952 if (!parse_double(str, n, &r)) return false;
953 if (dest == NULL) return true;
954 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
955 return true;
956 }
957
958
959 #define DEFINE_INTEGER_PARSERS(name) \
960 bool Arg::parse_##name(const char* str, int n, void* dest) { \
961 return parse_##name##_radix(str, n, dest, 10); \
962 } \
963 bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
964 return parse_##name##_radix(str, n, dest, 16); \
965 } \
966 bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
967 return parse_##name##_radix(str, n, dest, 8); \
968 } \
969 bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
970 return parse_##name##_radix(str, n, dest, 0); \
971 }
972
973 DEFINE_INTEGER_PARSERS(short) /* */
974 DEFINE_INTEGER_PARSERS(ushort) /* */
975 DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
976 DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
977 DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
978 DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
979 DEFINE_INTEGER_PARSERS(longlong) /* */
980 DEFINE_INTEGER_PARSERS(ulonglong) /* */
981
982 #undef DEFINE_INTEGER_PARSERS
983
984 } // namespace pcrecpp

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5