/[pcre]/code/trunk/pcrecpp.cc
ViewVC logotype

Contents of /code/trunk/pcrecpp.cc

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1752 - (show annotations)
Sat Apr 20 15:06:04 2019 UTC (21 months ago) by ph10
File size: 34960 byte(s)
Fix lcc compiler issue.
1 // Copyright (c) 2010, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // Author: Sanjay Ghemawat
31
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
35
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <ctype.h>
39 #include <limits.h> /* for SHRT_MIN, USHRT_MAX, etc */
40 #include <string.h> /* for memcpy */
41 #include <assert.h>
42 #include <errno.h>
43 #include <string>
44 #include <algorithm>
45
46 #include "pcrecpp_internal.h"
47 #include "pcre.h"
48 #include "pcrecpp.h"
49 #include "pcre_stringpiece.h"
50
51
52 namespace pcrecpp {
53
// Upper bound on the number of capture arguments one matching call accepts.
static const int kMaxArgs = 16;
// Length of the offset vector passed to PCRE: three ints for the overall
// match plus three per argument (results + PCRE workspace).
static const int kVecSize = 3 * (kMaxArgs + 1);
57
58 // Special object that stands-in for no argument
59 Arg RE::no_arg((void*)NULL);
60
61 // This is for ABI compatibility with old versions of pcre (pre-7.6),
62 // which defined a global no_arg variable instead of putting it in the
63 // RE class. This works on GCC >= 3, at least. It definitely works
64 // for ELF, but may not for other object formats (Mach-O, for
65 // instance, does not support aliases.) We could probably have a more
66 // inclusive test if we ever needed it. (Note that not only the
67 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
68 // gnu-specific.)
69 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) \
70 && !defined(__INTEL_COMPILER) && !defined(__LCC__)
71 # define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x)
72 # define ULP_AS_STRING_INTERNAL(x) #x
73 # define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__)
74 extern Arg no_arg
75 __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
76 #endif
77
78 // If a regular expression has no error, its error_ field points here
79 static const string empty_string;
80
81 // If the user doesn't ask for any options, we just use this one
82 static RE_Options default_options;
83
// Items that may legitimately appear at the very start of a pattern, before
// any wrapping parenthesis is added (see RE::Compile). The table is kept in
// descending strcmp() order and is terminated by an empty string; entries
// ending in '=' are followed in the pattern by digits and a ')'.
// (PH June 2018)
static const char* const start_options[] = {
  "(*UTF8)",
  "(*UTF)",
  "(*UCP)",
  "(*NO_START_OPT)",
  "(*NO_AUTO_POSSESS)",
  "(*LIMIT_RECURSION=",
  "(*LIMIT_MATCH=",
  "(*CRLF)",
  "(*CR)",
  "(*BSR_UNICODE)",
  "(*BSR_ANYCRLF)",
  "(*ANYCRLF)",
  "(*ANY)",
  ""
};
101
102 void RE::Init(const string& pat, const RE_Options* options) {
103 pattern_ = pat;
104 if (options == NULL) {
105 options_ = default_options;
106 } else {
107 options_ = *options;
108 }
109 error_ = &empty_string;
110 re_full_ = NULL;
111 re_partial_ = NULL;
112
113 re_partial_ = Compile(UNANCHORED);
114 if (re_partial_ != NULL) {
115 re_full_ = Compile(ANCHOR_BOTH);
116 }
117 }
118
119 void RE::Cleanup() {
120 if (re_full_ != NULL) (*pcre_free)(re_full_);
121 if (re_partial_ != NULL) (*pcre_free)(re_partial_);
122 if (error_ != &empty_string) delete error_;
123 }
124
125
126 RE::~RE() {
127 Cleanup();
128 }
129
130
131 pcre* RE::Compile(Anchor anchor) {
132 // First, convert RE_Options into pcre options
133 int pcre_options = 0;
134 pcre_options = options_.all_options();
135
136 // Special treatment for anchoring. This is needed because at
137 // runtime pcre only provides an option for anchoring at the
138 // beginning of a string (unless you use offset).
139 //
140 // There are three types of anchoring we want:
141 // UNANCHORED Compile the original pattern, and use
142 // a pcre unanchored match.
143 // ANCHOR_START Compile the original pattern, and use
144 // a pcre anchored match.
145 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
146 // and use a pcre anchored match.
147
148 const char* compile_error;
149 int eoffset;
150 pcre* re;
151 if (anchor != ANCHOR_BOTH) {
152 re = pcre_compile(pattern_.c_str(), pcre_options,
153 &compile_error, &eoffset, NULL);
154 } else {
155 // Tack a '\z' at the end of RE. Parenthesize it first so that
156 // the '\z' applies to all top-level alternatives in the regexp.
157
158 /* When this code was written (for PCRE 6.0) it was enough just to
159 parenthesize the entire pattern. Unfortunately, when the feature of
160 starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
161 this code was never updated. This bug was not noticed till 2018, long after
162 PCRE became obsolescent and its maintainer no longer around. Since PCRE is
163 frozen, I have added a hack to check for all the existing "start of
164 pattern" specials - knowing that no new ones will ever be added. I am not a
165 C++ programmer, so the code style is no doubt crude. It is also
166 inefficient, but is only run when the pattern starts with "(*".
167 PH June 2018. */
168
169 string wrapped = "";
170
171 if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') {
172 int kk, klen, kmat;
173 for (;;) { // Loop for any number of leading items
174
175 for (kk = 0; start_options[kk][0] != 0; kk++) {
176 klen = strlen(start_options[kk]);
177 kmat = strncmp(pattern_.c_str(), start_options[kk], klen);
178 if (kmat >= 0) break;
179 }
180 if (kmat != 0) break; // Not found
181
182 // If the item ended in "=" we must copy digits up to ")".
183
184 if (start_options[kk][klen-1] == '=') {
185 while (isdigit(pattern_.c_str()[klen])) klen++;
186 if (pattern_.c_str()[klen] != ')') break; // Syntax error
187 klen++;
188 }
189
190 // Move the item from the pattern to the start of the wrapped string.
191
192 wrapped += pattern_.substr(0, klen);
193 pattern_.erase(0, klen);
194 }
195 }
196
197 // Wrap the rest of the pattern.
198
199 wrapped += "(?:"; // A non-counting grouping operator
200 wrapped += pattern_;
201 wrapped += ")\\z";
202 re = pcre_compile(wrapped.c_str(), pcre_options,
203 &compile_error, &eoffset, NULL);
204 }
205 if (re == NULL) {
206 if (error_ == &empty_string) error_ = new string(compile_error);
207 }
208 return re;
209 }
210
211 /***** Matching interfaces *****/
212
213 bool RE::FullMatch(const StringPiece& text,
214 const Arg& ptr1,
215 const Arg& ptr2,
216 const Arg& ptr3,
217 const Arg& ptr4,
218 const Arg& ptr5,
219 const Arg& ptr6,
220 const Arg& ptr7,
221 const Arg& ptr8,
222 const Arg& ptr9,
223 const Arg& ptr10,
224 const Arg& ptr11,
225 const Arg& ptr12,
226 const Arg& ptr13,
227 const Arg& ptr14,
228 const Arg& ptr15,
229 const Arg& ptr16) const {
230 const Arg* args[kMaxArgs];
231 int n = 0;
232 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
233 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
234 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
235 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
236 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
237 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
238 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
239 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
240 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
241 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
242 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
243 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
244 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
245 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
246 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
247 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
248 done:
249
250 int consumed;
251 int vec[kVecSize];
252 return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
253 }
254
255 bool RE::PartialMatch(const StringPiece& text,
256 const Arg& ptr1,
257 const Arg& ptr2,
258 const Arg& ptr3,
259 const Arg& ptr4,
260 const Arg& ptr5,
261 const Arg& ptr6,
262 const Arg& ptr7,
263 const Arg& ptr8,
264 const Arg& ptr9,
265 const Arg& ptr10,
266 const Arg& ptr11,
267 const Arg& ptr12,
268 const Arg& ptr13,
269 const Arg& ptr14,
270 const Arg& ptr15,
271 const Arg& ptr16) const {
272 const Arg* args[kMaxArgs];
273 int n = 0;
274 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
275 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
276 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
277 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
278 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
279 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
280 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
281 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
282 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
283 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
284 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
285 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
286 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
287 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
288 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
289 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
290 done:
291
292 int consumed;
293 int vec[kVecSize];
294 return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
295 }
296
297 bool RE::Consume(StringPiece* input,
298 const Arg& ptr1,
299 const Arg& ptr2,
300 const Arg& ptr3,
301 const Arg& ptr4,
302 const Arg& ptr5,
303 const Arg& ptr6,
304 const Arg& ptr7,
305 const Arg& ptr8,
306 const Arg& ptr9,
307 const Arg& ptr10,
308 const Arg& ptr11,
309 const Arg& ptr12,
310 const Arg& ptr13,
311 const Arg& ptr14,
312 const Arg& ptr15,
313 const Arg& ptr16) const {
314 const Arg* args[kMaxArgs];
315 int n = 0;
316 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
317 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
318 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
319 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
320 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
321 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
322 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
323 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
324 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
325 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
326 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
327 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
328 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
329 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
330 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
331 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
332 done:
333
334 int consumed;
335 int vec[kVecSize];
336 if (DoMatchImpl(*input, ANCHOR_START, &consumed,
337 args, n, vec, kVecSize)) {
338 input->remove_prefix(consumed);
339 return true;
340 } else {
341 return false;
342 }
343 }
344
345 bool RE::FindAndConsume(StringPiece* input,
346 const Arg& ptr1,
347 const Arg& ptr2,
348 const Arg& ptr3,
349 const Arg& ptr4,
350 const Arg& ptr5,
351 const Arg& ptr6,
352 const Arg& ptr7,
353 const Arg& ptr8,
354 const Arg& ptr9,
355 const Arg& ptr10,
356 const Arg& ptr11,
357 const Arg& ptr12,
358 const Arg& ptr13,
359 const Arg& ptr14,
360 const Arg& ptr15,
361 const Arg& ptr16) const {
362 const Arg* args[kMaxArgs];
363 int n = 0;
364 if (&ptr1 == &no_arg) { goto done; } args[n++] = &ptr1;
365 if (&ptr2 == &no_arg) { goto done; } args[n++] = &ptr2;
366 if (&ptr3 == &no_arg) { goto done; } args[n++] = &ptr3;
367 if (&ptr4 == &no_arg) { goto done; } args[n++] = &ptr4;
368 if (&ptr5 == &no_arg) { goto done; } args[n++] = &ptr5;
369 if (&ptr6 == &no_arg) { goto done; } args[n++] = &ptr6;
370 if (&ptr7 == &no_arg) { goto done; } args[n++] = &ptr7;
371 if (&ptr8 == &no_arg) { goto done; } args[n++] = &ptr8;
372 if (&ptr9 == &no_arg) { goto done; } args[n++] = &ptr9;
373 if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
374 if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
375 if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
376 if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
377 if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
378 if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
379 if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
380 done:
381
382 int consumed;
383 int vec[kVecSize];
384 if (DoMatchImpl(*input, UNANCHORED, &consumed,
385 args, n, vec, kVecSize)) {
386 input->remove_prefix(consumed);
387 return true;
388 } else {
389 return false;
390 }
391 }
392
393 bool RE::Replace(const StringPiece& rewrite,
394 string *str) const {
395 int vec[kVecSize];
396 int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
397 if (matches == 0)
398 return false;
399
400 string s;
401 if (!Rewrite(&s, rewrite, *str, vec, matches))
402 return false;
403
404 assert(vec[0] >= 0);
405 assert(vec[1] >= 0);
406 str->replace(vec[0], vec[1] - vec[0], s);
407 return true;
408 }
409
410 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
411 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
412 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
413
414 static int NewlineMode(int pcre_options) {
415 // TODO: if we can make it threadsafe, cache this var
416 int newline_mode = 0;
417 /* if (newline_mode) return newline_mode; */ // do this once it's cached
418 if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
419 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
420 newline_mode = (pcre_options &
421 (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
422 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
423 } else {
424 int newline;
425 pcre_config(PCRE_CONFIG_NEWLINE, &newline);
426 if (newline == 10)
427 newline_mode = PCRE_NEWLINE_LF;
428 else if (newline == 13)
429 newline_mode = PCRE_NEWLINE_CR;
430 else if (newline == 3338)
431 newline_mode = PCRE_NEWLINE_CRLF;
432 else if (newline == -1)
433 newline_mode = PCRE_NEWLINE_ANY;
434 else if (newline == -2)
435 newline_mode = PCRE_NEWLINE_ANYCRLF;
436 else
437 assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
438 }
439 return newline_mode;
440 }
441
442 int RE::GlobalReplace(const StringPiece& rewrite,
443 string *str) const {
444 int count = 0;
445 int vec[kVecSize];
446 string out;
447 int start = 0;
448 bool last_match_was_empty_string = false;
449
450 while (start <= static_cast<int>(str->length())) {
451 // If the previous match was for the empty string, we shouldn't
452 // just match again: we'll match in the same way and get an
453 // infinite loop. Instead, we do the match in a special way:
454 // anchored -- to force another try at the same position --
455 // and with a flag saying that this time, ignore empty matches.
456 // If this special match returns, that means there's a non-empty
457 // match at this position as well, and we can continue. If not,
458 // we do what perl does, and just advance by one.
459 // Notice that perl prints '@@@' for this;
460 // perl -le '$_ = "aa"; s/b*|aa/@/g; print'
461 int matches;
462 if (last_match_was_empty_string) {
463 matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
464 if (matches <= 0) {
465 int matchend = start + 1; // advance one character.
466 // If the current char is CR and we're in CRLF mode, skip LF too.
467 // Note it's better to call pcre_fullinfo() than to examine
468 // all_options(), since options_ could have changed bewteen
469 // compile-time and now, but this is simpler and safe enough.
470 // Modified by PH to add ANY and ANYCRLF.
471 if (matchend < static_cast<int>(str->length()) &&
472 (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
473 (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
474 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
475 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
476 matchend++;
477 }
478 // We also need to advance more than one char if we're in utf8 mode.
479 #ifdef SUPPORT_UTF
480 if (options_.utf8()) {
481 while (matchend < static_cast<int>(str->length()) &&
482 ((*str)[matchend] & 0xc0) == 0x80)
483 matchend++;
484 }
485 #endif
486 if (start < static_cast<int>(str->length()))
487 out.append(*str, start, matchend - start);
488 start = matchend;
489 last_match_was_empty_string = false;
490 continue;
491 }
492 } else {
493 matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
494 if (matches <= 0)
495 break;
496 }
497 int matchstart = vec[0], matchend = vec[1];
498 assert(matchstart >= start);
499 assert(matchend >= matchstart);
500 out.append(*str, start, matchstart - start);
501 Rewrite(&out, rewrite, *str, vec, matches);
502 start = matchend;
503 count++;
504 last_match_was_empty_string = (matchstart == matchend);
505 }
506
507 if (count == 0)
508 return 0;
509
510 if (start < static_cast<int>(str->length()))
511 out.append(*str, start, str->length() - start);
512 swap(out, *str);
513 return count;
514 }
515
516 bool RE::Extract(const StringPiece& rewrite,
517 const StringPiece& text,
518 string *out) const {
519 int vec[kVecSize];
520 int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
521 if (matches == 0)
522 return false;
523 out->erase();
524 return Rewrite(out, rewrite, text, vec, matches);
525 }
526
527 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
528 string result;
529
530 // Escape any ascii character not in [A-Za-z_0-9].
531 //
532 // Note that it's legal to escape a character even if it has no
533 // special meaning in a regular expression -- so this function does
534 // that. (This also makes it identical to the perl function of the
535 // same name; see `perldoc -f quotemeta`.) The one exception is
536 // escaping NUL: rather than doing backslash + NUL, like perl does,
537 // we do '\0', because pcre itself doesn't take embedded NUL chars.
538 for (int ii = 0; ii < unquoted.size(); ++ii) {
539 // Note that using 'isalnum' here raises the benchmark time from
540 // 32ns to 58ns:
541 if (unquoted[ii] == '\0') {
542 result += "\\0";
543 } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
544 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
545 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
546 unquoted[ii] != '_' &&
547 // If this is the part of a UTF8 or Latin1 character, we need
548 // to copy this byte without escaping. Experimentally this is
549 // what works correctly with the regexp library.
550 !(unquoted[ii] & 128)) {
551 result += '\\';
552 result += unquoted[ii];
553 } else {
554 result += unquoted[ii];
555 }
556 }
557
558 return result;
559 }
560
561 /***** Actual matching and rewriting code *****/
562
563 int RE::TryMatch(const StringPiece& text,
564 int startpos,
565 Anchor anchor,
566 bool empty_ok,
567 int *vec,
568 int vecsize) const {
569 pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
570 if (re == NULL) {
571 //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
572 return 0;
573 }
574
575 pcre_extra extra = { 0, 0, 0, 0, 0, 0, 0, 0 };
576 if (options_.match_limit() > 0) {
577 extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
578 extra.match_limit = options_.match_limit();
579 }
580 if (options_.match_limit_recursion() > 0) {
581 extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
582 extra.match_limit_recursion = options_.match_limit_recursion();
583 }
584
585 // int options = 0;
586 // Changed by PH as a result of bugzilla #1288
587 int options = (options_.all_options() & PCRE_NO_UTF8_CHECK);
588
589 if (anchor != UNANCHORED)
590 options |= PCRE_ANCHORED;
591 if (!empty_ok)
592 options |= PCRE_NOTEMPTY;
593
594 int rc = pcre_exec(re, // The regular expression object
595 &extra,
596 (text.data() == NULL) ? "" : text.data(),
597 text.size(),
598 startpos,
599 options,
600 vec,
601 vecsize);
602
603 // Handle errors
604 if (rc == PCRE_ERROR_NOMATCH) {
605 return 0;
606 } else if (rc < 0) {
607 //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
608 // re, pattern_.c_str());
609 return 0;
610 } else if (rc == 0) {
611 // pcre_exec() returns 0 as a special case when the number of
612 // capturing subpatterns exceeds the size of the vector.
613 // When this happens, there is a match and the output vector
614 // is filled, but we miss out on the positions of the extra subpatterns.
615 rc = vecsize / 2;
616 }
617
618 return rc;
619 }
620
621 bool RE::DoMatchImpl(const StringPiece& text,
622 Anchor anchor,
623 int* consumed,
624 const Arg* const* args,
625 int n,
626 int* vec,
627 int vecsize) const {
628 assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
629 int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
630 assert(matches >= 0); // TryMatch never returns negatives
631 if (matches == 0)
632 return false;
633
634 *consumed = vec[1];
635
636 if (n == 0 || args == NULL) {
637 // We are not interested in results
638 return true;
639 }
640
641 if (NumberOfCapturingGroups() < n) {
642 // RE has fewer capturing groups than number of arg pointers passed in
643 return false;
644 }
645
646 // If we got here, we must have matched the whole pattern.
647 // We do not need (can not do) any more checks on the value of 'matches' here
648 // -- see the comment for TryMatch.
649 for (int i = 0; i < n; i++) {
650 const int start = vec[2*(i+1)];
651 const int limit = vec[2*(i+1)+1];
652 if (!args[i]->Parse(text.data() + start, limit-start)) {
653 // TODO: Should we indicate what the error was?
654 return false;
655 }
656 }
657
658 return true;
659 }
660
661 bool RE::DoMatch(const StringPiece& text,
662 Anchor anchor,
663 int* consumed,
664 const Arg* const args[],
665 int n) const {
666 assert(n >= 0);
667 size_t const vecsize = (1 + n) * 3; // results + PCRE workspace
668 // (as for kVecSize)
669 int space[21]; // use stack allocation for small vecsize (common case)
670 int* vec = vecsize <= 21 ? space : new int[vecsize];
671 bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
672 if (vec != space) delete [] vec;
673 return retval;
674 }
675
676 bool RE::Rewrite(string *out, const StringPiece &rewrite,
677 const StringPiece &text, int *vec, int veclen) const {
678 for (const char *s = rewrite.data(), *end = s + rewrite.size();
679 s < end; s++) {
680 int c = *s;
681 if (c == '\\') {
682 c = *++s;
683 if (isdigit(c)) {
684 int n = (c - '0');
685 if (n >= veclen) {
686 //fprintf(stderr, requested group %d in regexp %.*s\n",
687 // n, rewrite.size(), rewrite.data());
688 return false;
689 }
690 int start = vec[2 * n];
691 if (start >= 0)
692 out->append(text.data() + start, vec[2 * n + 1] - start);
693 } else if (c == '\\') {
694 *out += '\\';
695 } else {
696 //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
697 // rewrite.size(), rewrite.data());
698 return false;
699 }
700 } else {
701 *out += c;
702 }
703 }
704 return true;
705 }
706
707 // Return the number of capturing subpatterns, or -1 if the
708 // regexp wasn't valid on construction.
709 int RE::NumberOfCapturingGroups() const {
710 if (re_partial_ == NULL) return -1;
711
712 int result;
713 int pcre_retval = pcre_fullinfo(re_partial_, // The regular expression object
714 NULL, // We did not study the pattern
715 PCRE_INFO_CAPTURECOUNT,
716 &result);
717 assert(pcre_retval == 0);
718 return result;
719 }
720
721 /***** Parsers for various types *****/
722
723 bool Arg::parse_null(const char* str, int n, void* dest) {
724 (void)str;
725 (void)n;
726 // We fail if somebody asked us to store into a non-NULL void* pointer
727 return (dest == NULL);
728 }
729
730 bool Arg::parse_string(const char* str, int n, void* dest) {
731 if (dest == NULL) return true;
732 reinterpret_cast<string*>(dest)->assign(str, n);
733 return true;
734 }
735
736 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
737 if (dest == NULL) return true;
738 reinterpret_cast<StringPiece*>(dest)->set(str, n);
739 return true;
740 }
741
742 bool Arg::parse_char(const char* str, int n, void* dest) {
743 if (n != 1) return false;
744 if (dest == NULL) return true;
745 *(reinterpret_cast<char*>(dest)) = str[0];
746 return true;
747 }
748
749 bool Arg::parse_uchar(const char* str, int n, void* dest) {
750 if (n != 1) return false;
751 if (dest == NULL) return true;
752 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
753 return true;
754 }
755
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Prepares the "n" characters at "str" for one of the strtoxxx() routines,
// which need the number to be followed by something they will not consume.
//
// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
// REQUIRES str[n] must be readable (it is inspected to decide whether a
//          copy is needed)
// Returns one of:
//   a. "str" if no termination is needed
//   b. "buf" if the string was copied and null-terminated
//   c. "" if the input was invalid and has no hope of being parsed
static const char* TerminateNumber(char* buf, const char* str, int n) {
  // The <ctype.h> functions require a value representable as unsigned char,
  // so every character is converted before being classified.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  // See if the character right after the input text may potentially
  // look like a digit.
  const unsigned char next = static_cast<unsigned char>(str[n]);
  const bool digit_like = isdigit(next) ||
                          ((next >= 'a') && (next <= 'f')) ||
                          ((next >= 'A') && (next <= 'F'));
  // An 'x' or 'X' right after the text could be taken by strtoxxx() as part
  // of a "0x" prefix (radix 16 or 0), e.g. "0" followed by "x1F".
  const bool prefix_like = (next == 'x') || (next == 'X');

  if (digit_like && n > kMaxNumberLength) {
    return "";  // Input too big to be a valid number
  }
  if (digit_like || (prefix_like && n <= kMaxNumberLength)) {
    memcpy(buf, str, n);
    buf[n] = '\0';
    return buf;
  }

  // We can parse right out of the supplied string, so return it.
  return str;
}
787
788 bool Arg::parse_long_radix(const char* str,
789 int n,
790 void* dest,
791 int radix) {
792 if (n == 0) return false;
793 char buf[kMaxNumberLength+1];
794 str = TerminateNumber(buf, str, n);
795 char* end;
796 errno = 0;
797 long r = strtol(str, &end, radix);
798 if (end != str + n) return false; // Leftover junk
799 if (errno) return false;
800 if (dest == NULL) return true;
801 *(reinterpret_cast<long*>(dest)) = r;
802 return true;
803 }
804
805 bool Arg::parse_ulong_radix(const char* str,
806 int n,
807 void* dest,
808 int radix) {
809 if (n == 0) return false;
810 char buf[kMaxNumberLength+1];
811 str = TerminateNumber(buf, str, n);
812 if (str[0] == '-') return false; // strtoul() on a negative number?!
813 char* end;
814 errno = 0;
815 unsigned long r = strtoul(str, &end, radix);
816 if (end != str + n) return false; // Leftover junk
817 if (errno) return false;
818 if (dest == NULL) return true;
819 *(reinterpret_cast<unsigned long*>(dest)) = r;
820 return true;
821 }
822
823 bool Arg::parse_short_radix(const char* str,
824 int n,
825 void* dest,
826 int radix) {
827 long r;
828 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
829 if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range
830 if (dest == NULL) return true;
831 *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
832 return true;
833 }
834
835 bool Arg::parse_ushort_radix(const char* str,
836 int n,
837 void* dest,
838 int radix) {
839 unsigned long r;
840 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
841 if (r > USHRT_MAX) return false; // Out of range
842 if (dest == NULL) return true;
843 *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
844 return true;
845 }
846
847 bool Arg::parse_int_radix(const char* str,
848 int n,
849 void* dest,
850 int radix) {
851 long r;
852 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
853 if (r < INT_MIN || r > INT_MAX) return false; // Out of range
854 if (dest == NULL) return true;
855 *(reinterpret_cast<int*>(dest)) = r;
856 return true;
857 }
858
859 bool Arg::parse_uint_radix(const char* str,
860 int n,
861 void* dest,
862 int radix) {
863 unsigned long r;
864 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
865 if (r > UINT_MAX) return false; // Out of range
866 if (dest == NULL) return true;
867 *(reinterpret_cast<unsigned int*>(dest)) = r;
868 return true;
869 }
870
871 bool Arg::parse_longlong_radix(const char* str,
872 int n,
873 void* dest,
874 int radix) {
875 #ifndef HAVE_LONG_LONG
876 return false;
877 #else
878 if (n == 0) return false;
879 char buf[kMaxNumberLength+1];
880 str = TerminateNumber(buf, str, n);
881 char* end;
882 errno = 0;
883 #if defined HAVE_STRTOQ
884 long long r = strtoq(str, &end, radix);
885 #elif defined HAVE_STRTOLL
886 long long r = strtoll(str, &end, radix);
887 #elif defined HAVE__STRTOI64
888 long long r = _strtoi64(str, &end, radix);
889 #elif defined HAVE_STRTOIMAX
890 long long r = strtoimax(str, &end, radix);
891 #else
892 #error parse_longlong_radix: cannot convert input to a long-long
893 #endif
894 if (end != str + n) return false; // Leftover junk
895 if (errno) return false;
896 if (dest == NULL) return true;
897 *(reinterpret_cast<long long*>(dest)) = r;
898 return true;
899 #endif /* HAVE_LONG_LONG */
900 }
901
902 bool Arg::parse_ulonglong_radix(const char* str,
903 int n,
904 void* dest,
905 int radix) {
906 #ifndef HAVE_UNSIGNED_LONG_LONG
907 return false;
908 #else
909 if (n == 0) return false;
910 char buf[kMaxNumberLength+1];
911 str = TerminateNumber(buf, str, n);
912 if (str[0] == '-') return false; // strtoull() on a negative number?!
913 char* end;
914 errno = 0;
915 #if defined HAVE_STRTOQ
916 unsigned long long r = strtouq(str, &end, radix);
917 #elif defined HAVE_STRTOLL
918 unsigned long long r = strtoull(str, &end, radix);
919 #elif defined HAVE__STRTOI64
920 unsigned long long r = _strtoui64(str, &end, radix);
921 #elif defined HAVE_STRTOIMAX
922 unsigned long long r = strtoumax(str, &end, radix);
923 #else
924 #error parse_ulonglong_radix: cannot convert input to a long-long
925 #endif
926 if (end != str + n) return false; // Leftover junk
927 if (errno) return false;
928 if (dest == NULL) return true;
929 *(reinterpret_cast<unsigned long long*>(dest)) = r;
930 return true;
931 #endif /* HAVE_UNSIGNED_LONG_LONG */
932 }
933
934 bool Arg::parse_double(const char* str, int n, void* dest) {
935 if (n == 0) return false;
936 static const int kMaxLength = 200;
937 char buf[kMaxLength];
938 if (n >= kMaxLength) return false;
939 memcpy(buf, str, n);
940 buf[n] = '\0';
941 errno = 0;
942 char* end;
943 double r = strtod(buf, &end);
944 if (end != buf + n) return false; // Leftover junk
945 if (errno) return false;
946 if (dest == NULL) return true;
947 *(reinterpret_cast<double*>(dest)) = r;
948 return true;
949 }
950
951 bool Arg::parse_float(const char* str, int n, void* dest) {
952 double r;
953 if (!parse_double(str, n, &r)) return false;
954 if (dest == NULL) return true;
955 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
956 return true;
957 }
958
959
960 #define DEFINE_INTEGER_PARSERS(name) \
961 bool Arg::parse_##name(const char* str, int n, void* dest) { \
962 return parse_##name##_radix(str, n, dest, 10); \
963 } \
964 bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
965 return parse_##name##_radix(str, n, dest, 16); \
966 } \
967 bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
968 return parse_##name##_radix(str, n, dest, 8); \
969 } \
970 bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
971 return parse_##name##_radix(str, n, dest, 0); \
972 }
973
974 DEFINE_INTEGER_PARSERS(short) /* */
975 DEFINE_INTEGER_PARSERS(ushort) /* */
976 DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
977 DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
978 DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
979 DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
980 DEFINE_INTEGER_PARSERS(longlong) /* */
981 DEFINE_INTEGER_PARSERS(ulonglong) /* */
982
983 #undef DEFINE_INTEGER_PARSERS
984
985 } // namespace pcrecpp

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5