28 |
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 |
// |
// |
30 |
// Author: Sanjay Ghemawat |
// Author: Sanjay Ghemawat |
31 |
|
// Support for PCRE_XXX modifiers added by Giuseppe Maxia, July 2005 |
32 |
|
|
33 |
#ifndef _PCRE_REGEXP_H |
#ifndef _PCRE_REGEXP_H |
34 |
#define _PCRE_REGEXP_H |
#define _PCRE_REGEXP_H |
160 |
// --enable-utf8 flag. |
// --enable-utf8 flag. |
161 |
// |
// |
162 |
// ----------------------------------------------------------------------- |
// ----------------------------------------------------------------------- |
163 |
|
// PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE |
164 |
|
// |
165 |
|
// PCRE defines some modifiers to change the behavior of the regular |
166 |
|
// expression engine. |
167 |
|
// The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle |
168 |
|
// to pass such modifiers to a RE class. |
169 |
|
// |
170 |
|
// Currently, the following modifiers are supported |
171 |
|
// |
172 |
|
// modifier description Perl corresponding |
173 |
|
// |
174 |
|
// PCRE_CASELESS case insensitive match /i |
175 |
|
// PCRE_MULTILINE multiple lines match /m |
176 |
|
// PCRE_DOTALL dot matches newlines /s |
177 |
|
// PCRE_DOLLAR_ENDONLY $ matches only at end N/A |
178 |
|
// PCRE_EXTRA strict escape parsing N/A |
179 |
|
// PCRE_EXTENDED ignore whitespaces /x |
180 |
|
// PCRE_UTF8 handles UTF8 chars built-in |
181 |
|
// PCRE_UNGREEDY reverses * and *? N/A |
182 |
|
// PCRE_NO_AUTO_CAPTURE disables matching parens N/A (*) |
183 |
|
// |
184 |
|
// (For a full account on how each modifier works, please check the |
185 |
|
// PCRE API reference manual). |
186 |
|
// |
187 |
|
// (*) Both Perl and PCRE allow non-capturing parentheses by means of the |
188 |
|
// "?:" modifier within the pattern itself. e.g. (?:ab|cd) does not |
189 |
|
// capture, while (ab|cd) does. |
190 |
|
// |
191 |
|
// For each modifier, there are two member functions whose name is made |
192 |
|
// out of the modifier in lowercase, without the "PCRE_" prefix. For |
193 |
|
// instance, PCRE_CASELESS is handled by |
194 |
|
// bool caseless(), |
195 |
|
// which returns true if the modifier is set, and |
196 |
|
// RE_Options & set_caseless(bool), |
197 |
|
// which sets or unsets the modifier. |
198 |
|
// |
199 |
|
// Moreover, PCRE_CONFIG_MATCH_LIMIT can be accessed through the |
200 |
|
// set_match_limit() and match_limit() member functions. |
201 |
|
// Setting match_limit to a non-zero value will limit the execution of |
202 |
|
// pcre to keep it from doing bad things like blowing the stack or taking |
203 |
|
// an eternity to return a result. A value of 5000 is good enough to stop |
204 |
|
// stack blowup in a 2MB thread stack. Setting match_limit to zero will |
205 |
|
// disable match limiting. |
206 |
|
// |
207 |
|
// Normally, to pass one or more modifiers to a RE class, you declare |
208 |
|
// a RE_Options object, set the appropriate options, and pass this |
209 |
|
// object to a RE constructor. Example: |
210 |
|
// |
211 |
|
// RE_Options opt; |
212 |
|
// opt.set_caseless(true); |
213 |
|
// |
214 |
|
// if (RE("HELLO", opt).PartialMatch("hello world")) ... |
215 |
|
// |
216 |
|
// RE_Options has two constructors. The default constructor takes no |
217 |
|
// arguments and creates a set of flags that are off by default. |
218 |
|
// |
219 |
|
// The optional parameter 'option_flags' is to facilitate transfer |
220 |
|
// of legacy code from C programs. This lets you do |
221 |
|
// RE(pattern, RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str); |
222 |
|
// |
223 |
|
// But new code is better off doing |
224 |
|
// RE(pattern, |
225 |
|
// RE_Options().set_caseless(true).set_multiline(true)).PartialMatch(str); |
226 |
|
// (See below) |
227 |
|
// |
228 |
|
// If you are going to pass one of the most used modifiers, there are some |
229 |
|
// convenience functions that return a RE_Options class with the |
230 |
|
// appropriate modifier already set: |
231 |
|
// CASELESS(), UTF8(), MULTILINE(), DOTALL(), EXTENDED() |
232 |
|
// |
233 |
|
// If you need to set several options at once, and you don't want to go |
234 |
|
// through the pains of declaring a RE_Options object and setting several |
235 |
|
// options, there is a parallel method that gives you such ability on the |
236 |
|
// fly. You can concatenate several set_xxxxx member functions, since each |
237 |
|
// of them returns a reference to its class object. e.g.: to pass |
238 |
|
// PCRE_CASELESS, PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one |
239 |
|
// statement, you may write |
240 |
|
// |
241 |
|
// RE(" ^ xyz \\s+ .* blah$", RE_Options() |
242 |
|
// .set_caseless(true) |
243 |
|
// .set_extended(true) |
244 |
|
// .set_multiline(true)).PartialMatch(sometext); |
245 |
|
// |
246 |
|
// ----------------------------------------------------------------------- |
247 |
// SCANNING TEXT INCREMENTALLY |
// SCANNING TEXT INCREMENTALLY |
248 |
// |
// |
249 |
// The "Consume" operation may be useful if you want to repeatedly |
// The "Consume" operation may be useful if you want to repeatedly |
330 |
|
|
331 |
namespace pcrecpp { |
namespace pcrecpp { |
332 |
|
|
333 |
|
#define PCRE_SET_OR_CLEAR(b, o) \ |
334 |
|
if (b) all_options_ |= (o); else all_options_ &= ~(o); \ |
335 |
|
return *this |
336 |
|
|
337 |
|
#define PCRE_IS_SET(o) \ |
338 |
|
(all_options_ & o) == o |
339 |
|
|
340 |
// We convert user-passed pointers into special Arg objects |
// We convert user-passed pointers into special Arg objects |
341 |
class Arg; |
class Arg; |
342 |
extern Arg no_arg; |
extern Arg no_arg; |
344 |
/***** Compiling regular expressions: the RE class *****/ |
/***** Compiling regular expressions: the RE class *****/ |
345 |
|
|
346 |
// RE_Options allows you to set options to be passed along to pcre, |
// RE_Options allows you to set options to be passed along to pcre, |
347 |
// along with other options we put on top of pcre. Only UTF and |
// along with other options we put on top of pcre. |
348 |
// match_limit are supported now. Setting match_limit |
// Only 9 modifiers, plus match_limit, are supported now. |
|
// to a non-zero value will limit the execution of pcre to |
|
|
// keep it from doing bad things like blowing the stack or taking |
|
|
// an eternity to return a result. A value of 5000 is good enough |
|
|
// to stop stack blowup in a 2MB thread stack. |
|
|
// Setting match_limit to zero will disable match limiting. |
|
349 |
class RE_Options { |
class RE_Options { |
350 |
public: |
public: |
351 |
// constructor |
// constructor |
352 |
RE_Options() : match_limit_(0), utf8_(false) {} |
RE_Options() : match_limit_(0), all_options_(0) {} |
353 |
|
|
354 |
|
// alternative constructor. |
355 |
|
// To facilitate transfer of legacy code from C programs |
356 |
|
// |
357 |
|
// This lets you do |
358 |
|
// RE(pattern, RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str); |
359 |
|
// But new code is better off doing |
360 |
|
// RE(pattern, |
361 |
|
// RE_Options().set_caseless(true).set_multiline(true)).PartialMatch(str); |
362 |
|
RE_Options(int option_flags) : match_limit_(0), all_options_ (option_flags) {} |
363 |
// we're fine with the default destructor, copy constructor, etc. |
// we're fine with the default destructor, copy constructor, etc. |
364 |
|
|
365 |
// accessors and mutators |
// accessors and mutators |
366 |
int match_limit() const { return match_limit_; }; |
int match_limit() const { return match_limit_; }; |
367 |
void set_match_limit(int limit) { |
RE_Options &set_match_limit(int limit) { |
368 |
match_limit_ = limit; |
match_limit_ = limit; |
369 |
|
return *this; |
370 |
} |
} |
371 |
|
|
372 |
bool utf8() const { return utf8_; } |
bool caseless() const { |
373 |
void set_utf8(bool u) { |
return PCRE_IS_SET(PCRE_CASELESS); |
374 |
utf8_ = u; |
} |
375 |
|
RE_Options &set_caseless(bool x) { |
376 |
|
PCRE_SET_OR_CLEAR(x, PCRE_CASELESS); |
377 |
|
} |
378 |
|
|
379 |
|
bool multiline() const { |
380 |
|
return PCRE_IS_SET(PCRE_MULTILINE); |
381 |
|
} |
382 |
|
RE_Options &set_multiline(bool x) { |
383 |
|
PCRE_SET_OR_CLEAR(x, PCRE_MULTILINE); |
384 |
|
} |
385 |
|
|
386 |
|
bool dotall() const { |
387 |
|
return PCRE_IS_SET(PCRE_DOTALL); |
388 |
|
} |
389 |
|
RE_Options &set_dotall(bool x) { |
390 |
|
PCRE_SET_OR_CLEAR(x,PCRE_DOTALL); |
391 |
|
} |
392 |
|
|
393 |
|
bool extended() const { |
394 |
|
return PCRE_IS_SET(PCRE_EXTENDED); |
395 |
|
} |
396 |
|
RE_Options &set_extended(bool x) { |
397 |
|
PCRE_SET_OR_CLEAR(x,PCRE_EXTENDED); |
398 |
|
} |
399 |
|
|
400 |
|
bool dollar_endonly() const { |
401 |
|
return PCRE_IS_SET(PCRE_DOLLAR_ENDONLY); |
402 |
|
} |
403 |
|
RE_Options &set_dollar_endonly(bool x) { |
404 |
|
PCRE_SET_OR_CLEAR(x,PCRE_DOLLAR_ENDONLY); |
405 |
|
} |
406 |
|
|
407 |
|
bool extra() const { |
408 |
|
return PCRE_IS_SET( PCRE_EXTRA); |
409 |
|
} |
410 |
|
RE_Options &set_extra(bool x) { |
411 |
|
PCRE_SET_OR_CLEAR(x, PCRE_EXTRA); |
412 |
|
} |
413 |
|
|
414 |
|
bool ungreedy() const { |
415 |
|
return PCRE_IS_SET(PCRE_UNGREEDY); |
416 |
|
} |
417 |
|
RE_Options &set_ungreedy(bool x) { |
418 |
|
PCRE_SET_OR_CLEAR(x, PCRE_UNGREEDY); |
419 |
|
} |
420 |
|
|
421 |
|
bool utf8() const { |
422 |
|
return PCRE_IS_SET(PCRE_UTF8); |
423 |
|
} |
424 |
|
RE_Options &set_utf8(bool x) { |
425 |
|
PCRE_SET_OR_CLEAR(x, PCRE_UTF8); |
426 |
|
} |
427 |
|
|
428 |
|
bool no_auto_capture() const { |
429 |
|
return PCRE_IS_SET(PCRE_NO_AUTO_CAPTURE); |
430 |
|
} |
431 |
|
RE_Options &set_no_auto_capture(bool x) { |
432 |
|
PCRE_SET_OR_CLEAR(x, PCRE_NO_AUTO_CAPTURE); |
433 |
|
} |
434 |
|
|
435 |
|
RE_Options &set_all_options(int opt) { |
436 |
|
all_options_ = opt; |
437 |
|
return *this; |
438 |
|
} |
439 |
|
int all_options() const { |
440 |
|
return all_options_ ; |
441 |
} |
} |
442 |
|
|
443 |
// TODO: add other pcre flags |
// TODO: add other pcre flags |
444 |
|
|
445 |
private: |
private: |
446 |
int match_limit_; |
int match_limit_; |
447 |
bool utf8_; |
int all_options_; |
448 |
}; |
}; |
449 |
|
|
450 |
// These functions return some common RE_Options |
// These functions return some common RE_Options |
451 |
static inline RE_Options UTF8() { |
static inline RE_Options UTF8() { |
452 |
RE_Options options; |
return RE_Options().set_utf8(true); |
453 |
options.set_utf8(true); |
} |
454 |
return options; |
|
455 |
|
static inline RE_Options CASELESS() { |
456 |
|
return RE_Options().set_caseless(true); |
457 |
|
} |
458 |
|
static inline RE_Options MULTILINE() { |
459 |
|
return RE_Options().set_multiline(true); |
460 |
} |
} |
461 |
|
|
462 |
|
static inline RE_Options DOTALL() { |
463 |
|
return RE_Options().set_dotall(true); |
464 |
|
} |
465 |
|
|
466 |
|
static inline RE_Options EXTENDED() { |
467 |
|
return RE_Options().set_extended(true); |
468 |
|
} |
469 |
|
|
470 |
// Interface for regular expression matching. Also corresponds to a |
// Interface for regular expression matching. Also corresponds to a |
471 |
// pre-compiled regular expression. An "RE" object is safe for |
// pre-compiled regular expression. An "RE" object is safe for |
776 |
MAKE_INTEGER_PARSER(unsigned long long, ulonglong); |
MAKE_INTEGER_PARSER(unsigned long long, ulonglong); |
777 |
#endif |
#endif |
778 |
|
|
779 |
|
#undef PCRE_IS_SET |
780 |
|
#undef PCRE_SET_OR_CLEAR |
781 |
#undef MAKE_INTEGER_PARSER |
#undef MAKE_INTEGER_PARSER |
782 |
|
|
783 |
} // namespace pcrecpp |
} // namespace pcrecpp |
784 |
|
|
785 |
|
|
786 |
#endif /* _PCRE_REGEXP_H */ |
#endif /* _PCRE_REGEXP_H */ |