529 |
/* All-ones value spanning the full bit width of one pcre_uchar, i.e.
2^(sizeof(pcre_uchar)*8) - 1: 255 for a 1-byte code unit, 65535 for a 2-byte
code unit.
NOTE(review): the shift is applied to a plain int constant, so the result is
only well defined while pcre_uchar is narrower than int - confirm before this
is used with a wider code unit. */
#define MAX_MARK ((1 << (sizeof(pcre_uchar)*8)) - 1) |
#define MAX_MARK ((1 << (sizeof(pcre_uchar)*8)) - 1) |
530 |
|
|
531 |
/* When UTF encoding is being used, a character is no longer just a single |
/* When UTF encoding is being used, a character is no longer just a single |
532 |
character. The macros for character handling generate simple sequences when |
byte. The macros for character handling generate simple sequences when used in |
533 |
used in character-mode, and more complicated ones for UTF characters. |
character-mode, and more complicated ones for UTF characters. GETCHARLENTEST |
534 |
GETCHARLENTEST and other macros are not used when UTF is not supported, |
and other macros are not used when UTF is not supported, so they are not |
535 |
so they are not defined. To make sure they can never even appear when |
defined. To make sure they can never even appear when UTF support is omitted, |
536 |
UTF support is omitted, we don't even define them. */ |
we don't even define them. */ |
537 |
|
|
538 |
#ifndef SUPPORT_UTF |
#ifndef SUPPORT_UTF |
539 |
|
|
832 |
#endif /* SUPPORT_UTF */ |
#endif /* SUPPORT_UTF */ |
833 |
|
|
834 |
|
|
835 |
|
/* Tests for Unicode horizontal and vertical whitespace characters must check a |
836 |
|
number of different values. Using a switch statement for this generates the |
837 |
|
fastest code (no loop, no memory access), and there are several places where |
838 |
|
this happens. In order to ensure that all the case lists remain in step, we use |
839 |
|
macros so that there is only one place where the lists are defined. |
840 |
|
|
841 |
|
NOTE: These values are also used explicitly in pcre_compile.c when processing |
842 |
|
\h, \H, \v and \V in a character class, so any changes here should be |
843 |
|
duplicated there as well. They also appear in pcre_jit_compile.c. */ |
844 |
|
|
845 |
|
/* Case-label lists for horizontal and vertical whitespace.

Each macro expands to a run of "case X:" labels with the colon after the LAST
label deliberately omitted, so that a use site is written as

    switch (c) { HSPACE_CASES: ... break; }

and the trailing colon is supplied at the point of use.

  xxx_BYTE_CASES       values that fit in a single byte
  xxx_MULTIBYTE_CASES  values above 0xff (empty in EBCDIC builds)
  xxx_CASES            the two lists combined

Because the multibyte lists are empty under EBCDIC, the combined lists must
not be built there as "BYTE_CASES: MULTIBYTE_CASES": that would leave a
dangling colon and "HSPACE_CASES:" would expand to "case CHAR_SPACE: :",
which is a syntax error. The combined lists are therefore defined separately
for each environment. For the same reason, a use site must not write
"HSPACE_MULTIBYTE_CASES:" or "VSPACE_MULTIBYTE_CASES:" unguarded in an EBCDIC
build. */

/* Vertical whitespace that fits in one byte; common to both environments. */

#define VSPACE_BYTE_CASES \
  case CHAR_LF: \
  case CHAR_VT: \
  case CHAR_FF: \
  case CHAR_CR: \
  case CHAR_NEL

#ifndef EBCDIC

/* ------ ASCII/Unicode environments ------ */

#define HSPACE_MULTIBYTE_CASES \
  case 0x1680:  /* OGHAM SPACE MARK */ \
  case 0x180e:  /* MONGOLIAN VOWEL SEPARATOR */ \
  case 0x2000:  /* EN QUAD */ \
  case 0x2001:  /* EM QUAD */ \
  case 0x2002:  /* EN SPACE */ \
  case 0x2003:  /* EM SPACE */ \
  case 0x2004:  /* THREE-PER-EM SPACE */ \
  case 0x2005:  /* FOUR-PER-EM SPACE */ \
  case 0x2006:  /* SIX-PER-EM SPACE */ \
  case 0x2007:  /* FIGURE SPACE */ \
  case 0x2008:  /* PUNCTUATION SPACE */ \
  case 0x2009:  /* THIN SPACE */ \
  case 0x200A:  /* HAIR SPACE */ \
  case 0x202f:  /* NARROW NO-BREAK SPACE */ \
  case 0x205f:  /* MEDIUM MATHEMATICAL SPACE */ \
  case 0x3000   /* IDEOGRAPHIC SPACE */

#define HSPACE_BYTE_CASES \
  case CHAR_HT: \
  case CHAR_SPACE: \
  case 0xa0     /* NBSP */

#define VSPACE_MULTIBYTE_CASES \
  case 0x2028:  /* LINE SEPARATOR */ \
  case 0x2029   /* PARAGRAPH SEPARATOR */

#define HSPACE_CASES \
  HSPACE_BYTE_CASES: \
  HSPACE_MULTIBYTE_CASES

#define VSPACE_CASES \
  VSPACE_BYTE_CASES: \
  VSPACE_MULTIBYTE_CASES

#else  /* EBCDIC */

/* ------ EBCDIC environments ------ */

/* No values above 0xff exist here; the lists are kept defined (empty) so that
existing references to the names still preprocess. */

#define HSPACE_MULTIBYTE_CASES
#define VSPACE_MULTIBYTE_CASES

#define HSPACE_BYTE_CASES \
  case CHAR_HT: \
  case CHAR_SPACE

/* With nothing to append, the combined lists are just the byte lists. */

#define HSPACE_CASES HSPACE_BYTE_CASES
#define VSPACE_CASES VSPACE_BYTE_CASES

#endif  /* EBCDIC */
896 |
|
|
897 |
/* In case there is no definition of offsetof() provided - though any proper |
/* In case there is no definition of offsetof() provided - though any proper |
898 |
Standard C system should have one. */ |
Standard C system should have one. */ |
899 |
|
|
1008 |
|
|
1009 |
/* UTF-8 support is not enabled; use the platform-dependent character literals |
/* UTF-8 support is not enabled; use the platform-dependent character literals |
1010 |
so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF |
so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF |
1011 |
mode. Newline characters are problematic in EBCDIC. Though it has CR and LF |
mode. Newline characters are problematic in EBCDIC. Though it has CR and LF |
1012 |
characters, a common practice has been to use its NL (0x15) character as the |
characters, a common practice has been to use its NL (0x15) character as the |
1013 |
line terminator in C-like processing environments. However, sometimes the LF |
line terminator in C-like processing environments. However, sometimes the LF |
1014 |
(0x25) character is used instead, according to this Unicode document: |
(0x25) character is used instead, according to this Unicode document: |
1015 |
|
|
1016 |
http://unicode.org/standard/reports/tr13/tr13-5.html |
http://unicode.org/standard/reports/tr13/tr13-5.html |
1017 |
|
|
1018 |
PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25 |
PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25 |
1019 |
instead. Whichever is *not* chosen is defined as NEL. |
instead. Whichever is *not* chosen is defined as NEL. |
1020 |
|
|
1021 |
In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the |
In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the |
1022 |
same code point. */ |
same code point. */ |
1045 |
|
|
1046 |
#else /* Not EBCDIC */ |
#else /* Not EBCDIC */ |
1047 |
|
|
1048 |
/* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for |
/* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for |
1049 |
compatibility. NEL is the Unicode newline character; make sure it is |
compatibility. NEL is the Unicode newline character; make sure it is |
1050 |
a positive value. */ |
a positive value. */ |
1051 |
|
|
2145 |
int external_flags; /* External flag bits to be set */ |
int external_flags; /* External flag bits to be set */ |
2146 |
int req_varyopt; /* "After variable item" flag for reqbyte */ |
int req_varyopt; /* "After variable item" flag for reqbyte */ |
2147 |
BOOL had_accept; /* (*ACCEPT) encountered */ |
BOOL had_accept; /* (*ACCEPT) encountered */ |
2148 |
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ |
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ |
2149 |
BOOL check_lookbehind; /* Lookbehinds need later checking */ |
BOOL check_lookbehind; /* Lookbehinds need later checking */ |
2150 |
int nltype; /* Newline type */ |
int nltype; /* Newline type */ |
2151 |
int nllen; /* Newline string length */ |
int nllen; /* Newline string length */ |