/[pcre]/code/trunk/pcre_internal.h
ViewVC logotype

Diff of /code/trunk/pcre_internal.h

Parent Directory | Revision Log | View Patch

revision 975 by ph10, Sat Jun 2 11:03:06 2012 UTC | revision 1046 by ph10, Tue Sep 25 16:27:58 2012 UTC
# Line 529  changed in future to be a fixed number o Line 529  changed in future to be a fixed number o
529  #define MAX_MARK ((1 << (sizeof(pcre_uchar)*8)) - 1)  #define MAX_MARK ((1 << (sizeof(pcre_uchar)*8)) - 1)
530    
531  /* When UTF encoding is being used, a character is no longer just a single  /* When UTF encoding is being used, a character is no longer just a single
532  character. The macros for character handling generate simple sequences when  byte. The macros for character handling generate simple sequences when used in
533  used in character-mode, and more complicated ones for UTF characters.  character-mode, and more complicated ones for UTF characters. GETCHARLENTEST
534  GETCHARLENTEST and other macros are not used when UTF is not supported,  and other macros are not used when UTF is not supported, so they are not
535  so they are not defined. To make sure they can never even appear when  defined. To make sure they can never even appear when UTF support is omitted,
536  UTF support is omitted, we don't even define them. */  we don't even define them. */
537    
538  #ifndef SUPPORT_UTF  #ifndef SUPPORT_UTF
539    
# Line 832  code. */ Line 832  code. */
832  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
833    
834    
835    /* Tests for Unicode horizontal and vertical whitespace characters must check a
836    number of different values. Using a switch statement for this generates the
837    fastest code (no loop, no memory access), and there are several places in the
838    interpreter code where this happens. In order to ensure that all the case lists
839    remain in step, we use macros so that there is only one place where the lists
840    are defined.
841    
842    These values are also required as lists in pcre_compile.c when processing \h,
843    \H, \v and \V in a character class. The lists are defined in pcre_tables.c, but
844    macros that define the values are here so that all the definitions are
845    together. The lists must be in ascending character order, terminated by
846    NOTACHAR (which is 0xffffffff).
847    
848    Any changes should ensure that the various macros are kept in step with each
849    other. NOTE: The values also appear in pcre_jit_compile.c. */
850    
851    /* ------ ASCII/Unicode environments ------ */
852    
853    #ifndef EBCDIC
854    
855    #define HSPACE_LIST \
856      CHAR_HT, CHAR_SPACE, 0xa0, \
857      0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
858      0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \
859      NOTACHAR
860    
861    #define HSPACE_MULTIBYTE_CASES \
862      case 0x1680:  /* OGHAM SPACE MARK */ \
863      case 0x180e:  /* MONGOLIAN VOWEL SEPARATOR */ \
864      case 0x2000:  /* EN QUAD */ \
865      case 0x2001:  /* EM QUAD */ \
866      case 0x2002:  /* EN SPACE */ \
867      case 0x2003:  /* EM SPACE */ \
868      case 0x2004:  /* THREE-PER-EM SPACE */ \
869      case 0x2005:  /* FOUR-PER-EM SPACE */ \
870      case 0x2006:  /* SIX-PER-EM SPACE */ \
871      case 0x2007:  /* FIGURE SPACE */ \
872      case 0x2008:  /* PUNCTUATION SPACE */ \
873      case 0x2009:  /* THIN SPACE */ \
874      case 0x200A:  /* HAIR SPACE */ \
875      case 0x202f:  /* NARROW NO-BREAK SPACE */ \
876      case 0x205f:  /* MEDIUM MATHEMATICAL SPACE */ \
877      case 0x3000   /* IDEOGRAPHIC SPACE */
878    
879    #define HSPACE_BYTE_CASES \
880      case CHAR_HT: \
881      case CHAR_SPACE: \
882      case 0xa0     /* NBSP */
883    
884    #define HSPACE_CASES \
885      HSPACE_BYTE_CASES: \
886      HSPACE_MULTIBYTE_CASES
887    
888    #define VSPACE_LIST \
889      CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, 0x2028, 0x2029, NOTACHAR
890    
891    #define VSPACE_MULTIBYTE_CASES \
892      case 0x2028:    /* LINE SEPARATOR */ \
893      case 0x2029     /* PARAGRAPH SEPARATOR */
894    
895    #define VSPACE_BYTE_CASES \
896      case CHAR_LF: \
897      case CHAR_VT: \
898      case CHAR_FF: \
899      case CHAR_CR: \
900      case CHAR_NEL
901    
902    #define VSPACE_CASES \
903      VSPACE_BYTE_CASES: \
904      VSPACE_MULTIBYTE_CASES
905    
906    /* ------ EBCDIC environments ------ */
907    
908    #else
909    #define HSPACE_LIST CHAR_HT, CHAR_SPACE
910    
911    #define HSPACE_BYTE_CASES \
912      case CHAR_HT: \
913      case CHAR_SPACE
914    
915    #define HSPACE_CASES HSPACE_BYTE_CASES
916    
917    #ifdef EBCDIC_NL25
918    #define VSPACE_LIST \
919      CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, CHAR_LF, NOTACHAR
920    #else
921    #define VSPACE_LIST \
922      CHAR_VT, CHAR_FF, CHAR_CR, CHAR_LF, CHAR_NEL, NOTACHAR
923    #endif
924    
925    #define VSPACE_BYTE_CASES \
926      case CHAR_LF: \
927      case CHAR_VT: \
928      case CHAR_FF: \
929      case CHAR_CR: \
930      case CHAR_NEL
931    
932    #define VSPACE_CASES VSPACE_BYTE_CASES
933    #endif  /* EBCDIC */
934    
935    /* ------ End of whitespace macros ------ */
936    
937    
938  /* In case there is no definition of offsetof() provided - though any proper  /* In case there is no definition of offsetof() provided - though any proper
939  Standard C system should have one. */  Standard C system should have one. */
940    
# Line 893  time, run time, or study time, respectiv Line 996  time, run time, or study time, respectiv
996    
997  #define PUBLIC_STUDY_OPTIONS \  #define PUBLIC_STUDY_OPTIONS \
998     (PCRE_STUDY_JIT_COMPILE|PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE| \     (PCRE_STUDY_JIT_COMPILE|PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE| \
999      PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE)      PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE|PCRE_STUDY_EXTRA_NEEDED)
1000    
1001  /* Magic number to provide a small check against being handed junk. */  /* Magic number to provide a small check against being handed junk. */
1002    
# Line 945  macros to give the functions distinct na Line 1048  macros to give the functions distinct na
1048  #ifndef SUPPORT_UTF  #ifndef SUPPORT_UTF
1049    
1050  /* UTF-8 support is not enabled; use the platform-dependent character literals  /* UTF-8 support is not enabled; use the platform-dependent character literals
1051  so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */  so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF
1052    mode. Newline characters are problematic in EBCDIC. Though it has CR and LF
1053    characters, a common practice has been to use its NL (0x15) character as the
1054    line terminator in C-like processing environments. However, sometimes the LF
1055    (0x25) character is used instead, according to this Unicode document:
1056    
1057    http://unicode.org/standard/reports/tr13/tr13-5.html
1058    
1059    PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25
1060    instead. Whichever is *not* chosen is defined as NEL.
1061    
1062    In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the
1063    same code point. */
1064    
1065    #ifdef EBCDIC
1066    
1067    #ifndef EBCDIC_NL25
1068    #define CHAR_NL                     '\x15'
1069    #define CHAR_NEL                    '\x25'
1070    #define STR_NL                      "\x15"
1071    #define STR_NEL                     "\x25"
1072    #else
1073    #define CHAR_NL                     '\x25'
1074    #define CHAR_NEL                    '\x15'
1075    #define STR_NL                      "\x25"
1076    #define STR_NEL                     "\x15"
1077    #endif
1078    
1079    #define CHAR_LF                     CHAR_NL
1080    #define STR_LF                      STR_NL
1081    
1082    #define CHAR_ESC                    '\047'
1083    #define CHAR_DEL                    '\007'
1084    #define STR_ESC                     "\047"
1085    #define STR_DEL                     "\007"
1086    
1087    #else  /* Not EBCDIC */
1088    
1089    /* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for
1090    compatibility. NEL is the Unicode newline character; make sure it is
1091    a positive value. */
1092    
1093    #define CHAR_LF                     '\n'
1094    #define CHAR_NL                     CHAR_LF
1095    #define CHAR_NEL                    ((unsigned char)'\x85')
1096    #define CHAR_ESC                    '\033'
1097    #define CHAR_DEL                    '\177'
1098    
1099    #define STR_LF                      "\n"
1100    #define STR_NL                      STR_LF
1101    #define STR_NEL                     "\x85"
1102    #define STR_ESC                     "\033"
1103    #define STR_DEL                     "\177"
1104    
1105    #endif  /* EBCDIC */
1106    
1107    /* The remaining definitions work in both environments. */
1108    
1109  #define CHAR_HT                     '\t'  #define CHAR_HT                     '\t'
1110  #define CHAR_VT                     '\v'  #define CHAR_VT                     '\v'
1111  #define CHAR_FF                     '\f'  #define CHAR_FF                     '\f'
1112  #define CHAR_CR                     '\r'  #define CHAR_CR                     '\r'
 #define CHAR_NL                     '\n'  
1113  #define CHAR_BS                     '\b'  #define CHAR_BS                     '\b'
1114  #define CHAR_BEL                    '\a'  #define CHAR_BEL                    '\a'
 #ifdef EBCDIC  
 #define CHAR_ESC                    '\047'  
 #define CHAR_DEL                    '\007'  
 #else  
 #define CHAR_ESC                    '\033'  
 #define CHAR_DEL                    '\177'  
 #endif  
1115    
1116  #define CHAR_SPACE                  ' '  #define CHAR_SPACE                  ' '
1117  #define CHAR_EXCLAMATION_MARK       '!'  #define CHAR_EXCLAMATION_MARK       '!'
# Line 1062  so that PCRE works on both ASCII and EBC Line 1213  so that PCRE works on both ASCII and EBC
1213  #define STR_VT                      "\v"  #define STR_VT                      "\v"
1214  #define STR_FF                      "\f"  #define STR_FF                      "\f"
1215  #define STR_CR                      "\r"  #define STR_CR                      "\r"
 #define STR_NL                      "\n"  
1216  #define STR_BS                      "\b"  #define STR_BS                      "\b"
1217  #define STR_BEL                     "\a"  #define STR_BEL                     "\a"
 #ifdef EBCDIC  
 #define STR_ESC                     "\047"  
 #define STR_DEL                     "\007"  
 #else  
 #define STR_ESC                     "\033"  
 #define STR_DEL                     "\177"  
 #endif  
1218    
1219  #define STR_SPACE                   " "  #define STR_SPACE                   " "
1220  #define STR_EXCLAMATION_MARK        "!"  #define STR_EXCLAMATION_MARK        "!"
# Line 1221  only. */ Line 1364  only. */
1364  #define CHAR_VT                     '\013'  #define CHAR_VT                     '\013'
1365  #define CHAR_FF                     '\014'  #define CHAR_FF                     '\014'
1366  #define CHAR_CR                     '\015'  #define CHAR_CR                     '\015'
1367  #define CHAR_NL                     '\012'  #define CHAR_LF                     '\012'
1368    #define CHAR_NL                     CHAR_LF
1369    #define CHAR_NEL                    ((unsigned char)'\x85')
1370  #define CHAR_BS                     '\010'  #define CHAR_BS                     '\010'
1371  #define CHAR_BEL                    '\007'  #define CHAR_BEL                    '\007'
1372  #define CHAR_ESC                    '\033'  #define CHAR_ESC                    '\033'
# Line 1484  only. */ Line 1629  only. */
1629  #endif  #endif
1630    
1631  #ifndef ESC_n  #ifndef ESC_n
1632  #define ESC_n CHAR_NL  #define ESC_n CHAR_LF
1633  #endif  #endif
1634    
1635  #ifndef ESC_r  #ifndef ESC_r
# Line 1509  only. */ Line 1654  only. */
1654  #define PT_SPACE      6    /* Perl space - Z plus 9,10,12,13 */  #define PT_SPACE      6    /* Perl space - Z plus 9,10,12,13 */
1655  #define PT_PXSPACE    7    /* POSIX space - Z plus 9,10,11,12,13 */  #define PT_PXSPACE    7    /* POSIX space - Z plus 9,10,11,12,13 */
1656  #define PT_WORD       8    /* Word - L plus N plus underscore */  #define PT_WORD       8    /* Word - L plus N plus underscore */
1657    #define PT_CLIST      9    /* Pseudo-property: match character list */
1658    
1659  /* Flag bits and data types for the extended class (OP_XCLASS) for classes that  /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
1660  contain characters with values greater than 255. */  contain characters with values greater than 255. */
# Line 1531  used for [^] in JavaScript compatibility Line 1677  used for [^] in JavaScript compatibility
1677  non-DOTALL mode, "." behaves like \N.  non-DOTALL mode, "." behaves like \N.
1678    
1679  The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.  The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
1680  when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.  when PCRE_UCP is set and replacement of \d etc by \p sequences is required.
1681  They must be contiguous, and remain in order so that the replacements can be  They must be contiguous, and remain in order so that the replacements can be
1682  looked up from a table.  looked up from a table.
1683    
# Line 1573  enum { Line 1719  enum {
1719    OP_NOT_WORDCHAR,       /* 10 \W */    OP_NOT_WORDCHAR,       /* 10 \W */
1720    OP_WORDCHAR,           /* 11 \w */    OP_WORDCHAR,           /* 11 \w */
1721    
1722    OP_ANY,            /* 12 Match any character except newline */    OP_ANY,            /* 12 Match any character except newline (\N) */
1723    OP_ALLANY,         /* 13 Match any character */    OP_ALLANY,         /* 13 Match any character */
1724    OP_ANYBYTE,        /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */    OP_ANYBYTE,        /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
1725    OP_NOTPROP,        /* 15 \P (not Unicode property) */    OP_NOTPROP,        /* 15 \P (not Unicode property) */
# Line 1584  enum { Line 1730  enum {
1730    OP_NOT_VSPACE,     /* 20 \V (not vertical whitespace) */    OP_NOT_VSPACE,     /* 20 \V (not vertical whitespace) */
1731    OP_VSPACE,         /* 21 \v (vertical whitespace) */    OP_VSPACE,         /* 21 \v (vertical whitespace) */
1732    OP_EXTUNI,         /* 22 \X (extended Unicode sequence) */    OP_EXTUNI,         /* 22 \X (extended Unicode sequence) */
1733    OP_EODN,           /* 23 End of data or \n at end of data: \Z. */    OP_EODN,           /* 23 End of data or \n at end of data (\Z) */
1734    OP_EOD,            /* 24 End of data: \z */    OP_EOD,            /* 24 End of data (\z) */
1735    
1736    OP_CIRC,           /* 25 Start of line - not multiline */    OP_CIRC,           /* 25 Start of line - not multiline */
1737    OP_CIRCM,          /* 26 Start of line - multiline */    OP_CIRCM,          /* 26 Start of line - multiline */
# Line 1945  enum { ERR0,  ERR1,  ERR2,  ERR3,  ERR4, Line 2091  enum { ERR0,  ERR1,  ERR2,  ERR3,  ERR4,
2091         ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,         ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
2092         ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,         ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
2093         ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,         ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
2094         ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERRCOUNT };         ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERRCOUNT };
2095    
2096  /* JIT compiling modes. The function list is indexed by them. */  /* JIT compiling modes. The function list is indexed by them. */
2097  enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PARTIAL_HARD_COMPILE,  enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PARTIAL_HARD_COMPILE,
# Line 2041  typedef struct compile_data { Line 2187  typedef struct compile_data {
2187    int  external_flags;              /* External flag bits to be set */    int  external_flags;              /* External flag bits to be set */
2188    int  req_varyopt;                 /* "After variable item" flag for reqbyte */    int  req_varyopt;                 /* "After variable item" flag for reqbyte */
2189    BOOL had_accept;                  /* (*ACCEPT) encountered */    BOOL had_accept;                  /* (*ACCEPT) encountered */
2190      BOOL had_pruneorskip;             /* (*PRUNE) or (*SKIP) encountered */
2191    BOOL check_lookbehind;            /* Lookbehinds need later checking */    BOOL check_lookbehind;            /* Lookbehinds need later checking */
2192    int  nltype;                      /* Newline type */    int  nltype;                      /* Newline type */
2193    int  nllen;                       /* Newline string length */    int  nllen;                       /* Newline string length */
# Line 2232  but are not part of the PCRE public API. Line 2379  but are not part of the PCRE public API.
2379  pcre_tables.c module. */  pcre_tables.c module. */
2380    
2381  #ifdef COMPILE_PCRE8  #ifdef COMPILE_PCRE8
   
2382  extern const int            PRIV(utf8_table1)[];  extern const int            PRIV(utf8_table1)[];
2383  extern const int            PRIV(utf8_table1_size);  extern const int            PRIV(utf8_table1_size);
2384  extern const int            PRIV(utf8_table2)[];  extern const int            PRIV(utf8_table2)[];
2385  extern const int            PRIV(utf8_table3)[];  extern const int            PRIV(utf8_table3)[];
2386  extern const pcre_uint8     PRIV(utf8_table4)[];  extern const pcre_uint8     PRIV(utf8_table4)[];
   
2387  #endif /* COMPILE_PCRE8 */  #endif /* COMPILE_PCRE8 */
2388    
2389  extern const char           PRIV(utt_names)[];  extern const char           PRIV(utt_names)[];
2390  extern const ucp_type_table PRIV(utt)[];  extern const ucp_type_table PRIV(utt)[];
2391  extern const int            PRIV(utt_size);  extern const int            PRIV(utt_size);
2392    
2393    extern const pcre_uint8     PRIV(OP_lengths)[];
2394  extern const pcre_uint8     PRIV(default_tables)[];  extern const pcre_uint8     PRIV(default_tables)[];
2395    
2396  extern const pcre_uint8     PRIV(OP_lengths)[];  extern const pcre_uint32    PRIV(hspace_list)[];
2397    extern const pcre_uint32    PRIV(vspace_list)[];
2398    
2399    
2400  /* Internal shared functions. These are functions that are used by more than  /* Internal shared functions. These are functions that are used by more than
# Line 2313  extern const char*       PRIV(jit_get_ta Line 2460  extern const char*       PRIV(jit_get_ta
2460  /* Unicode character database (UCD) */  /* Unicode character database (UCD) */
2461    
2462  typedef struct {  typedef struct {
2463    pcre_uint8 script;    pcre_uint8 script;     /* ucp_Arabic, etc. */
2464    pcre_uint8 chartype;    pcre_uint8 chartype;   /* ucp_Cc, etc. (general categories) */
2465    pcre_int32 other_case;    pcre_uint8 gbprop;     /* ucp_gbControl, etc. (grapheme break property) */
2466      pcre_uint8 caseset;    /* offset to multichar other cases or zero */
2467      pcre_int32 other_case; /* offset to other case, or zero if none */
2468  } ucd_record;  } ucd_record;
2469    
2470    extern const pcre_uint32 PRIV(ucd_caseless_sets)[];
2471  extern const ucd_record  PRIV(ucd_records)[];  extern const ucd_record  PRIV(ucd_records)[];
2472  extern const pcre_uint8  PRIV(ucd_stage1)[];  extern const pcre_uint8  PRIV(ucd_stage1)[];
2473  extern const pcre_uint16 PRIV(ucd_stage2)[];  extern const pcre_uint16 PRIV(ucd_stage2)[];
2474  extern const int         PRIV(ucp_gentype)[];  extern const int         PRIV(ucp_gentype)[];
2475    extern const pcre_uint32 PRIV(ucp_gbtable)[];
2476  #ifdef SUPPORT_JIT  #ifdef SUPPORT_JIT
2477  extern const int         PRIV(ucp_typerange)[];  extern const int         PRIV(ucp_typerange)[];
2478  #endif  #endif
# Line 2334  extern const int         PRIV(ucp_typera Line 2485  extern const int         PRIV(ucp_typera
2485          PRIV(ucd_stage2)[PRIV(ucd_stage1)[(ch) / UCD_BLOCK_SIZE] * \          PRIV(ucd_stage2)[PRIV(ucd_stage1)[(ch) / UCD_BLOCK_SIZE] * \
2486          UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE])          UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE])
2487    
2488  #define UCD_CHARTYPE(ch)  GET_UCD(ch)->chartype  #define UCD_CHARTYPE(ch)    GET_UCD(ch)->chartype
2489  #define UCD_SCRIPT(ch)    GET_UCD(ch)->script  #define UCD_SCRIPT(ch)      GET_UCD(ch)->script
2490  #define UCD_CATEGORY(ch)  PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]  #define UCD_CATEGORY(ch)    PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
2491  #define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case)  #define UCD_GRAPHBREAK(ch)  GET_UCD(ch)->gbprop
2492    #define UCD_CASESET(ch)     GET_UCD(ch)->caseset
2493    #define UCD_OTHERCASE(ch)   (ch + GET_UCD(ch)->other_case)
2494    
2495  #endif /* SUPPORT_UCP */  #endif /* SUPPORT_UCP */
2496    

Legend:
Removed from v.975  
changed lines
  Added in v.1046

  ViewVC Help
Powered by ViewVC 1.1.5