6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
7 |
|
|
8 |
Written by Philip Hazel |
Written by Philip Hazel |
9 |
Copyright (c) 1997-2007 University of Cambridge |
Copyright (c) 1997-2012 University of Cambridge |
10 |
|
|
11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
38 |
*/ |
*/ |
39 |
|
|
40 |
|
|
41 |
/* This module contains internal functions for testing newlines when more than |
/* This module contains internal functions for testing newlines when more than |
42 |
one kind of newline is to be recognized. When a newline is found, its length is |
one kind of newline is to be recognized. When a newline is found, its length is |
43 |
returned. In principle, we could implement several newline "types", each |
returned. In principle, we could implement several newline "types", each |
44 |
referring to a different set of newline characters. At present, PCRE supports |
referring to a different set of newline characters. At present, PCRE supports |
45 |
only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF, |
only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF, |
46 |
and NLTYPE_ANY. The full list of Unicode newline characters is taken from |
and NLTYPE_ANY. The full list of Unicode newline characters is taken from |
47 |
http://unicode.org/unicode/reports/tr18/. */ |
http://unicode.org/unicode/reports/tr18/. */ |
48 |
|
|
49 |
|
|
50 |
|
#ifdef HAVE_CONFIG_H |
51 |
|
#include "config.h" |
52 |
|
#endif |
53 |
|
|
54 |
#include "pcre_internal.h" |
#include "pcre_internal.h" |
55 |
|
|
56 |
|
|
60 |
*************************************************/ |
*************************************************/ |
61 |
|
|
62 |
/* It is guaranteed that the initial value of ptr is less than the end of the |
/* It is guaranteed that the initial value of ptr is less than the end of the |
63 |
string that is being processed. |
string that is being processed. |
64 |
|
|
65 |
Arguments: |
Arguments: |
66 |
ptr pointer to possible newline |
ptr pointer to possible newline |
67 |
type the newline type |
type the newline type |
68 |
endptr pointer to the end of the string |
endptr pointer to the end of the string |
69 |
lenptr where to return the length |
lenptr where to return the length |
70 |
utf8 TRUE if in utf8 mode |
utf TRUE if in utf mode |
71 |
|
|
72 |
Returns: TRUE or FALSE |
Returns: TRUE or FALSE |
73 |
*/ |
*/ |
74 |
|
|
75 |
BOOL |
BOOL |
76 |
_pcre_is_newline(const uschar *ptr, int type, const uschar *endptr, |
PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr, |
77 |
int *lenptr, BOOL utf8) |
BOOL utf) |
78 |
{ |
{ |
79 |
int c; |
int c; |
80 |
if (utf8) { GETCHAR(c, ptr); } else c = *ptr; |
(void)utf; |
81 |
|
#ifdef SUPPORT_UTF |
82 |
|
if (utf) |
83 |
|
{ |
84 |
|
GETCHAR(c, ptr); |
85 |
|
} |
86 |
|
else |
87 |
|
#endif /* SUPPORT_UTF */ |
88 |
|
c = *ptr; |
89 |
|
|
90 |
|
/* Note that this function is called only for ANY or ANYCRLF. */ |
91 |
|
|
92 |
if (type == NLTYPE_ANYCRLF) switch(c) |
if (type == NLTYPE_ANYCRLF) switch(c) |
93 |
{ |
{ |
94 |
case 0x000a: *lenptr = 1; return TRUE; /* LF */ |
case CHAR_LF: *lenptr = 1; return TRUE; |
95 |
case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; |
case CHAR_CR: *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1; |
96 |
return TRUE; /* CR */ |
return TRUE; |
97 |
default: return FALSE; |
default: return FALSE; |
98 |
} |
} |
99 |
|
|
101 |
|
|
102 |
else switch(c) |
else switch(c) |
103 |
{ |
{ |
104 |
case 0x000a: /* LF */ |
#ifdef EBCDIC |
105 |
case 0x000b: /* VT */ |
case CHAR_NEL: |
106 |
case 0x000c: *lenptr = 1; return TRUE; /* FF */ |
#endif |
107 |
case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; |
case CHAR_LF: |
108 |
return TRUE; /* CR */ |
case CHAR_VT: |
109 |
case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ |
case CHAR_FF: *lenptr = 1; return TRUE; |
110 |
|
|
111 |
|
case CHAR_CR: |
112 |
|
*lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1; |
113 |
|
return TRUE; |
114 |
|
|
115 |
|
#ifndef EBCDIC |
116 |
|
#ifdef COMPILE_PCRE8 |
117 |
|
case CHAR_NEL: *lenptr = utf? 2 : 1; return TRUE; |
118 |
case 0x2028: /* LS */ |
case 0x2028: /* LS */ |
119 |
case 0x2029: *lenptr = 3; return TRUE; /* PS */ |
case 0x2029: *lenptr = 3; return TRUE; /* PS */ |
120 |
|
#else /* 16-bit (can't be EBCDIC) */ |
121 |
|
case CHAR_NEL: |
122 |
|
case 0x2028: /* LS */ |
123 |
|
case 0x2029: *lenptr = 1; return TRUE; /* PS */ |
124 |
|
#endif /* COMPILE_PCRE8 */ |
125 |
|
#endif /* Not EBCDIC */ |
126 |
|
|
127 |
default: return FALSE; |
default: return FALSE; |
128 |
} |
} |
129 |
} |
} |
142 |
type the newline type |
type the newline type |
143 |
startptr pointer to the start of the string |
startptr pointer to the start of the string |
144 |
lenptr where to return the length |
lenptr where to return the length |
145 |
utf8 TRUE if in utf8 mode |
utf TRUE if in utf mode |
146 |
|
|
147 |
Returns: TRUE or FALSE |
Returns: TRUE or FALSE |
148 |
*/ |
*/ |
149 |
|
|
150 |
BOOL |
BOOL |
151 |
_pcre_was_newline(const uschar *ptr, int type, const uschar *startptr, |
PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr, |
152 |
int *lenptr, BOOL utf8) |
BOOL utf) |
153 |
{ |
{ |
154 |
int c; |
int c; |
155 |
|
(void)utf; |
156 |
ptr--; |
ptr--; |
157 |
if (utf8) |
#ifdef SUPPORT_UTF |
158 |
|
if (utf) |
159 |
{ |
{ |
160 |
BACKCHAR(ptr); |
BACKCHAR(ptr); |
161 |
GETCHAR(c, ptr); |
GETCHAR(c, ptr); |
162 |
} |
} |
163 |
else c = *ptr; |
else |
164 |
|
#endif /* SUPPORT_UTF */ |
165 |
|
c = *ptr; |
166 |
|
|
167 |
|
/* Note that this function is called only for ANY or ANYCRLF. */ |
168 |
|
|
169 |
if (type == NLTYPE_ANYCRLF) switch(c) |
if (type == NLTYPE_ANYCRLF) switch(c) |
170 |
{ |
{ |
171 |
case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; |
case CHAR_LF: |
172 |
return TRUE; /* LF */ |
*lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1; |
173 |
case 0x000d: *lenptr = 1; return TRUE; /* CR */ |
return TRUE; |
174 |
|
|
175 |
|
case CHAR_CR: *lenptr = 1; return TRUE; |
176 |
default: return FALSE; |
default: return FALSE; |
177 |
} |
} |
178 |
|
|
179 |
|
/* NLTYPE_ANY */ |
180 |
|
|
181 |
else switch(c) |
else switch(c) |
182 |
{ |
{ |
183 |
case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; |
case CHAR_LF: |
184 |
return TRUE; /* LF */ |
*lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1; |
185 |
case 0x000b: /* VT */ |
return TRUE; |
186 |
case 0x000c: /* FF */ |
|
187 |
case 0x000d: *lenptr = 1; return TRUE; /* CR */ |
#ifdef EBCDIC |
188 |
case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ |
case CHAR_NEL: |
189 |
case 0x2028: /* LS */ |
#endif |
190 |
case 0x2029: *lenptr = 3; return TRUE; /* PS */ |
case CHAR_VT: |
191 |
|
case CHAR_FF: |
192 |
|
case CHAR_CR: *lenptr = 1; return TRUE; |
193 |
|
|
194 |
|
#ifndef EBCDIC |
195 |
|
#ifdef COMPILE_PCRE8 |
196 |
|
case CHAR_NEL: *lenptr = utf? 2 : 1; return TRUE; |
197 |
|
case 0x2028: /* LS */ |
198 |
|
case 0x2029: *lenptr = 3; return TRUE; /* PS */ |
199 |
|
#else |
200 |
|
case CHAR_NEL: |
201 |
|
case 0x2028: /* LS */ |
202 |
|
case 0x2029: *lenptr = 1; return TRUE; /* PS */ |
203 |
|
#endif /* COMPILE_PCRE8 */ |
204 |
|
#endif /* NotEBCDIC */ |
205 |
|
|
206 |
default: return FALSE; |
default: return FALSE; |
207 |
} |
} |
208 |
} |
} |