59 
can be turned off for maximum performance, but the consequences of supplying 
can be turned off for maximum performance, but the consequences of supplying 
60 
an invalid string are then undefined. 
an invalid string are then undefined. 
61 


62 

Originally, this function checked according to RFC 2279, allowing for values in 
63 

the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in 
64 

the canonical format. Once somebody had pointed out RFC 3629 to me (it 
65 

obsoletes 2279), additional restrictions were applied. The values are now 
66 

limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the 
67 

subrange 0xd800 to 0xdfff (the UTF-16 surrogate code points) is excluded. 
68 


69 
Arguments: 
Arguments: 
70 
string points to the string 
string points to the string 
71 
length length of string, or -1 if the string is zero-terminated 
length length of string, or -1 if the string is zero-terminated 
92 
register int c = *p; 
register int c = *p; 
93 
if (c < 128) continue; 
if (c < 128) continue; 
94 
if (c < 0xc0) return p  string; 
if (c < 0xc0) return p  string; 
95 
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ 
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ 
96 
if (length < ab) return p  string; 
if (length < ab  ab > 3) return p  string; 
97 
length = ab; 
length = ab; 
98 


99 
/* Check top bits in the second byte */ 
/* Check top bits in the second byte */ 
100 
if ((*(++p) & 0xc0) != 0x80) return p  string; 
if ((*(++p) & 0xc0) != 0x80) return p  string; 
101 


102 
/* Check for overlong sequences for each different length */ 
/* Check for overlong sequences for each different length, and for the 
103 

excluded range 0xd800 to 0xdfff. */ 
104 


105 
switch (ab) 
switch (ab) 
106 
{ 
{ 
107 
/* Check for xx00 000x */ 
/* Check for xx00 000x (overlong sequence) */ 
108 


109 
case 1: 
case 1: 
110 
if ((c & 0x3e) == 0) return p  string; 
if ((c & 0x3e) == 0) return p  string; 
111 
continue; /* We know there aren't any more bytes to check */ 
continue; /* We know there aren't any more bytes to check */ 
112 


113 
/* Check for 1110 0000, xx0x xxxx */ 
/* Check for 1110 0000, xx0x xxxx (overlong sequence) or 
114 

1110 1101, 101x xxxx (0xd800 - 0xdfff) */ 
115 


116 
case 2: 
case 2: 
117 
if (c == 0xe0 && (*p & 0x20) == 0) return p  string; 
if ((c == 0xe0 && (*p & 0x20) == 0)  
118 

(c == 0xed && *p >= 0xa0)) 
119 

return p  string; 
120 
break; 
break; 
121 


122 
/* Check for 1111 0000, xx00 xxxx */ 
/* Check for 1111 0000, xx00 xxxx (overlong sequence) or 
123 

greater than 0x0010ffff (f4 8f bf bf) */ 
124 


125 
case 3: 
case 3: 
126 
if (c == 0xf0 && (*p & 0x30) == 0) return p  string; 
if ((c == 0xf0 && (*p & 0x30) == 0)  
127 

(c > 0xf4 )  
128 

(c == 0xf4 && *p > 0x8f)) 
129 

return p  string; 
130 
break; 
break; 
131 


132 

#if 0 
133 

/* These cases can no longer occur, as we restrict to a maximum of four 
134 

bytes nowadays. Leave the code here in case we ever want to add an option 
135 

for longer sequences. */ 
136 


137 
/* Check for 1111 1000, xx00 0xxx */ 
/* Check for 1111 1000, xx00 0xxx */ 
138 
case 4: 
case 4: 
139 
if (c == 0xf8 && (*p & 0x38) == 0) return p  string; 
if (c == 0xf8 && (*p & 0x38) == 0) return p  string; 
144 
if (c == 0xfe  c == 0xff  
if (c == 0xfe  c == 0xff  
145 
(c == 0xfc && (*p & 0x3c) == 0)) return p  string; 
(c == 0xfc && (*p & 0x3c) == 0)) return p  string; 
146 
break; 
break; 
147 

#endif 
148 


149 
} 
} 
150 


151 
/* Check for valid bytes after the 2nd, if any; all must start 10 */ 
/* Check for valid bytes after the 2nd, if any; all must start 10 */ 