1 |
<html>
|
2 |
<head>
|
3 |
<title>pcre16 specification</title>
|
4 |
</head>
|
5 |
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
6 |
<h1>pcre16 man page</h1>
|
7 |
<p>
|
8 |
Return to the <a href="index.html">PCRE index page</a>.
|
9 |
</p>
|
10 |
<p>
|
11 |
This page is part of the PCRE HTML documentation. It was generated automatically
|
12 |
from the original man page. If there is any nonsense in it, please consult the
|
13 |
man page, in case the conversion went wrong.
|
14 |
<br>
|
15 |
<ul>
|
16 |
<li><a name="TOC1" href="#SEC1">PCRE 16-BIT API BASIC FUNCTIONS</a>
|
17 |
<li><a name="TOC2" href="#SEC2">PCRE 16-BIT API STRING EXTRACTION FUNCTIONS</a>
|
18 |
<li><a name="TOC3" href="#SEC3">PCRE 16-BIT API AUXILIARY FUNCTIONS</a>
|
19 |
<li><a name="TOC4" href="#SEC4">PCRE 16-BIT API INDIRECTED FUNCTIONS</a>
|
20 |
<li><a name="TOC5" href="#SEC5">PCRE 16-BIT API 16-BIT-ONLY FUNCTION</a>
|
21 |
<li><a name="TOC6" href="#SEC6">THE PCRE 16-BIT LIBRARY</a>
|
22 |
<li><a name="TOC7" href="#SEC7">THE HEADER FILE</a>
|
23 |
<li><a name="TOC8" href="#SEC8">THE LIBRARY NAME</a>
|
24 |
<li><a name="TOC9" href="#SEC9">STRING TYPES</a>
|
25 |
<li><a name="TOC10" href="#SEC10">STRUCTURE TYPES</a>
|
26 |
<li><a name="TOC11" href="#SEC11">16-BIT FUNCTIONS</a>
|
27 |
<li><a name="TOC12" href="#SEC12">SUBJECT STRING OFFSETS</a>
|
28 |
<li><a name="TOC13" href="#SEC13">NAMED SUBPATTERNS</a>
|
29 |
<li><a name="TOC14" href="#SEC14">OPTION NAMES</a>
|
30 |
<li><a name="TOC15" href="#SEC15">CHARACTER CODES</a>
|
31 |
<li><a name="TOC16" href="#SEC16">ERROR NAMES</a>
|
32 |
<li><a name="TOC17" href="#SEC17">ERROR TEXTS</a>
|
33 |
<li><a name="TOC18" href="#SEC18">CALLOUTS</a>
|
34 |
<li><a name="TOC19" href="#SEC19">TESTING</a>
|
35 |
<li><a name="TOC20" href="#SEC20">NOT SUPPORTED IN 16-BIT MODE</a>
|
36 |
<li><a name="TOC21" href="#SEC21">AUTHOR</a>
|
37 |
<li><a name="TOC22" href="#SEC22">REVISION</a>
|
38 |
</ul>
|
39 |
<P>
|
40 |
<b>#include &lt;pcre.h&gt;</b>
|
41 |
</P>
|
42 |
<br><a name="SEC1" href="#TOC1">PCRE 16-BIT API BASIC FUNCTIONS</a><br>
|
43 |
<P>
|
44 |
<b>pcre16 *pcre16_compile(PCRE_SPTR16 <i>pattern</i>, int <i>options</i>,</b>
|
45 |
<b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b>
|
46 |
<b>const unsigned char *<i>tableptr</i>);</b>
|
47 |
</P>
|
48 |
<P>
|
49 |
<b>pcre16 *pcre16_compile2(PCRE_SPTR16 <i>pattern</i>, int <i>options</i>,</b>
|
50 |
<b>int *<i>errorcodeptr</i>,</b>
|
51 |
<b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b>
|
52 |
<b>const unsigned char *<i>tableptr</i>);</b>
|
53 |
</P>
|
54 |
<P>
|
55 |
<b>pcre16_extra *pcre16_study(const pcre16 *<i>code</i>, int <i>options</i>,</b>
|
56 |
<b>const char **<i>errptr</i>);</b>
|
57 |
</P>
|
58 |
<P>
|
59 |
<b>void pcre16_free_study(pcre16_extra *<i>extra</i>);</b>
|
60 |
</P>
|
61 |
<P>
|
62 |
<b>int pcre16_exec(const pcre16 *<i>code</i>, const pcre16_extra *<i>extra</i>,</b>
|
63 |
<b>PCRE_SPTR16 <i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
|
64 |
<b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>);</b>
|
65 |
</P>
|
66 |
<P>
|
67 |
<b>int pcre16_dfa_exec(const pcre16 *<i>code</i>, const pcre16_extra *<i>extra</i>,</b>
|
68 |
<b>PCRE_SPTR16 <i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
|
69 |
<b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>,</b>
|
70 |
<b>int *<i>workspace</i>, int <i>wscount</i>);</b>
|
71 |
</P>
|
72 |
<br><a name="SEC2" href="#TOC1">PCRE 16-BIT API STRING EXTRACTION FUNCTIONS</a><br>
|
73 |
<P>
|
74 |
<b>int pcre16_copy_named_substring(const pcre16 *<i>code</i>,</b>
|
75 |
<b>PCRE_SPTR16 <i>subject</i>, int *<i>ovector</i>,</b>
|
76 |
<b>int <i>stringcount</i>, PCRE_SPTR16 <i>stringname</i>,</b>
|
77 |
<b>PCRE_UCHAR16 *<i>buffer</i>, int <i>buffersize</i>);</b>
|
78 |
</P>
|
79 |
<P>
|
80 |
<b>int pcre16_copy_substring(PCRE_SPTR16 <i>subject</i>, int *<i>ovector</i>,</b>
|
81 |
<b>int <i>stringcount</i>, int <i>stringnumber</i>, PCRE_UCHAR16 *<i>buffer</i>,</b>
|
82 |
<b>int <i>buffersize</i>);</b>
|
83 |
</P>
|
84 |
<P>
|
85 |
<b>int pcre16_get_named_substring(const pcre16 *<i>code</i>,</b>
|
86 |
<b>PCRE_SPTR16 <i>subject</i>, int *<i>ovector</i>,</b>
|
87 |
<b>int <i>stringcount</i>, PCRE_SPTR16 <i>stringname</i>,</b>
|
88 |
<b>PCRE_SPTR16 *<i>stringptr</i>);</b>
|
89 |
</P>
|
90 |
<P>
|
91 |
<b>int pcre16_get_stringnumber(const pcre16 *<i>code</i>,</b>
|
92 |
<b>PCRE_SPTR16 <i>name</i>);</b>
|
93 |
</P>
|
94 |
<P>
|
95 |
<b>int pcre16_get_stringtable_entries(const pcre16 *<i>code</i>,</b>
|
96 |
<b>PCRE_SPTR16 <i>name</i>, PCRE_UCHAR16 **<i>first</i>, PCRE_UCHAR16 **<i>last</i>);</b>
|
97 |
</P>
|
98 |
<P>
|
99 |
<b>int pcre16_get_substring(PCRE_SPTR16 <i>subject</i>, int *<i>ovector</i>,</b>
|
100 |
<b>int <i>stringcount</i>, int <i>stringnumber</i>,</b>
|
101 |
<b>PCRE_SPTR16 *<i>stringptr</i>);</b>
|
102 |
</P>
|
103 |
<P>
|
104 |
<b>int pcre16_get_substring_list(PCRE_SPTR16 <i>subject</i>,</b>
|
105 |
<b>int *<i>ovector</i>, int <i>stringcount</i>, PCRE_SPTR16 **<i>listptr</i>);</b>
|
106 |
</P>
|
107 |
<P>
|
108 |
<b>void pcre16_free_substring(PCRE_SPTR16 <i>stringptr</i>);</b>
|
109 |
</P>
|
110 |
<P>
|
111 |
<b>void pcre16_free_substring_list(PCRE_SPTR16 *<i>stringptr</i>);</b>
|
112 |
</P>
|
113 |
<br><a name="SEC3" href="#TOC1">PCRE 16-BIT API AUXILIARY FUNCTIONS</a><br>
|
114 |
<P>
|
115 |
<b>pcre16_jit_stack *pcre16_jit_stack_alloc(int <i>startsize</i>, int <i>maxsize</i>);</b>
|
116 |
</P>
|
117 |
<P>
|
118 |
<b>void pcre16_jit_stack_free(pcre16_jit_stack *<i>stack</i>);</b>
|
119 |
</P>
|
120 |
<P>
|
121 |
<b>void pcre16_assign_jit_stack(pcre16_extra *<i>extra</i>,</b>
|
122 |
<b>pcre16_jit_callback <i>callback</i>, void *<i>data</i>);</b>
|
123 |
</P>
|
124 |
<P>
|
125 |
<b>const unsigned char *pcre16_maketables(void);</b>
|
126 |
</P>
|
127 |
<P>
|
128 |
<b>int pcre16_fullinfo(const pcre16 *<i>code</i>, const pcre16_extra *<i>extra</i>,</b>
|
129 |
<b>int <i>what</i>, void *<i>where</i>);</b>
|
130 |
</P>
|
131 |
<P>
|
132 |
<b>int pcre16_refcount(pcre16 *<i>code</i>, int <i>adjust</i>);</b>
|
133 |
</P>
|
134 |
<P>
|
135 |
<b>int pcre16_config(int <i>what</i>, void *<i>where</i>);</b>
|
136 |
</P>
|
137 |
<P>
|
138 |
<b>const char *pcre16_version(void);</b>
|
139 |
</P>
|
140 |
<P>
|
141 |
<b>int pcre16_pattern_to_host_byte_order(pcre16 *<i>code</i>,</b>
|
142 |
<b>pcre16_extra *<i>extra</i>, const unsigned char *<i>tables</i>);</b>
|
143 |
</P>
|
144 |
<br><a name="SEC4" href="#TOC1">PCRE 16-BIT API INDIRECTED FUNCTIONS</a><br>
|
145 |
<P>
|
146 |
<b>void *(*pcre16_malloc)(size_t);</b>
|
147 |
</P>
|
148 |
<P>
|
149 |
<b>void (*pcre16_free)(void *);</b>
|
150 |
</P>
|
151 |
<P>
|
152 |
<b>void *(*pcre16_stack_malloc)(size_t);</b>
|
153 |
</P>
|
154 |
<P>
|
155 |
<b>void (*pcre16_stack_free)(void *);</b>
|
156 |
</P>
|
157 |
<P>
|
158 |
<b>int (*pcre16_callout)(pcre16_callout_block *);</b>
|
159 |
</P>
|
160 |
<br><a name="SEC5" href="#TOC1">PCRE 16-BIT API 16-BIT-ONLY FUNCTION</a><br>
|
161 |
<P>
|
162 |
<b>int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *<i>output</i>,</b>
|
163 |
<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>byte_order</i>,</b>
|
164 |
<b>int <i>keep_boms</i>);</b>
|
165 |
</P>
|
166 |
<br><a name="SEC6" href="#TOC1">THE PCRE 16-BIT LIBRARY</a><br>
|
167 |
<P>
|
168 |
Starting with release 8.30, it is possible to compile a PCRE library that
|
169 |
supports 16-bit character strings, including UTF-16 strings, as well as or
|
170 |
instead of the original 8-bit library. The majority of the work to make this
|
171 |
possible was done by Zoltan Herczeg. The two libraries contain identical sets
|
172 |
of functions, used in exactly the same way. Only the names of the functions and
|
173 |
the data types of their arguments and results are different. To avoid
|
174 |
over-complication and reduce the documentation maintenance load, most of the
|
175 |
PCRE documentation describes the 8-bit library, with only occasional references
|
176 |
to the 16-bit library. This page describes what is different when you use the
|
177 |
16-bit library.
|
178 |
</P>
|
179 |
<P>
|
180 |
WARNING: A single application can be linked with both libraries, but you must
|
181 |
take care when processing any particular pattern to use functions from just one
|
182 |
library. For example, if you want to study a pattern that was compiled with
|
183 |
<b>pcre16_compile()</b>, you must do so with <b>pcre16_study()</b>, not
|
184 |
<b>pcre_study()</b>, and you must free the study data with
|
185 |
<b>pcre16_free_study()</b>.
|
186 |
</P>
|
187 |
<br><a name="SEC7" href="#TOC1">THE HEADER FILE</a><br>
|
188 |
<P>
|
189 |
There is only one header file, <b>pcre.h</b>. It contains prototypes for all the
|
190 |
functions in all libraries, as well as definitions of flags, structures, error
|
191 |
codes, etc.
|
192 |
</P>
|
193 |
<br><a name="SEC8" href="#TOC1">THE LIBRARY NAME</a><br>
|
194 |
<P>
|
195 |
In Unix-like systems, the 16-bit library is called <b>libpcre16</b>, and can
|
196 |
normally be accessed by adding <b>-lpcre16</b> to the command for linking an
|
197 |
application that uses PCRE.
|
198 |
</P>
|
199 |
<br><a name="SEC9" href="#TOC1">STRING TYPES</a><br>
|
200 |
<P>
|
201 |
In the 8-bit library, strings are passed to PCRE library functions as vectors
|
202 |
of bytes with the C type "char *". In the 16-bit library, strings are passed as
|
203 |
vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an
|
204 |
appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In
|
205 |
very many environments, "short int" is a 16-bit data type. When PCRE is built,
|
206 |
it defines PCRE_UCHAR16 as "unsigned short int", but checks that it really is a
|
207 |
16-bit data type. If it is not, the build fails with an error message telling
|
208 |
the maintainer to modify the definition appropriately.
|
209 |
</P>
|
210 |
<br><a name="SEC10" href="#TOC1">STRUCTURE TYPES</a><br>
|
211 |
<P>
|
212 |
The types of the opaque structures that are used for compiled 16-bit patterns
|
213 |
and JIT stacks are <b>pcre16</b> and <b>pcre16_jit_stack</b> respectively. The
|
214 |
type of the user-accessible structure that is returned by <b>pcre16_study()</b>
|
215 |
is <b>pcre16_extra</b>, and the type of the structure that is used for passing
|
216 |
data to a callout function is <b>pcre16_callout_block</b>. These structures
|
217 |
contain the same fields, with the same names, as their 8-bit counterparts. The
|
218 |
only difference is that pointers to character strings are 16-bit instead of
|
219 |
8-bit types.
|
220 |
</P>
|
221 |
<br><a name="SEC11" href="#TOC1">16-BIT FUNCTIONS</a><br>
|
222 |
<P>
|
223 |
For every function in the 8-bit library there is a corresponding function in
|
224 |
the 16-bit library with a name that starts with <b>pcre16_</b> instead of
|
225 |
<b>pcre_</b>. The prototypes are listed above. In addition, there is one extra
|
226 |
function, <b>pcre16_utf16_to_host_byte_order()</b>. This is a utility function
|
227 |
that converts a UTF-16 character string to host byte order if necessary. The
|
228 |
other 16-bit functions expect the strings they are passed to be in host byte
|
229 |
order.
|
230 |
</P>
|
231 |
<P>
|
232 |
The <i>input</i> and <i>output</i> arguments of
|
233 |
<b>pcre16_utf16_to_host_byte_order()</b> may point to the same address, that is,
|
234 |
conversion in place is supported. The output buffer must be at least as long as
|
235 |
the input.
|
236 |
</P>
|
237 |
<P>
|
238 |
The <i>length</i> argument specifies the number of 16-bit data units in the
|
239 |
input string; a negative value specifies a zero-terminated string.
|
240 |
</P>
|
241 |
<P>
|
242 |
If <i>byte_order</i> is NULL, it is assumed that the string starts off in host
|
243 |
byte order. This may be changed by byte-order marks (BOMs) anywhere in the
|
244 |
string (commonly as the first character).
|
245 |
</P>
|
246 |
<P>
|
247 |
If <i>byte_order</i> is not NULL, a non-zero value of the integer to which it
|
248 |
points means that the input starts off in host byte order, otherwise the
|
249 |
opposite order is assumed. Again, BOMs in the string can change this. The final
|
250 |
byte order is passed back at the end of processing.
|
251 |
</P>
|
252 |
<P>
|
253 |
If <i>keep_boms</i> is not zero, byte-order mark characters (0xfeff) are copied
|
254 |
into the output string. Otherwise they are discarded.
|
255 |
</P>
|
256 |
<P>
|
257 |
The result of the function is the number of 16-bit units placed into the output
|
258 |
buffer, including the zero terminator if the string was zero-terminated.
|
259 |
</P>
|
260 |
<br><a name="SEC12" href="#TOC1">SUBJECT STRING OFFSETS</a><br>
|
261 |
<P>
|
262 |
The offsets within subject strings that are returned by the matching functions
|
263 |
are in 16-bit units rather than bytes.
|
264 |
</P>
|
265 |
<br><a name="SEC13" href="#TOC1">NAMED SUBPATTERNS</a><br>
|
266 |
<P>
|
267 |
The name-to-number translation table that is maintained for named subpatterns
|
268 |
uses 16-bit characters. The <b>pcre16_get_stringtable_entries()</b> function
|
269 |
returns the length of each entry in the table as the number of 16-bit data
|
270 |
units.
|
271 |
</P>
|
272 |
<br><a name="SEC14" href="#TOC1">OPTION NAMES</a><br>
|
273 |
<P>
|
274 |
There are two new general option names, PCRE_UTF16 and PCRE_NO_UTF16_CHECK,
|
275 |
which correspond to PCRE_UTF8 and PCRE_NO_UTF8_CHECK in the 8-bit library. In
|
276 |
fact, these new options define the same bits in the options word. There is a
|
277 |
discussion about the
|
278 |
<a href="pcreunicode.html#utf16strings">validity of UTF-16 strings</a>
|
279 |
in the
|
280 |
<a href="pcreunicode.html"><b>pcreunicode</b></a>
|
281 |
page.
|
282 |
</P>
|
283 |
<P>
|
284 |
For the <b>pcre16_config()</b> function there is an option PCRE_CONFIG_UTF16
|
285 |
that returns 1 if UTF-16 support is configured, otherwise 0. If this option is
|
286 |
given to <b>pcre_config()</b> or <b>pcre32_config()</b>, or if the
|
287 |
PCRE_CONFIG_UTF8 or PCRE_CONFIG_UTF32 option is given to <b>pcre16_config()</b>,
|
288 |
the result is the PCRE_ERROR_BADOPTION error.
|
289 |
</P>
|
290 |
<br><a name="SEC15" href="#TOC1">CHARACTER CODES</a><br>
|
291 |
<P>
|
292 |
In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the
|
293 |
same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
|
294 |
from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than
|
295 |
0xff can therefore be influenced by the locale in the same way as before.
|
296 |
Characters greater than 0xff have only one case, and no "type" (such as letter
|
297 |
or digit).
|
298 |
</P>
|
299 |
<P>
|
300 |
In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
|
301 |
the exception of values in the range 0xd800 to 0xdfff because those are
|
302 |
"surrogate" values that are used in pairs to encode values greater than 0xffff.
|
303 |
</P>
|
304 |
<P>
|
305 |
A UTF-16 string can indicate its endianness by a special code known as a
|
306 |
byte-order mark (BOM). The PCRE functions do not handle this, expecting strings
|
307 |
to be in host byte order. A utility function called
|
308 |
<b>pcre16_utf16_to_host_byte_order()</b> is provided to help with this (see
|
309 |
above).
|
310 |
</P>
|
311 |
<br><a name="SEC16" href="#TOC1">ERROR NAMES</a><br>
|
312 |
<P>
|
313 |
The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to
|
314 |
their 8-bit counterparts. The error PCRE_ERROR_BADMODE is given when a compiled
|
315 |
pattern is passed to a function that processes patterns in the other
|
316 |
mode, for example, if a pattern compiled with <b>pcre_compile()</b> is passed to
|
317 |
<b>pcre16_exec()</b>.
|
318 |
</P>
|
319 |
<P>
|
320 |
There are new error codes whose names begin with PCRE_UTF16_ERR for invalid
|
321 |
UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
|
322 |
are described in the section entitled
|
323 |
<a href="pcreapi.html#badutf8reasons">"Reason codes for invalid UTF-8 strings"</a>
|
324 |
in the main
|
325 |
<a href="pcreapi.html"><b>pcreapi</b></a>
|
326 |
page. The UTF-16 errors are:
|
327 |
<pre>
|
328 |
PCRE_UTF16_ERR1 Missing low surrogate at end of string
|
329 |
PCRE_UTF16_ERR2 Invalid low surrogate follows high surrogate
|
330 |
PCRE_UTF16_ERR3 Isolated low surrogate
|
331 |
PCRE_UTF16_ERR4 Non-character
|
332 |
</PRE>
|
333 |
</P>
|
334 |
<br><a name="SEC17" href="#TOC1">ERROR TEXTS</a><br>
|
335 |
<P>
|
336 |
If there is an error while compiling a pattern, the error text that is passed
|
337 |
back by <b>pcre16_compile()</b> or <b>pcre16_compile2()</b> is still an 8-bit
|
338 |
character string, zero-terminated.
|
339 |
</P>
|
340 |
<br><a name="SEC18" href="#TOC1">CALLOUTS</a><br>
|
341 |
<P>
|
342 |
The <i>subject</i> and <i>mark</i> fields in the callout block that is passed to
|
343 |
a callout function point to 16-bit vectors.
|
344 |
</P>
|
345 |
<br><a name="SEC19" href="#TOC1">TESTING</a><br>
|
346 |
<P>
|
347 |
The <b>pcretest</b> program continues to operate with 8-bit input and output
|
348 |
files, but it can be used for testing the 16-bit library. If it is run with the
|
349 |
command line option <b>-16</b>, patterns and subject strings are converted from
|
350 |
8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions
|
351 |
are used instead of the 8-bit ones. Returned 16-bit strings are converted to
|
352 |
8-bit for output. If neither the 8-bit nor the 32-bit library was compiled,
|
353 |
<b>pcretest</b> defaults to 16-bit and the <b>-16</b> option is ignored.
|
354 |
</P>
|
355 |
<P>
|
356 |
When PCRE is being built, the <b>RunTest</b> script that is called by "make
|
357 |
check" uses the <b>pcretest</b> <b>-C</b> option to discover which of the 8-bit,
|
358 |
16-bit and 32-bit libraries has been built, and runs the tests appropriately.
|
359 |
</P>
|
360 |
<br><a name="SEC20" href="#TOC1">NOT SUPPORTED IN 16-BIT MODE</a><br>
|
361 |
<P>
|
362 |
Not all the features of the 8-bit library are available with the 16-bit
|
363 |
library. The C++ and POSIX wrapper functions support only the 8-bit library,
|
364 |
and the <b>pcregrep</b> program is at present 8-bit only.
|
365 |
</P>
|
366 |
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br>
|
367 |
<P>
|
368 |
Philip Hazel
|
369 |
<br>
|
370 |
University Computing Service
|
371 |
<br>
|
372 |
Cambridge CB2 3QH, England.
|
373 |
<br>
|
374 |
</P>
|
375 |
<br><a name="SEC22" href="#TOC1">REVISION</a><br>
|
376 |
<P>
|
377 |
Last updated: 08 November 2012
|
378 |
<br>
|
379 |
Copyright © 1997-2012 University of Cambridge.
|
380 |
<br>
|
381 |
<p>
|
382 |
Return to the <a href="index.html">PCRE index page</a>.
|
383 |
</p>
|