1 |
.TH PCRE 3 "14 April 2012" "PCRE 8.31"
|
2 |
.SH NAME
|
3 |
PCRE - Perl-compatible regular expressions
|
4 |
.sp
|
5 |
.B #include <pcre.h>
|
6 |
.
|
7 |
.
|
8 |
.SH "PCRE 16-BIT API BASIC FUNCTIONS"
|
9 |
.rs
|
10 |
.sp
|
11 |
.SM
|
12 |
.B pcre16 *pcre16_compile(PCRE_SPTR16 \fIpattern\fP, int \fIoptions\fP,
|
13 |
.ti +5n
|
14 |
.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
|
15 |
.ti +5n
|
16 |
.B const unsigned char *\fItableptr\fP);
|
17 |
.PP
|
18 |
.B pcre16 *pcre16_compile2(PCRE_SPTR16 \fIpattern\fP, int \fIoptions\fP,
|
19 |
.ti +5n
|
20 |
.B int *\fIerrorcodeptr\fP,
|
21 |
.ti +5n
|
22 |
.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
|
23 |
.ti +5n
|
24 |
.B const unsigned char *\fItableptr\fP);
|
25 |
.PP
|
26 |
.B pcre16_extra *pcre16_study(const pcre16 *\fIcode\fP, int \fIoptions\fP,
|
27 |
.ti +5n
|
28 |
.B const char **\fIerrptr\fP);
|
29 |
.PP
|
30 |
.B void pcre16_free_study(pcre16_extra *\fIextra\fP);
|
31 |
.PP
|
32 |
.B int pcre16_exec(const pcre16 *\fIcode\fP, "const pcre16_extra *\fIextra\fP,"
|
33 |
.ti +5n
|
34 |
.B "PCRE_SPTR16 \fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
|
35 |
.ti +5n
|
36 |
.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
|
37 |
.PP
|
38 |
.B int pcre16_dfa_exec(const pcre16 *\fIcode\fP, "const pcre16_extra *\fIextra\fP,"
|
39 |
.ti +5n
|
40 |
.B "PCRE_SPTR16 \fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
|
41 |
.ti +5n
|
42 |
.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,
|
43 |
.ti +5n
|
44 |
.B int *\fIworkspace\fP, int \fIwscount\fP);
|
45 |
.
|
46 |
.
|
47 |
.SH "PCRE 16-BIT API STRING EXTRACTION FUNCTIONS"
|
48 |
.rs
|
49 |
.sp
|
50 |
.B int pcre16_copy_named_substring(const pcre16 *\fIcode\fP,
|
51 |
.ti +5n
|
52 |
.B PCRE_SPTR16 \fIsubject\fP, int *\fIovector\fP,
|
53 |
.ti +5n
|
54 |
.B int \fIstringcount\fP, PCRE_SPTR16 \fIstringname\fP,
|
55 |
.ti +5n
|
56 |
.B PCRE_UCHAR16 *\fIbuffer\fP, int \fIbuffersize\fP);
|
57 |
.PP
|
58 |
.B int pcre16_copy_substring(PCRE_SPTR16 \fIsubject\fP, int *\fIovector\fP,
|
59 |
.ti +5n
|
60 |
.B int \fIstringcount\fP, int \fIstringnumber\fP, PCRE_UCHAR16 *\fIbuffer\fP,
|
61 |
.ti +5n
|
62 |
.B int \fIbuffersize\fP);
|
63 |
.PP
|
64 |
.B int pcre16_get_named_substring(const pcre16 *\fIcode\fP,
|
65 |
.ti +5n
|
66 |
.B PCRE_SPTR16 \fIsubject\fP, int *\fIovector\fP,
|
67 |
.ti +5n
|
68 |
.B int \fIstringcount\fP, PCRE_SPTR16 \fIstringname\fP,
|
69 |
.ti +5n
|
70 |
.B PCRE_SPTR16 *\fIstringptr\fP);
|
71 |
.PP
|
72 |
.B int pcre16_get_stringnumber(const pcre16 *\fIcode\fP,
|
73 |
.ti +5n
|
74 |
.B PCRE_SPTR16 \fIname\fP);
|
75 |
.PP
|
76 |
.B int pcre16_get_stringtable_entries(const pcre16 *\fIcode\fP,
|
77 |
.ti +5n
|
78 |
.B PCRE_SPTR16 \fIname\fP, PCRE_UCHAR16 **\fIfirst\fP, PCRE_UCHAR16 **\fIlast\fP);
|
79 |
.PP
|
80 |
.B int pcre16_get_substring(PCRE_SPTR16 \fIsubject\fP, int *\fIovector\fP,
|
81 |
.ti +5n
|
82 |
.B int \fIstringcount\fP, int \fIstringnumber\fP,
|
83 |
.ti +5n
|
84 |
.B PCRE_SPTR16 *\fIstringptr\fP);
|
85 |
.PP
|
86 |
.B int pcre16_get_substring_list(PCRE_SPTR16 \fIsubject\fP,
|
87 |
.ti +5n
|
88 |
.B int *\fIovector\fP, int \fIstringcount\fP, "PCRE_SPTR16 **\fIlistptr\fP);"
|
89 |
.PP
|
90 |
.B void pcre16_free_substring(PCRE_SPTR16 \fIstringptr\fP);
|
91 |
.PP
|
92 |
.B void pcre16_free_substring_list(PCRE_SPTR16 *\fIstringptr\fP);
|
93 |
.
|
94 |
.
|
95 |
.SH "PCRE 16-BIT API AUXILIARY FUNCTIONS"
|
96 |
.rs
|
97 |
.sp
|
98 |
.B pcre16_jit_stack *pcre16_jit_stack_alloc(int \fIstartsize\fP, int \fImaxsize\fP);
|
99 |
.PP
|
100 |
.B void pcre16_jit_stack_free(pcre16_jit_stack *\fIstack\fP);
|
101 |
.PP
|
102 |
.B void pcre16_assign_jit_stack(pcre16_extra *\fIextra\fP,
|
103 |
.ti +5n
|
104 |
.B pcre16_jit_callback \fIcallback\fP, void *\fIdata\fP);
|
105 |
.PP
|
106 |
.B const unsigned char *pcre16_maketables(void);
|
107 |
.PP
|
108 |
.B int pcre16_fullinfo(const pcre16 *\fIcode\fP, "const pcre16_extra *\fIextra\fP,"
|
109 |
.ti +5n
|
110 |
.B int \fIwhat\fP, void *\fIwhere\fP);
|
111 |
.PP
|
112 |
.B int pcre16_refcount(pcre16 *\fIcode\fP, int \fIadjust\fP);
|
113 |
.PP
|
114 |
.B int pcre16_config(int \fIwhat\fP, void *\fIwhere\fP);
|
115 |
.PP
|
116 |
.B const char *pcre16_version(void);
|
117 |
.PP
|
118 |
.B int pcre16_pattern_to_host_byte_order(pcre16 *\fIcode\fP,
|
119 |
.ti +5n
|
120 |
.B pcre16_extra *\fIextra\fP, const unsigned char *\fItables\fP);
|
121 |
.
|
122 |
.
|
123 |
.SH "PCRE 16-BIT API INDIRECTED FUNCTIONS"
|
124 |
.rs
|
125 |
.sp
|
126 |
.B void *(*pcre16_malloc)(size_t);
|
127 |
.PP
|
128 |
.B void (*pcre16_free)(void *);
|
129 |
.PP
|
130 |
.B void *(*pcre16_stack_malloc)(size_t);
|
131 |
.PP
|
132 |
.B void (*pcre16_stack_free)(void *);
|
133 |
.PP
|
134 |
.B int (*pcre16_callout)(pcre16_callout_block *);
|
135 |
.
|
136 |
.
|
137 |
.SH "PCRE 16-BIT API 16-BIT-ONLY FUNCTION"
|
138 |
.rs
|
139 |
.sp
|
140 |
.B int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *\fIoutput\fP,
|
141 |
.ti +5n
|
142 |
.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIbyte_order\fP,
|
143 |
.ti +5n
|
144 |
.B int \fIkeep_boms\fP);
|
145 |
.
|
146 |
.
|
147 |
.SH "THE PCRE 16-BIT LIBRARY"
|
148 |
.rs
|
149 |
.sp
|
150 |
Starting with release 8.30, it is possible to compile a PCRE library that
|
151 |
supports 16-bit character strings, including UTF-16 strings, as well as or
|
152 |
instead of the original 8-bit library. The majority of the work to make this
|
153 |
possible was done by Zoltan Herczeg. The two libraries contain identical sets
|
154 |
of functions, used in exactly the same way. Only the names of the functions and
|
155 |
the data types of their arguments and results are different. To avoid
|
156 |
over-complication and reduce the documentation maintenance load, most of the
|
157 |
PCRE documentation describes the 8-bit library, with only occasional references
|
158 |
to the 16-bit library. This page describes what is different when you use the
|
159 |
16-bit library.
|
160 |
.P
|
161 |
WARNING: A single application can be linked with both libraries, but you must
|
162 |
take care when processing any particular pattern to use functions from just one
|
163 |
library. For example, if you want to study a pattern that was compiled with
|
164 |
\fBpcre16_compile()\fP, you must do so with \fBpcre16_study()\fP, not
|
165 |
\fBpcre_study()\fP, and you must free the study data with
|
166 |
\fBpcre16_free_study()\fP.
|
167 |
.
|
168 |
.
|
169 |
.SH "THE HEADER FILE"
|
170 |
.rs
|
171 |
.sp
|
172 |
There is only one header file, \fBpcre.h\fP. It contains prototypes for all the
|
173 |
functions in both libraries, as well as definitions of flags, structures, error
|
174 |
codes, etc.
|
175 |
.
|
176 |
.
|
177 |
.SH "THE LIBRARY NAME"
|
178 |
.rs
|
179 |
.sp
|
180 |
In Unix-like systems, the 16-bit library is called \fBlibpcre16\fP, and can
|
181 |
normally be accessed by adding \fB-lpcre16\fP to the command for linking an
|
182 |
application that uses PCRE.
|
183 |
.
|
184 |
.
|
185 |
.SH "STRING TYPES"
|
186 |
.rs
|
187 |
.sp
|
188 |
In the 8-bit library, strings are passed to PCRE library functions as vectors
|
189 |
of bytes with the C type "char *". In the 16-bit library, strings are passed as
|
190 |
vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an
|
191 |
appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In
|
192 |
very many environments, "short int" is a 16-bit data type. When PCRE is built,
|
193 |
it defines PCRE_UCHAR16 as "unsigned short int", but checks that it really is a 16-bit
|
194 |
data type. If it is not, the build fails with an error message telling the
|
195 |
maintainer to modify the definition appropriately.
|
196 |
.
|
197 |
.
|
198 |
.SH "STRUCTURE TYPES"
|
199 |
.rs
|
200 |
.sp
|
201 |
The types of the opaque structures that are used for compiled 16-bit patterns
|
202 |
and JIT stacks are \fBpcre16\fP and \fBpcre16_jit_stack\fP respectively. The
|
203 |
type of the user-accessible structure that is returned by \fBpcre16_study()\fP
|
204 |
is \fBpcre16_extra\fP, and the type of the structure that is used for passing
|
205 |
data to a callout function is \fBpcre16_callout_block\fP. These structures
|
206 |
contain the same fields, with the same names, as their 8-bit counterparts. The
|
207 |
only difference is that pointers to character strings are 16-bit instead of
|
208 |
8-bit types.
|
209 |
.
|
210 |
.
|
211 |
.SH "16-BIT FUNCTIONS"
|
212 |
.rs
|
213 |
.sp
|
214 |
For every function in the 8-bit library there is a corresponding function in
|
215 |
the 16-bit library with a name that starts with \fBpcre16_\fP instead of
|
216 |
\fBpcre_\fP. The prototypes are listed above. In addition, there is one extra
|
217 |
function, \fBpcre16_utf16_to_host_byte_order()\fP. This is a utility function
|
218 |
that converts a UTF-16 character string to host byte order if necessary. The
|
219 |
other 16-bit functions expect the strings they are passed to be in host byte
|
220 |
order.
|
221 |
.P
|
222 |
The \fIinput\fP and \fIoutput\fP arguments of
|
223 |
\fBpcre16_utf16_to_host_byte_order()\fP may point to the same address, that is,
|
224 |
conversion in place is supported. The output buffer must be at least as long as
|
225 |
the input.
|
226 |
.P
|
227 |
The \fIlength\fP argument specifies the number of 16-bit data units in the
|
228 |
input string; a negative value specifies a zero-terminated string.
|
229 |
.P
|
230 |
If \fIbyte_order\fP is NULL, it is assumed that the string starts off in host
|
231 |
byte order. This may be changed by byte-order marks (BOMs) anywhere in the
|
232 |
string (commonly as the first character).
|
233 |
.P
|
234 |
If \fIbyte_order\fP is not NULL, a non-zero value of the integer to which it
|
235 |
points means that the input starts off in host byte order, otherwise the
|
236 |
opposite order is assumed. Again, BOMs in the string can change this. The final
|
237 |
byte order is passed back at the end of processing.
|
238 |
.P
|
239 |
If \fIkeep_boms\fP is not zero, byte-order mark characters (0xfeff) are copied
|
240 |
into the output string. Otherwise they are discarded.
|
241 |
.P
|
242 |
The result of the function is the number of 16-bit units placed into the output
|
243 |
buffer, including the zero terminator if the string was zero-terminated.
|
244 |
.
|
245 |
.
|
246 |
.SH "SUBJECT STRING OFFSETS"
|
247 |
.rs
|
248 |
.sp
|
249 |
The offsets within subject strings that are returned by the matching functions
|
250 |
are in 16-bit units rather than bytes.
|
251 |
.
|
252 |
.
|
253 |
.SH "NAMED SUBPATTERNS"
|
254 |
.rs
|
255 |
.sp
|
256 |
The name-to-number translation table that is maintained for named subpatterns
|
257 |
uses 16-bit characters. The \fBpcre16_get_stringtable_entries()\fP function
|
258 |
returns the length of each entry in the table as the number of 16-bit data
|
259 |
units.
|
260 |
.
|
261 |
.
|
262 |
.SH "OPTION NAMES"
|
263 |
.rs
|
264 |
.sp
|
265 |
There are two new general option names, PCRE_UTF16 and PCRE_NO_UTF16_CHECK,
|
266 |
which correspond to PCRE_UTF8 and PCRE_NO_UTF8_CHECK in the 8-bit library. In
|
267 |
fact, these new options define the same bits in the options word. There is a
|
268 |
discussion about the
|
269 |
.\" HTML <a href="pcreunicode.html#utf16strings">
|
270 |
.\" </a>
|
271 |
validity of UTF-16 strings
|
272 |
.\"
|
273 |
in the
|
274 |
.\" HREF
|
275 |
\fBpcreunicode\fP
|
276 |
.\"
|
277 |
page.
|
278 |
.P
|
279 |
For the \fBpcre16_config()\fP function there is an option PCRE_CONFIG_UTF16
|
280 |
that returns 1 if UTF-16 support is configured, otherwise 0. If this option is
|
281 |
given to \fBpcre_config()\fP or \fBpcre32_config()\fP, or if the
|
282 |
PCRE_CONFIG_UTF8 or PCRE_CONFIG_UTF32 option is given to \fBpcre16_config()\fP,
|
283 |
the result is the PCRE_ERROR_BADOPTION error.
|
284 |
.
|
285 |
.
|
286 |
.SH "CHARACTER CODES"
|
287 |
.rs
|
288 |
.sp
|
289 |
In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the
|
290 |
same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
|
291 |
from 0 to 0xffff instead of 0 to 0xff. Character types for characters up to
|
292 |
0xff can therefore be influenced by the locale in the same way as before.
|
293 |
Characters greater than 0xff have only one case, and no "type" (such as letter
|
294 |
or digit).
|
295 |
.P
|
296 |
In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
|
297 |
the exception of values in the range 0xd800 to 0xdfff because those are
|
298 |
"surrogate" values that are used in pairs to encode values greater than 0xffff.
|
299 |
.P
|
300 |
A UTF-16 string can indicate its endianness by a special code known as a
|
301 |
byte-order mark (BOM). The PCRE functions do not handle this, expecting strings
|
302 |
to be in host byte order. A utility function called
|
303 |
\fBpcre16_utf16_to_host_byte_order()\fP is provided to help with this (see
|
304 |
above).
|
305 |
.
|
306 |
.
|
307 |
.SH "ERROR NAMES"
|
308 |
.rs
|
309 |
.sp
|
310 |
The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to
|
311 |
their 8-bit counterparts. The error PCRE_ERROR_BADMODE is given when a compiled
|
312 |
pattern is passed to a function that processes patterns in the other
|
313 |
mode, for example, if a pattern compiled with \fBpcre_compile()\fP is passed to
|
314 |
\fBpcre16_exec()\fP.
|
315 |
.P
|
316 |
There are new error codes whose names begin with PCRE_UTF16_ERR for invalid
|
317 |
UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
|
318 |
are described in the section entitled
|
319 |
.\" HTML <a href="pcreapi.html#badutf8reasons">
|
320 |
.\" </a>
|
321 |
"Reason codes for invalid UTF-8 strings"
|
322 |
.\"
|
323 |
in the main
|
324 |
.\" HREF
|
325 |
\fBpcreapi\fP
|
326 |
.\"
|
327 |
page. The UTF-16 errors are:
|
328 |
.sp
|
329 |
PCRE_UTF16_ERR1 Missing low surrogate at end of string
|
330 |
PCRE_UTF16_ERR2 Invalid low surrogate follows high surrogate
|
331 |
PCRE_UTF16_ERR3 Isolated low surrogate
|
332 |
PCRE_UTF16_ERR4 Non-character
|
333 |
.
|
334 |
.
|
335 |
.SH "ERROR TEXTS"
|
336 |
.rs
|
337 |
.sp
|
338 |
If there is an error while compiling a pattern, the error text that is passed
|
339 |
back by \fBpcre16_compile()\fP or \fBpcre16_compile2()\fP is still an 8-bit
|
340 |
character string, zero-terminated.
|
341 |
.
|
342 |
.
|
343 |
.SH "CALLOUTS"
|
344 |
.rs
|
345 |
.sp
|
346 |
The \fIsubject\fP and \fImark\fP fields in the callout block that is passed to
|
347 |
a callout function point to 16-bit vectors.
|
348 |
.
|
349 |
.
|
350 |
.SH "TESTING"
|
351 |
.rs
|
352 |
.sp
|
353 |
The \fBpcretest\fP program continues to operate with 8-bit input and output
|
354 |
files, but it can be used for testing the 16-bit library. If it is run with the
|
355 |
command line option \fB-16\fP, patterns and subject strings are converted from
|
356 |
8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions
|
357 |
are used instead of the 8-bit ones. Returned 16-bit strings are converted to
|
358 |
8-bit for output. If neither the 8-bit nor the 32-bit library was compiled,
|
359 |
\fBpcretest\fP defaults to 16-bit and the \fB-16\fP option is ignored.
|
360 |
.P
|
361 |
When PCRE is being built, the \fBRunTest\fP script that is called by "make
|
362 |
check" uses the \fBpcretest\fP \fB-C\fP option to discover which of the 8-bit,
|
363 |
16-bit and 32-bit libraries has been built, and runs the tests appropriately.
|
364 |
.
|
365 |
.
|
366 |
.SH "NOT SUPPORTED IN 16-BIT MODE"
|
367 |
.rs
|
368 |
.sp
|
369 |
Not all the features of the 8-bit library are available with the 16-bit
|
370 |
library. The C++ and POSIX wrapper functions support only the 8-bit library,
|
371 |
and the \fBpcregrep\fP program is at present 8-bit only.
|
372 |
.
|
373 |
.
|
374 |
.SH AUTHOR
|
375 |
.rs
|
376 |
.sp
|
377 |
.nf
|
378 |
Philip Hazel
|
379 |
University Computing Service
|
380 |
Cambridge CB2 3QH, England.
|
381 |
.fi
|
382 |
.
|
383 |
.
|
384 |
.SH REVISION
|
385 |
.rs
|
386 |
.sp
|
387 |
.nf
|
388 |
Last updated: 14 April 2012
|
389 |
Copyright (c) 1997-2012 University of Cambridge.
|
390 |
.fi
|