ViewVC logotype

Contents of /code/trunk/doc/pcre2api.3

Parent Directory Parent Directory | Revision Log Revision Log

Revision 696 - (show annotations)
Tue Mar 21 17:46:21 2017 UTC (2 years, 4 months ago) by ph10
File size: 144846 byte(s)
Fix 32-bit error buffer size bug in pcre2test (Bugzilla 2079).
1 .TH PCRE2API 3 "21 March 2017" "PCRE2 10.30"
3 PCRE2 - Perl-compatible regular expressions (revised API)
4 .sp
5 .B #include <pcre2.h>
6 .sp
7 PCRE2 is a new API for PCRE. This document contains a description of all its
8 functions. See the
9 .\" HREF
10 \fBpcre2\fP
11 .\"
12 document for an overview of all the PCRE2 documentation.
13 .
14 .
16 .rs
17 .sp
18 .nf
19 .B pcre2_code *pcre2_compile(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP,
20 .B " uint32_t \fIoptions\fP, int *\fIerrorcode\fP, PCRE2_SIZE *\fIerroroffset,\fP"
21 .B " pcre2_compile_context *\fIccontext\fP);"
22 .sp
23 .B void pcre2_code_free(pcre2_code *\fIcode\fP);
24 .sp
25 .B pcre2_match_data *pcre2_match_data_create(uint32_t \fIovecsize\fP,
26 .B " pcre2_general_context *\fIgcontext\fP);"
27 .sp
28 .B pcre2_match_data *pcre2_match_data_create_from_pattern(
29 .B " const pcre2_code *\fIcode\fP, pcre2_general_context *\fIgcontext\fP);"
30 .sp
31 .B int pcre2_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP,
32 .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
33 .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
34 .B " pcre2_match_context *\fImcontext\fP);"
35 .sp
36 .B int pcre2_dfa_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP,
37 .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
38 .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
39 .B " pcre2_match_context *\fImcontext\fP,"
40 .B " int *\fIworkspace\fP, PCRE2_SIZE \fIwscount\fP);"
41 .sp
42 .B void pcre2_match_data_free(pcre2_match_data *\fImatch_data\fP);
43 .fi
44 .
45 .
47 .rs
48 .sp
49 .nf
50 .B PCRE2_SPTR pcre2_get_mark(pcre2_match_data *\fImatch_data\fP);
51 .sp
52 .B uint32_t pcre2_get_ovector_count(pcre2_match_data *\fImatch_data\fP);
53 .sp
54 .B PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *\fImatch_data\fP);
55 .sp
56 .B PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *\fImatch_data\fP);
57 .fi
58 .
59 .
61 .rs
62 .sp
63 .nf
64 .B pcre2_general_context *pcre2_general_context_create(
65 .B " void *(*\fIprivate_malloc\fP)(PCRE2_SIZE, void *),"
66 .B " void (*\fIprivate_free\fP)(void *, void *), void *\fImemory_data\fP);"
67 .sp
68 .B pcre2_general_context *pcre2_general_context_copy(
69 .B " pcre2_general_context *\fIgcontext\fP);"
70 .sp
71 .B void pcre2_general_context_free(pcre2_general_context *\fIgcontext\fP);
72 .fi
73 .
74 .
76 .rs
77 .sp
78 .nf
79 .B pcre2_compile_context *pcre2_compile_context_create(
80 .B " pcre2_general_context *\fIgcontext\fP);"
81 .sp
82 .B pcre2_compile_context *pcre2_compile_context_copy(
83 .B " pcre2_compile_context *\fIccontext\fP);"
84 .sp
85 .B void pcre2_compile_context_free(pcre2_compile_context *\fIccontext\fP);
86 .sp
87 .B int pcre2_set_bsr(pcre2_compile_context *\fIccontext\fP,
88 .B " uint32_t \fIvalue\fP);"
89 .sp
90 .B int pcre2_set_character_tables(pcre2_compile_context *\fIccontext\fP,
91 .B " const unsigned char *\fItables\fP);"
92 .sp
93 .B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP,
94 .B " PCRE2_SIZE \fIvalue\fP);"
95 .sp
96 .B int pcre2_set_newline(pcre2_compile_context *\fIccontext\fP,
97 .B " uint32_t \fIvalue\fP);"
98 .sp
99 .B int pcre2_set_parens_nest_limit(pcre2_compile_context *\fIccontext\fP,
100 .B " uint32_t \fIvalue\fP);"
101 .sp
102 .B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP,
103 .B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);"
104 .fi
105 .
106 .
108 .rs
109 .sp
110 .nf
111 .B pcre2_match_context *pcre2_match_context_create(
112 .B " pcre2_general_context *\fIgcontext\fP);"
113 .sp
114 .B pcre2_match_context *pcre2_match_context_copy(
115 .B " pcre2_match_context *\fImcontext\fP);"
116 .sp
117 .B void pcre2_match_context_free(pcre2_match_context *\fImcontext\fP);
118 .sp
119 .B int pcre2_set_callout(pcre2_match_context *\fImcontext\fP,
120 .B " int (*\fIcallout_function\fP)(pcre2_callout_block *, void *),"
121 .B " void *\fIcallout_data\fP);"
122 .sp
123 .B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
124 .B " uint32_t \fIvalue\fP);"
125 .sp
126 .B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP,
127 .B " PCRE2_SIZE \fIvalue\fP);"
128 .sp
129 .B int pcre2_set_recursion_limit(pcre2_match_context *\fImcontext\fP,
130 .B " uint32_t \fIvalue\fP);"
131 .sp
132 .B int pcre2_set_recursion_memory_management(
133 .B " pcre2_match_context *\fImcontext\fP,"
134 .B " void *(*\fIprivate_malloc\fP)(PCRE2_SIZE, void *),"
135 .B " void (*\fIprivate_free\fP)(void *, void *), void *\fImemory_data\fP);"
136 .fi
137 .
138 .
140 .rs
141 .sp
142 .nf
143 .B int pcre2_substring_copy_byname(pcre2_match_data *\fImatch_data\fP,
144 .B " PCRE2_SPTR \fIname\fP, PCRE2_UCHAR *\fIbuffer\fP, PCRE2_SIZE *\fIbufflen\fP);"
145 .sp
146 .B int pcre2_substring_copy_bynumber(pcre2_match_data *\fImatch_data\fP,
147 .B " uint32_t \fInumber\fP, PCRE2_UCHAR *\fIbuffer\fP,"
148 .B " PCRE2_SIZE *\fIbufflen\fP);"
149 .sp
150 .B void pcre2_substring_free(PCRE2_UCHAR *\fIbuffer\fP);
151 .sp
152 .B int pcre2_substring_get_byname(pcre2_match_data *\fImatch_data\fP,
153 .B " PCRE2_SPTR \fIname\fP, PCRE2_UCHAR **\fIbufferptr\fP, PCRE2_SIZE *\fIbufflen\fP);"
154 .sp
155 .B int pcre2_substring_get_bynumber(pcre2_match_data *\fImatch_data\fP,
156 .B " uint32_t \fInumber\fP, PCRE2_UCHAR **\fIbufferptr\fP,"
157 .B " PCRE2_SIZE *\fIbufflen\fP);"
158 .sp
159 .B int pcre2_substring_length_byname(pcre2_match_data *\fImatch_data\fP,
160 .B " PCRE2_SPTR \fIname\fP, PCRE2_SIZE *\fIlength\fP);"
161 .sp
162 .B int pcre2_substring_length_bynumber(pcre2_match_data *\fImatch_data\fP,
163 .B " uint32_t \fInumber\fP, PCRE2_SIZE *\fIlength\fP);"
164 .sp
165 .B int pcre2_substring_nametable_scan(const pcre2_code *\fIcode\fP,
166 .B " PCRE2_SPTR \fIname\fP, PCRE2_SPTR *\fIfirst\fP, PCRE2_SPTR *\fIlast\fP);"
167 .sp
168 .B int pcre2_substring_number_from_name(const pcre2_code *\fIcode\fP,
169 .B " PCRE2_SPTR \fIname\fP);"
170 .sp
171 .B void pcre2_substring_list_free(PCRE2_SPTR *\fIlist\fP);
172 .sp
173 .B int pcre2_substring_list_get(pcre2_match_data *\fImatch_data\fP,
174 .B " PCRE2_UCHAR ***\fIlistptr\fP, PCRE2_SIZE **\fIlengthsptr\fP);"
175 .fi
176 .
177 .
179 .rs
180 .sp
181 .nf
182 .B int pcre2_substitute(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP,
183 .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
184 .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
185 .B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacement\fP,"
186 .B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\fP,"
187 .B " PCRE2_SIZE *\fIoutlengthptr\fP);"
188 .fi
189 .
190 .
192 .rs
193 .sp
194 .nf
195 .B int pcre2_jit_compile(pcre2_code *\fIcode\fP, uint32_t \fIoptions\fP);
196 .sp
197 .B int pcre2_jit_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP,
198 .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
199 .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
200 .B " pcre2_match_context *\fImcontext\fP);"
201 .sp
202 .B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP);
203 .sp
204 .B pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE \fIstartsize\fP,
205 .B " PCRE2_SIZE \fImaxsize\fP, pcre2_general_context *\fIgcontext\fP);"
206 .sp
207 .B void pcre2_jit_stack_assign(pcre2_match_context *\fImcontext\fP,
208 .B " pcre2_jit_callback \fIcallback_function\fP, void *\fIcallback_data\fP);"
209 .sp
210 .B void pcre2_jit_stack_free(pcre2_jit_stack *\fIjit_stack\fP);
211 .fi
212 .
213 .
215 .rs
216 .sp
217 .nf
218 .B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP,
219 .B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP,"
220 .B " pcre2_general_context *\fIgcontext\fP);"
221 .sp
222 .B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP,
223 .B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP,"
224 .B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);"
225 .sp
226 .B void pcre2_serialize_free(uint8_t *\fIbytes\fP);
227 .sp
228 .B int32_t pcre2_serialize_get_number_of_codes(const uint8_t *\fIbytes\fP);
229 .fi
230 .
231 .
233 .rs
234 .sp
235 .nf
236 .B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP);
237 .sp
238 .B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP);
239 .sp
240 .B int pcre2_get_error_message(int \fIerrorcode\fP, PCRE2_UCHAR *\fIbuffer\fP,
241 .B " PCRE2_SIZE \fIbufflen\fP);"
242 .sp
243 .B const unsigned char *pcre2_maketables(pcre2_general_context *\fIgcontext\fP);
244 .sp
245 .B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP);
246 .sp
247 .B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
248 .B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
249 .B " void *\fIuser_data\fP);"
250 .sp
251 .B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP);
252 .fi
253 .
254 .
256 .rs
257 .sp
258 There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit code
259 units, respectively. However, there is just one header file, \fBpcre2.h\fP.
260 This contains the function prototypes and other definitions for all three
261 libraries. One, two, or all three can be installed simultaneously. On Unix-like
262 systems the libraries are called \fBlibpcre2-8\fP, \fBlibpcre2-16\fP, and
263 \fBlibpcre2-32\fP, and they can also co-exist with the original PCRE libraries.
264 .P
265 Character strings are passed to and from a PCRE2 library as a sequence of
266 unsigned integers in code units of the appropriate width. Every PCRE2 function
267 comes in three different forms, one for each library, for example:
268 .sp
269 \fBpcre2_compile_8()\fP
270 \fBpcre2_compile_16()\fP
271 \fBpcre2_compile_32()\fP
272 .sp
273 There are also three different sets of data types:
274 .sp
275 PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32
276 PCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32
277 .sp
278 The UCHAR types define unsigned code units of the appropriate widths. For
279 example, PCRE2_UCHAR16 is usually defined as `uint16_t'. The SPTR types are
280 constant pointers to the equivalent UCHAR types, that is, they are pointers to
281 vectors of unsigned code units.
282 .P
283 Many applications use only one code unit width. For their convenience, macros
284 are defined whose names are the generic forms such as \fBpcre2_compile()\fP and
285 PCRE2_SPTR. These macros use the value of the macro PCRE2_CODE_UNIT_WIDTH to
286 generate the appropriate width-specific function and macro names.
287 PCRE2_CODE_UNIT_WIDTH is not defined by default. An application must define it
288 to be 8, 16, or 32 before including \fBpcre2.h\fP in order to make use of the
289 generic names.
290 .P
291 Applications that use more than one code unit width can be linked with more
292 than one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to be 0 before
293 including \fBpcre2.h\fP, and then use the real function names. Any code that is
294 to be included in an environment where the value of PCRE2_CODE_UNIT_WIDTH is
295 unknown should also use the real function names. (Unfortunately, it is not
296 possible in C code to save and restore the value of a macro.)
297 .P
298 If PCRE2_CODE_UNIT_WIDTH is not defined before including \fBpcre2.h\fP, a
299 compiler error occurs.
300 .P
301 When using multiple libraries in an application, you must take care when
302 processing any particular pattern to use only functions from a single library.
303 For example, if you want to run a match using a pattern that was compiled with
304 \fBpcre2_compile_16()\fP, you must do so with \fBpcre2_match_16()\fP, not
305 \fBpcre2_match_8()\fP.
306 .P
307 In the function summaries above, and in the rest of this document and other
308 PCRE2 documents, functions and data types are described using their generic
309 names, without the 8, 16, or 32 suffix.
310 .
311 .
313 .rs
314 .sp
315 PCRE2 has its own native API, which is described in this document. There are
316 also some wrapper functions for the 8-bit library that correspond to the
317 POSIX regular expression API, but they do not give access to all the
318 functionality. They are described in the
319 .\" HREF
320 \fBpcre2posix\fP
321 .\"
322 documentation. Both these APIs define a set of C function calls.
323 .P
324 The native API C data types, function prototypes, option values, and error
325 codes are defined in the header file \fBpcre2.h\fP, which contains definitions
326 of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers for the
327 library. Applications can use these to include support for different releases
328 of PCRE2.
329 .P
330 In a Windows environment, if you want to statically link an application program
331 against a non-dll PCRE2 library, you must define PCRE2_STATIC before including
332 \fBpcre2.h\fP.
333 .P
334 The functions \fBpcre2_compile()\fP and \fBpcre2_match()\fP are used for
335 compiling and matching regular expressions in a Perl-compatible manner. A
336 sample program that demonstrates the simplest way of using them is provided in
337 the file called \fIpcre2demo.c\fP in the PCRE2 source distribution. A listing
338 of this program is given in the
339 .\" HREF
340 \fBpcre2demo\fP
341 .\"
342 documentation, and the
343 .\" HREF
344 \fBpcre2sample\fP
345 .\"
346 documentation describes how to compile and run it.
347 .P
348 Just-in-time compiler support is an optional feature of PCRE2 that can be built
349 in appropriate hardware environments. It greatly speeds up the matching
350 performance of many patterns. Programs can request that it be used if
351 available, by calling \fBpcre2_jit_compile()\fP after a pattern has been
352 successfully compiled by \fBpcre2_compile()\fP. This does nothing if JIT
353 support is not available.
354 .P
355 More complicated programs might need to make use of the specialist functions
356 \fBpcre2_jit_stack_create()\fP, \fBpcre2_jit_stack_free()\fP, and
357 \fBpcre2_jit_stack_assign()\fP in order to control the JIT code's memory usage.
358 .P
359 JIT matching is automatically used by \fBpcre2_match()\fP if it is available,
360 unless the PCRE2_NO_JIT option is set. There is also a direct interface for JIT
361 matching, which gives improved performance. The JIT-specific functions are
362 discussed in the
363 .\" HREF
364 \fBpcre2jit\fP
365 .\"
366 documentation.
367 .P
368 A second matching function, \fBpcre2_dfa_match()\fP, which is not
369 Perl-compatible, is also provided. This uses a different algorithm for the
370 matching. The alternative algorithm finds all possible matches (at a given
371 point in the subject), and scans the subject just once (unless there are
372 lookbehind assertions). However, this algorithm does not return captured
373 substrings. A description of the two matching algorithms and their advantages
374 and disadvantages is given in the
375 .\" HREF
376 \fBpcre2matching\fP
377 .\"
378 documentation. There is no JIT support for \fBpcre2_dfa_match()\fP.
379 .P
380 In addition to the main compiling and matching functions, there are convenience
381 functions for extracting captured substrings from a subject string that has
382 been matched by \fBpcre2_match()\fP. They are:
383 .sp
384 \fBpcre2_substring_copy_byname()\fP
385 \fBpcre2_substring_copy_bynumber()\fP
386 \fBpcre2_substring_get_byname()\fP
387 \fBpcre2_substring_get_bynumber()\fP
388 \fBpcre2_substring_list_get()\fP
389 \fBpcre2_substring_length_byname()\fP
390 \fBpcre2_substring_length_bynumber()\fP
391 \fBpcre2_substring_nametable_scan()\fP
392 \fBpcre2_substring_number_from_name()\fP
393 .sp
394 \fBpcre2_substring_free()\fP and \fBpcre2_substring_list_free()\fP are also
395 provided, to free the memory used for extracted strings.
396 .P
397 The function \fBpcre2_substitute()\fP can be called to match a pattern and
398 return a copy of the subject string with substitutions for parts that were
399 matched.
400 .P
401 Functions whose names begin with \fBpcre2_serialize_\fP are used for saving
402 compiled patterns on disc or elsewhere, and reloading them later.
403 .P
404 Finally, there are functions for finding out information about a compiled
405 pattern (\fBpcre2_pattern_info()\fP) and about the configuration with which
406 PCRE2 was built (\fBpcre2_config()\fP).
407 .P
408 Functions with names ending with \fB_free()\fP are used for freeing memory
409 blocks of various sorts. In all cases, if one of these functions is called with
410 a NULL argument, it does nothing.
411 .
412 .
414 .rs
415 .sp
416 The PCRE2 API uses string lengths and offsets into strings of code units in
417 several places. These values are always of type PCRE2_SIZE, which is an
418 unsigned integer type, currently always defined as \fIsize_t\fP. The largest
419 value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved
420 as a special indicator for zero-terminated strings and unset offsets.
421 Therefore, the longest string that can be handled is one less than this
422 maximum.
423 .
424 .
425 .\" HTML <a name="newlines"></a>
427 .rs
428 .sp
429 PCRE2 supports five different conventions for indicating line breaks in
430 strings: a single CR (carriage return) character, a single LF (linefeed)
431 character, the two-character sequence CRLF, any of the three preceding, or any
432 Unicode newline sequence. The Unicode newline sequences are the three just
433 mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed,
434 U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
435 (paragraph separator, U+2029).
436 .P
437 Each of the first three conventions is used by at least one operating system as
438 its standard newline sequence. When PCRE2 is built, a default can be specified.
439 The default default is LF, which is the Unix standard. However, the newline
440 convention can be changed by an application when calling \fBpcre2_compile()\fP,
441 or it can be specified by special text at the start of the pattern itself; this
442 overrides any other settings. See the
443 .\" HREF
444 \fBpcre2pattern\fP
445 .\"
446 page for details of the special character sequences.
447 .P
448 In the PCRE2 documentation the word "newline" is used to mean "the character or
449 pair of characters that indicate a line break". The choice of newline
450 convention affects the handling of the dot, circumflex, and dollar
451 metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
452 recognized line ending sequence, the match position advancement for a
453 non-anchored pattern. There is more detail about this in the
454 .\" HTML <a href="#matchoptions">
455 .\" </a>
456 section on \fBpcre2_match()\fP options
457 .\"
458 below.
459 .P
460 The choice of newline convention does not affect the interpretation of
461 the \en or \er escape sequences, nor does it affect what \eR matches; this has
462 its own separate convention.
463 .
464 .
466 .rs
467 .sp
468 In a multithreaded application it is important to keep thread-specific data
469 separate from data that can be shared between threads. The PCRE2 library code
470 itself is thread-safe: it contains no static or global variables. The API is
471 designed to be fairly simple for non-threaded applications while at the same
472 time ensuring that multithreaded applications can use it.
473 .P
474 There are several different blocks of data that are used to pass information
475 between the application and the PCRE2 libraries.
476 .
477 .
478 .SS "The compiled pattern"
479 .rs
480 .sp
481 A pointer to the compiled form of a pattern is returned to the user when
482 \fBpcre2_compile()\fP is successful. The data in the compiled pattern is fixed,
483 and does not change when the pattern is matched. Therefore, it is thread-safe,
484 that is, the same compiled pattern can be used by more than one thread
485 simultaneously. For example, an application can compile all its patterns at the
486 start, before forking off multiple threads that use them. However, if the
487 just-in-time optimization feature is being used, it needs separate memory stack
488 areas for each thread. See the
489 .\" HREF
490 \fBpcre2jit\fP
491 .\"
492 documentation for more details.
493 .P
494 In a more complicated situation, where patterns are compiled only when they are
495 first needed, but are still shared between threads, pointers to compiled
496 patterns must be protected from simultaneous writing by multiple threads, at
497 least until a pattern has been compiled. The logic can be something like this:
498 .sp
499 Get a read-only (shared) lock (mutex) for pointer
500 if (pointer == NULL)
501 {
502 Get a write (unique) lock for pointer
503 if (pointer == NULL) pointer = pcre2_compile(...
504 }
505 Release the lock
506 Use pointer in pcre2_match()
507 .sp
508 Of course, testing for compilation errors should also be included in the code.
509 .P
510 If JIT is being used, but the JIT compilation is not being done immediately
511 (perhaps waiting to see if the pattern is used often enough), similar logic is
512 required. JIT compilation updates a pointer within the compiled code block, so
513 a thread must gain unique write access to the pointer before calling
514 \fBpcre2_jit_compile()\fP. Alternatively, \fBpcre2_code_copy()\fP or
515 \fBpcre2_code_copy_with_tables()\fP can be used to obtain a private copy of the
516 compiled code.
517 .
518 .
519 .SS "Context blocks"
520 .rs
521 .sp
522 The next main section below introduces the idea of "contexts" in which PCRE2
523 functions are called. A context is nothing more than a collection of parameters
524 that control the way PCRE2 operates. Grouping a number of parameters together
525 in a context is a convenient way of passing them to a PCRE2 function without
526 using lots of arguments. The parameters that are stored in contexts are in some
527 sense "advanced features" of the API. Many straightforward applications will
528 not need to use contexts.
529 .P
530 In a multithreaded application, if the parameters in a context are values that
531 are never changed, the same context can be used by all the threads. However, if
532 any thread needs to change any value in a context, it must make its own
533 thread-specific copy.
534 .
535 .
536 .SS "Match blocks"
537 .rs
538 .sp
539 The matching functions need a block of memory for working space and for storing
540 the results of a match. This includes details of what was matched, as well as
541 additional information such as the name of a (*MARK) setting. Each thread must
542 provide its own copy of this memory.
543 .
544 .
546 .rs
547 .sp
548 Some PCRE2 functions have a lot of parameters, many of which are used only by
549 specialist applications, for example, those that use custom memory management
550 or non-standard character tables. To keep function argument lists at a
551 reasonable size, and at the same time to keep the API extensible, "uncommon"
552 parameters are passed to certain functions in a \fBcontext\fP instead of
553 directly. A context is just a block of memory that holds the parameter values.
554 Applications that do not need to adjust any of the context parameters can pass
555 NULL when a context pointer is required.
556 .P
557 There are three different types of context: a general context that is relevant
558 for several PCRE2 operations, a compile-time context, and a match-time context.
559 .
560 .
561 .SS "The general context"
562 .rs
563 .sp
564 At present, this context just contains pointers to (and data for) external
565 memory management functions that are called from several places in the PCRE2
566 library. The context is named `general' rather than specifically `memory'
567 because in future other fields may be added. If you do not want to supply your
568 own custom memory management functions, you do not need to bother with a
569 general context. A general context is created by:
570 .sp
571 .nf
572 .B pcre2_general_context *pcre2_general_context_create(
573 .B " void *(*\fIprivate_malloc\fP)(PCRE2_SIZE, void *),"
574 .B " void (*\fIprivate_free\fP)(void *, void *), void *\fImemory_data\fP);"
575 .fi
576 .sp
577 The two function pointers specify custom memory management functions, whose
578 prototypes are:
579 .sp
580 \fBvoid *private_malloc(PCRE2_SIZE, void *);\fP
581 \fBvoid private_free(void *, void *);\fP
582 .sp
583 Whenever code in PCRE2 calls these functions, the final argument is the value
584 of \fImemory_data\fP. Either of the first two arguments of the creation
585 function may be NULL, in which case the system memory management functions
586 \fImalloc()\fP and \fIfree()\fP are used. (This is not currently useful, as
587 there are no other fields in a general context, but in future there might be.)
588 The \fIprivate_malloc()\fP function is used (if supplied) to obtain memory for
589 storing the context, and all three values are saved as part of the context.
590 .P
591 Whenever PCRE2 creates a data block of any kind, the block contains a pointer
592 to the \fIfree()\fP function that matches the \fImalloc()\fP function that was
593 used. When the time comes to free the block, this function is called.
594 .P
595 A general context can be copied by calling:
596 .sp
597 .nf
598 .B pcre2_general_context *pcre2_general_context_copy(
599 .B " pcre2_general_context *\fIgcontext\fP);"
600 .fi
601 .sp
602 The memory used for a general context should be freed by calling:
603 .sp
604 .nf
605 .B void pcre2_general_context_free(pcre2_general_context *\fIgcontext\fP);
606 .fi
607 .sp
608 .
609 .
610 .\" HTML <a name="compilecontext"></a>
611 .SS "The compile context"
612 .rs
613 .sp
614 A compile context is required if you want to change the default values of any
615 of the following compile-time parameters:
616 .sp
617 What \eR matches (Unicode newlines or CR, LF, CRLF only)
618 PCRE2's character tables
619 The newline character sequence
620 The compile time nested parentheses limit
621 The maximum length of the pattern string
622 An external function for stack checking
623 .sp
624 A compile context is also required if you are using custom memory management.
625 If none of these apply, just pass NULL as the context argument of
626 \fBpcre2_compile()\fP.
627 .P
628 A compile context is created, copied, and freed by the following functions:
629 .sp
630 .nf
631 .B pcre2_compile_context *pcre2_compile_context_create(
632 .B " pcre2_general_context *\fIgcontext\fP);"
633 .sp
634 .B pcre2_compile_context *pcre2_compile_context_copy(
635 .B " pcre2_compile_context *\fIccontext\fP);"
636 .sp
637 .B void pcre2_compile_context_free(pcre2_compile_context *\fIccontext\fP);
638 .fi
639 .sp
640 A compile context is created with default values for its parameters. These can
641 be changed by calling the following functions, which return 0 on success, or
642 PCRE2_ERROR_BADDATA if invalid data is detected.
643 .sp
644 .nf
645 .B int pcre2_set_bsr(pcre2_compile_context *\fIccontext\fP,
646 .B " uint32_t \fIvalue\fP);"
647 .fi
648 .sp
649 The value must be PCRE2_BSR_ANYCRLF, to specify that \eR matches only CR, LF,
650 or CRLF, or PCRE2_BSR_UNICODE, to specify that \eR matches any Unicode line
651 ending sequence. The value is used by the JIT compiler and by the two
652 interpreted matching functions, \fBpcre2_match()\fP and
653 \fBpcre2_dfa_match()\fP.
654 .sp
655 .nf
656 .B int pcre2_set_character_tables(pcre2_compile_context *\fIccontext\fP,
657 .B " const unsigned char *\fItables\fP);"
658 .fi
659 .sp
660 The value must be the result of a call to \fBpcre2_maketables()\fP, whose only
661 argument is a general context. This function builds a set of character tables
662 in the current locale.
663 .sp
664 .nf
665 .B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP,
666 .B " PCRE2_SIZE \fIvalue\fP);"
667 .fi
668 .sp
669 This sets a maximum length, in code units, for the pattern string that is to be
670 compiled. If the pattern is longer, an error is generated. This facility is
671 provided so that applications that accept patterns from external sources can
672 limit their size. The default is the largest number that a PCRE2_SIZE variable
673 can hold, which is effectively unlimited.
674 .sp
675 .nf
676 .B int pcre2_set_newline(pcre2_compile_context *\fIccontext\fP,
677 .B " uint32_t \fIvalue\fP);"
678 .fi
679 .sp
680 This specifies which characters or character sequences are to be recognized as
681 newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only),
682 PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character
683 sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), or
684 PCRE2_NEWLINE_ANY (any Unicode newline sequence).
685 .P
686 When a pattern is compiled with the PCRE2_EXTENDED option, the value of this
687 parameter affects the recognition of white space and the end of internal
688 comments starting with #. The value is saved with the compiled pattern for
689 subsequent use by the JIT compiler and by the two interpreted matching
690 functions, \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP.
691 .sp
692 .nf
693 .B int pcre2_set_parens_nest_limit(pcre2_compile_context *\fIccontext\fP,
694 .B " uint32_t \fIvalue\fP);"
695 .fi
696 .sp
697 This parameter adjusts the limit, set when PCRE2 is built (default 250), on the
698 depth of parenthesis nesting in a pattern. This limit stops rogue patterns
699 using up too much system stack when being compiled. The limit applies to
700 parentheses of all kinds, not just capturing parentheses.
701 .sp
702 .nf
703 .B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP,
704 .B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);"
705 .fi
706 .sp
707 There is at least one application that runs PCRE2 in threads with very limited
708 system stack, where running out of stack is to be avoided at all costs. The
709 parenthesis limit above cannot take account of how much stack is actually
710 available. For a finer control, you can supply a function that is called
711 whenever \fBpcre2_compile()\fP starts to compile a parenthesized part of a
712 pattern. This function can check the actual stack size (or anything else that
713 it wants to, of course).
714 .P
715 The first argument to the guard function gives the current depth of
716 nesting, and the second is user data that is set up by the last argument of
717 \fBpcre2_set_compile_recursion_guard()\fP. The guard function should return
718 zero if all is well, or non-zero to force an error.
719 .
720 .
721 .\" HTML <a name="matchcontext"></a>
722 .SS "The match context"
723 .rs
724 .sp
725 A match context is required if you want to change the default values of any
726 of the following match-time parameters:
727 .sp
728 A callout function
729 The offset limit for matching an unanchored pattern
730 The limit for calling \fBmatch()\fP (see below)
731 The limit for calling \fBmatch()\fP recursively
732 .sp
733 A match context is also required if you are using custom memory management.
734 If none of these apply, just pass NULL as the context argument of
735 \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or \fBpcre2_jit_match()\fP.
736 .P
737 A match context is created, copied, and freed by the following functions:
738 .sp
739 .nf
740 .B pcre2_match_context *pcre2_match_context_create(
741 .B " pcre2_general_context *\fIgcontext\fP);"
742 .sp
743 .B pcre2_match_context *pcre2_match_context_copy(
744 .B " pcre2_match_context *\fImcontext\fP);"
745 .sp
746 .B void pcre2_match_context_free(pcre2_match_context *\fImcontext\fP);
747 .fi
748 .sp
749 A match context is created with default values for its parameters. These can
750 be changed by calling the following functions, which return 0 on success, or
751 PCRE2_ERROR_BADDATA if invalid data is detected.
752 .sp
753 .nf
754 .B int pcre2_set_callout(pcre2_match_context *\fImcontext\fP,
755 .B " int (*\fIcallout_function\fP)(pcre2_callout_block *, void *),"
756 .B " void *\fIcallout_data\fP);"
757 .fi
758 .sp
759 This sets up a "callout" function, which PCRE2 will call at specified points
760 during a matching operation. Details are given in the
761 .\" HREF
762 \fBpcre2callout\fP
763 .\"
764 documentation.
765 .sp
766 .nf
767 .B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP,
768 .B " PCRE2_SIZE \fIvalue\fP);"
769 .fi
770 .sp
771 The \fIoffset_limit\fP parameter limits how far an unanchored search can
772 advance in the subject string. The default value is PCRE2_UNSET. The
773 \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP functions return
774 PCRE2_ERROR_NOMATCH if a match with a starting point before or at the given
775 offset is not found. For example, if the pattern /abc/ is matched against
776 "123abc" with an offset limit less than 3, the result is PCRE2_ERROR_NOMATCH.
777 A match can never be found if the \fIstartoffset\fP argument of
778 \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP is greater than the offset
779 limit.
780 .P
781 When using this facility, you must set PCRE2_USE_OFFSET_LIMIT when calling
782 \fBpcre2_compile()\fP so that when JIT is in use, different code can be
783 compiled. If a match is started with a non-default offset limit when
784 PCRE2_USE_OFFSET_LIMIT is not set, an error is generated.
785 .P
786 The offset limit facility can be used to track progress when searching large
787 subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
788 start within the first line of the subject. If this is set with an offset
789 limit, a match must occur in the first line and also within the offset limit.
790 In other words, whichever limit comes first is used.
791 .sp
792 .nf
793 .B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
794 .B " uint32_t \fIvalue\fP);"
795 .fi
796 .sp
797 The \fImatch_limit\fP parameter provides a means of preventing PCRE2 from using
798 up too many resources when processing patterns that are not going to match, but
799 which have a very large number of possibilities in their search trees. The
800 classic example is a pattern that uses nested unlimited repeats.
801 .P
802 Internally, \fBpcre2_match()\fP uses a function called \fBmatch()\fP, which it
803 calls repeatedly (sometimes recursively). The limit set by \fImatch_limit\fP is
804 imposed on the number of times this function is called during a match, which
805 has the effect of limiting the amount of backtracking that can take place. For
806 patterns that are not anchored, the count restarts from zero for each position
807 in the subject string. This limit is not relevant to \fBpcre2_dfa_match()\fP,
808 which ignores it.
809 .P
810 When \fBpcre2_match()\fP is called with a pattern that was successfully
811 processed by \fBpcre2_jit_compile()\fP, the way in which matching is executed
812 is entirely different. However, there is still the possibility of runaway
813 matching that goes on for a very long time, and so the \fImatch_limit\fP value
814 is also used in this case (but in a different way) to limit how long the
815 matching can continue.
816 .P
817 The default value for the limit can be set when PCRE2 is built; the default
818 default is 10 million, which handles all but the most extreme cases. If the
819 limit is exceeded, \fBpcre2_match()\fP returns PCRE2_ERROR_MATCHLIMIT. A value
820 for the match limit may also be supplied by an item at the start of a pattern
821 of the form
822 .sp
823 (*LIMIT_MATCH=ddd)
824 .sp
825 where ddd is a decimal number. However, such a setting is ignored unless ddd is
826 less than the limit set by the caller of \fBpcre2_match()\fP or, if no such
827 limit is set, less than the default.
828 .sp
829 .nf
830 .B int pcre2_set_recursion_limit(pcre2_match_context *\fImcontext\fP,
831 .B " uint32_t \fIvalue\fP);"
832 .fi
833 .sp
834 The \fIrecursion_limit\fP parameter is similar to \fImatch_limit\fP, but
835 instead of limiting the total number of times that \fBmatch()\fP is called, it
836 limits the depth of recursion. The recursion depth is a smaller number than the
837 total number of calls, because not all calls to \fBmatch()\fP are recursive.
838 This limit is of use only if it is set smaller than \fImatch_limit\fP.
839 .P
840 Limiting the recursion depth limits the amount of system stack that can be
841 used, or, when PCRE2 has been compiled to use memory on the heap instead of the
842 stack, the amount of heap memory that can be used. This limit is not relevant,
843 and is ignored, when matching is done using JIT compiled code. However, it is
844 supported by \fBpcre2_dfa_match()\fP, which uses recursive function calls less
845 frequently than \fBpcre2_match()\fP, but which can be caused to use a lot of
846 stack by a recursive pattern such as /(.)(?1)/ matched to a very long string.
847 .P
848 The default value for \fIrecursion_limit\fP can be set when PCRE2 is built; the
849 default default is the same value as the default for \fImatch_limit\fP. If the
850 limit is exceeded, \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP return
851 PCRE2_ERROR_RECURSIONLIMIT. A value for the recursion limit may also be
852 supplied by an item at the start of a pattern of the form
853 .sp
854 (*LIMIT_RECURSION=ddd)
855 .sp
856 where ddd is a decimal number. However, such a setting is ignored unless ddd is
857 less than the limit set by the caller of \fBpcre2_match()\fP or
858 \fBpcre2_dfa_match()\fP or, if no such limit is set, less than the default.
859 .sp
860 .nf
861 .B int pcre2_set_recursion_memory_management(
862 .B " pcre2_match_context *\fImcontext\fP,"
863 .B " void *(*\fIprivate_malloc\fP)(PCRE2_SIZE, void *),"
864 .B " void (*\fIprivate_free\fP)(void *, void *), void *\fImemory_data\fP);"
865 .fi
866 .sp
867 This function sets up two additional custom memory management functions for use
868 by \fBpcre2_match()\fP when PCRE2 is compiled to use the heap for remembering
869 backtracking data, instead of recursive function calls that use the system
870 stack. There is a discussion about PCRE2's stack usage in the
871 .\" HREF
872 \fBpcre2stack\fP
873 .\"
874 documentation. See the
875 .\" HREF
876 \fBpcre2build\fP
877 .\"
878 documentation for details of how to build PCRE2.
879 .P
880 Using the heap for recursion is a non-standard way of building PCRE2, for use
881 in environments that have limited stacks. Because of the greater use of memory
882 management, \fBpcre2_match()\fP runs more slowly. Functions that are different
883 to the general custom memory functions are provided so that special-purpose
884 external code can be used for this case, because the memory blocks are all the
885 same size. The blocks are retained by \fBpcre2_match()\fP until it is about to
886 exit so that they can be re-used when possible during the match. In the absence
887 of these functions, the normal custom memory management functions are used, if
888 supplied, otherwise the system functions.
889 .
890 .
892 .rs
893 .sp
894 .B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP);
895 .P
896 The function \fBpcre2_config()\fP makes it possible for a PCRE2 client to
897 discover which optional features have been compiled into the PCRE2 library. The
898 .\" HREF
899 \fBpcre2build\fP
900 .\"
901 documentation has more details about these optional features.
902 .P
903 The first argument for \fBpcre2_config()\fP specifies which information is
904 required. The second argument is a pointer to memory into which the information
905 is placed. If NULL is passed, the function returns the amount of memory that is
906 needed for the requested information. For calls that return numerical values,
907 the value is in bytes; when requesting these values, \fIwhere\fP should point
908 to appropriately aligned memory. For calls that return strings, the required
909 length is given in code units, not counting the terminating zero.
910 .P
911 When requesting information, the returned value from \fBpcre2_config()\fP is
912 non-negative on success, or the negative error code PCRE2_ERROR_BADOPTION if
913 the value in the first argument is not recognized. The following information is
914 available:
915 .sp
917 .sp
918 The output is a uint32_t integer whose value indicates what character
919 sequences the \eR escape sequence matches by default. A value of
920 PCRE2_BSR_UNICODE means that \eR matches any Unicode line ending sequence; a
921 value of PCRE2_BSR_ANYCRLF means that \eR matches only CR, LF, or CRLF. The
922 default can be overridden when a pattern is compiled.
923 .sp
925 .sp
926 The output is a uint32_t integer that is set to one if support for just-in-time
927 compiling is available; otherwise it is set to zero.
928 .sp
930 .sp
931 The \fIwhere\fP argument should point to a buffer that is at least 48 code
932 units long. (The exact length required can be found by calling
933 \fBpcre2_config()\fP with \fIwhere\fP set to NULL.) The buffer is filled with a
934 string that contains the name of the architecture for which the JIT compiler is
935 configured, for example "x86 32bit (little endian + unaligned)". If JIT support
936 is not available, PCRE2_ERROR_BADOPTION is returned, otherwise the number of
937 code units used is returned. This is the length of the string, plus one unit
938 for the terminating zero.
939 .sp
941 .sp
942 The output is a uint32_t integer that contains the number of bytes used for
943 internal linkage in compiled regular expressions. When PCRE2 is configured, the
944 value can be set to 2, 3, or 4, with the default being 2. This is the value
945 that is returned by \fBpcre2_config()\fP. However, when the 16-bit library is
946 compiled, a value of 3 is rounded up to 4, and when the 32-bit library is
947 compiled, internal linkages always use 4 bytes, so the configured value is not
948 relevant.
949 .P
950 The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all
951 but the most massive patterns, since it allows the size of the compiled pattern
952 to be up to 64K code units. Larger values allow larger regular expressions to
953 be compiled by those two libraries, but at the expense of slower matching.
954 .sp
956 .sp
957 The output is a uint32_t integer that gives the default limit for the number of
958 internal matching function calls in a \fBpcre2_match()\fP execution. Further
959 details are given with \fBpcre2_match()\fP below.
960 .sp
962 .sp
963 The output is a uint32_t integer whose value specifies the default character
964 sequence that is recognized as meaning "newline". The values are:
965 .sp
966 PCRE2_NEWLINE_CR Carriage return (CR)
967 PCRE2_NEWLINE_LF Linefeed (LF)
968 PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF)
969 PCRE2_NEWLINE_ANY Any Unicode line ending
971 .sp
972 The default should normally correspond to the standard sequence for your
973 operating system.
974 .sp
976 .sp
977 The output is a uint32_t integer that gives the maximum depth of nesting
978 of parentheses (of any kind) in a pattern. This limit is imposed to cap the
979 amount of system stack used when a pattern is compiled. It is specified when
980 PCRE2 is built; the default is 250. This limit does not take into account the
981 stack that may already be used by the calling application. For finer control
982 over compilation stack usage, see \fBpcre2_set_compile_recursion_guard()\fP.
983 .sp
985 .sp
986 The output is a uint32_t integer that gives the default limit for the depth of
987 recursion when calling the internal matching function in a \fBpcre2_match()\fP
988 execution. Further details are given with \fBpcre2_match()\fP below.
989 .sp
991 .sp
992 The output is a uint32_t integer that is set to one if internal recursion when
993 running \fBpcre2_match()\fP is implemented by recursive function calls that use
994 the system stack to remember their state. This is the usual way that PCRE2 is
995 compiled. The output is zero if PCRE2 was compiled to use blocks of data on the
996 heap instead of recursive function calls.
997 .sp
999 .sp
1000 The \fIwhere\fP argument should point to a buffer that is at least 24 code
1001 units long. (The exact length required can be found by calling
1002 \fBpcre2_config()\fP with \fIwhere\fP set to NULL.) If PCRE2 has been compiled
1003 without Unicode support, the buffer is filled with the text "Unicode not
1004 supported". Otherwise, the Unicode version string (for example, "8.0.0") is
1005 inserted. The number of code units used is returned. This is the length of the
1006 string plus one unit for the terminating zero.
1007 .sp
1009 .sp
1010 The output is a uint32_t integer that is set to one if Unicode support is
1011 available; otherwise it is set to zero. Unicode support implies UTF support.
1012 .sp
1014 .sp
1015 The \fIwhere\fP argument should point to a buffer that is at least 12 code
1016 units long. (The exact length required can be found by calling
1017 \fBpcre2_config()\fP with \fIwhere\fP set to NULL.) The buffer is filled with
1018 the PCRE2 version string, zero-terminated. The number of code units used is
1019 returned. This is the length of the string plus one unit for the terminating
1020 zero.
1021 .
1022 .
1023 .\" HTML <a name="compiling"></a>
1025 .rs
1026 .sp
1027 .nf
1028 .B pcre2_code *pcre2_compile(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP,
1029 .B " uint32_t \fIoptions\fP, int *\fIerrorcode\fP, PCRE2_SIZE *\fIerroroffset,\fP"
1030 .B " pcre2_compile_context *\fIccontext\fP);"
1031 .sp
1032 .B void pcre2_code_free(pcre2_code *\fIcode\fP);
1033 .sp
1034 .B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP);
1035 .sp
1036 .B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP);
1037 .fi
1038 .P
1039 The \fBpcre2_compile()\fP function compiles a pattern into an internal form.
1040 The pattern is defined by a pointer to a string of code units and a length. If
1041 the pattern is zero-terminated, the length can be specified as
1042 PCRE2_ZERO_TERMINATED. The function returns a pointer to a block of memory that
1043 contains the compiled pattern and related data, or NULL if an error occurred.
1044 .P
1045 If the compile context argument \fIccontext\fP is NULL, memory for the compiled
1046 pattern is obtained by calling \fBmalloc()\fP. Otherwise, it is obtained from
1047 the same memory function that was used for the compile context. The caller must
1048 free the memory by calling \fBpcre2_code_free()\fP when it is no longer needed.
1049 .P
1050 The function \fBpcre2_code_copy()\fP makes a copy of the compiled code in new
1051 memory, using the same memory allocator as was used for the original. However,
1052 if the code has been processed by the JIT compiler (see
1053 .\" HTML <a href="#jitcompiling">
1054 .\" </a>
1055 below),
1056 .\"
1057 the JIT information cannot be copied (because it is position-dependent).
1058 The new copy can initially be used only for non-JIT matching, though it can be
1059 passed to \fBpcre2_jit_compile()\fP if required.
1060 .P
1061 The \fBpcre2_code_copy()\fP function provides a way for individual threads in a
1062 multithreaded application to acquire a private copy of shared compiled code.
1063 However, it does not make a copy of the character tables used by the compiled
1064 pattern; the new pattern code points to the same tables as the original code.
1065 (See
1066 .\" HTML <a href="#localesupport">
1067 .\" </a>
1068 "Locale Support"
1069 .\"
1070 below for details of these character tables.) In many applications the same
1071 tables are used throughout, so this behaviour is appropriate. Nevertheless,
1072 there are occasions when a copy of a compiled pattern and the relevant tables
1073 are needed. The \fBpcre2_code_copy_with_tables()\fP function provides this facility.
1074 Copies of both the code and the tables are made, with the new code pointing to
1075 the new tables. The memory for the new tables is automatically freed when
1076 \fBpcre2_code_free()\fP is called for the new copy of the compiled code.
1077 .P
1078 NOTE: When one of the matching functions is called, pointers to the compiled
1079 pattern and the subject string are set in the match data block so that they can
1080 be referenced by the substring extraction functions. After running a match, you
1081 must not free a compiled pattern (or a subject string) until after all
1082 operations on the
1083 .\" HTML <a href="#matchdatablock">
1084 .\" </a>
1085 match data block
1086 .\"
1087 have taken place.
1088 .P
1089 The \fIoptions\fP argument for \fBpcre2_compile()\fP contains various bit
1090 settings that affect the compilation. It should be zero if no options are
1091 required. The available options are described below. Some of them (in
1092 particular, those that are compatible with Perl, but some others as well) can
1093 also be set and unset from within the pattern (see the detailed description in
1094 the
1095 .\" HREF
1096 \fBpcre2pattern\fP
1097 .\"
1098 documentation).
1099 .P
1100 For those options that can be different in different parts of the pattern, the
1101 contents of the \fIoptions\fP argument specify their settings at the start of
1102 compilation. The PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK options can be set at
1103 the time of matching as well as at compile time.
1104 .P
1105 Other, less frequently required compile-time parameters (for example, the
1106 newline setting) can be provided in a compile context (as described
1107 .\" HTML <a href="#compilecontext">
1108 .\" </a>
1109 above).
1110 .\"
1111 .P
1112 If \fIerrorcode\fP or \fIerroroffset\fP is NULL, \fBpcre2_compile()\fP returns
1113 NULL immediately. Otherwise, the variables to which these point are set to an
1114 error code and an offset (number of code units) within the pattern,
1115 respectively, when \fBpcre2_compile()\fP returns NULL because a compilation
1116 error has occurred. The values are not defined when compilation is successful
1117 and \fBpcre2_compile()\fP returns a non-NULL value.
1118 .P
1119 The value returned in \fIerroroffset\fP is an indication of where in the
1120 pattern the error occurred. It is not necessarily the furthest point in the
1121 pattern that was read. For example, after the error "lookbehind assertion is
1122 not fixed length", the error offset points to the start of the failing
1123 assertion.
1124 .P
1125 The \fBpcre2_get_error_message()\fP function (see "Obtaining a textual error
1126 message"
1127 .\" HTML <a href="#geterrormessage">
1128 .\" </a>
1129 below)
1130 .\"
1131 provides a textual message for each error code. Compilation errors have
1132 positive error codes; UTF formatting error codes are negative. For an invalid
1133 UTF-8 or UTF-16 string, the offset is that of the first code unit of the
1134 failing character.
1135 .P
1136 Some errors are not detected until the whole pattern has been scanned; in these
1137 cases, the offset passed back is the length of the pattern. Note that the
1138 offset is in code units, not characters, even in a UTF mode. It may sometimes
1139 point into the middle of a UTF-8 or UTF-16 character.
1140 .P
1141 This code fragment shows a typical straightforward call to
1142 \fBpcre2_compile()\fP:
1143 .sp
1144 pcre2_code *re;
1145 PCRE2_SIZE erroffset;
1146 int errorcode;
1147 re = pcre2_compile(
1148 "^A.*Z", /* the pattern */
1149 PCRE2_ZERO_TERMINATED, /* the pattern is zero-terminated */
1150 0, /* default options */
1151 &errorcode, /* for error code */
1152 &erroffset, /* for error offset */
1153 NULL); /* no compile context */
1154 .sp
1155 The following names for option bits are defined in the \fBpcre2.h\fP header
1156 file:
1157 .sp
1159 .sp
1160 If this bit is set, the pattern is forced to be "anchored", that is, it is
1161 constrained to match only at the first matching point in the string that is
1162 being searched (the "subject string"). This effect can also be achieved by
1163 appropriate constructs in the pattern itself, which is the only way to do it in
1164 Perl.
1165 .sp
1167 .sp
1168 By default, for compatibility with Perl, a closing square bracket that
1169 immediately follows an opening one is treated as a data character for the
1170 class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the class, which
1171 therefore contains no characters and so can never match.
1172 .sp
1174 .sp
1175 This option requests alternative handling of three escape sequences, which
1176 makes PCRE2's behaviour more like ECMAscript (aka JavaScript). When it is set:
1177 .P
1178 (1) \eU matches an upper case "U" character; by default \eU causes a compile
1179 time error (Perl uses \eU to upper case subsequent characters).
1180 .P
1181 (2) \eu matches a lower case "u" character unless it is followed by four
1182 hexadecimal digits, in which case the hexadecimal number defines the code point
1183 to match. By default, \eu causes a compile time error (Perl uses it to upper
1184 case the following character).
1185 .P
1186 (3) \ex matches a lower case "x" character unless it is followed by two
1187 hexadecimal digits, in which case the hexadecimal number defines the code point
1188 to match. By default, as in Perl, a hexadecimal number is always expected after
1189 \ex, but it may have zero, one, or two digits (so, for example, \exz matches a
1190 binary zero character followed by z).
1191 .sp
1193 .sp
1194 In multiline mode (when PCRE2_MULTILINE is set), the circumflex metacharacter
1195 matches at the start of the subject (unless PCRE2_NOTBOL is set), and also
1196 after any internal newline. However, it does not match after a newline at the
1197 end of the subject, for compatibility with Perl. If you want a multiline
1198 circumflex also to match after a terminating newline, you must set
1199 PCRE2_ALT_CIRCUMFLEX.
1200 .sp
1202 .sp
1203 By default, for compatibility with Perl, the name in any verb sequence such as
1204 (*MARK:NAME) is any sequence of characters that does not include a closing
1205 parenthesis. The name is not processed in any way, and it is not possible to
1206 include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
1207 option is set, normal backslash processing is applied to verb names and only an
1208 unescaped closing parenthesis terminates the name. A closing parenthesis can be
1209 included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED
1210 option is set, unescaped whitespace in verb names is skipped and #-comments are
1211 recognized, exactly as in the rest of the pattern.
1212 .sp
1214 .sp
1215 If this bit is set, \fBpcre2_compile()\fP automatically inserts callout items,
1216 all with number 255, before each pattern item, except immediately before or
1217 after a callout in the pattern. For discussion of the callout facility, see the
1218 .\" HREF
1219 \fBpcre2callout\fP
1220 .\"
1221 documentation.
1222 .sp
1224 .sp
1225 If this bit is set, letters in the pattern match both upper and lower case
1226 letters in the subject. It is equivalent to Perl's /i option, and it can be
1227 changed within a pattern by a (?i) option setting.
1228 .sp
1230 .sp
1231 If this bit is set, a dollar metacharacter in the pattern matches only at the
1232 end of the subject string. Without this option, a dollar also matches
1233 immediately before a newline at the end of the string (but not before any other
1234 newlines). The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is
1235 set. There is no equivalent to this option in Perl, and no way to set it within
1236 a pattern.
1237 .sp
1239 .sp
1240 If this bit is set, a dot metacharacter in the pattern matches any character,
1241 including one that indicates a newline. However, it only ever matches one
1242 character, even if newlines are coded as CRLF. Without this option, a dot does
1243 not match when the current position in the subject is at a newline. This option
1244 is equivalent to Perl's /s option, and it can be changed within a pattern by a
1245 (?s) option setting. A negative class such as [^a] always matches newline
1246 characters, independent of the setting of this option.
1247 .sp
1249 .sp
1250 If this bit is set, names used to identify capturing subpatterns need not be
1251 unique. This can be helpful for certain types of pattern when it is known that
1252 only one instance of the named subpattern can ever be matched. There are more
1253 details of named subpatterns below; see also the
1254 .\" HREF
1255 \fBpcre2pattern\fP
1256 .\"
1257 documentation.
1258 .sp
1260 .sp
1261 If this bit is set, most white space characters in the pattern are totally
1262 ignored except when escaped or inside a character class. However, white space
1263 is not allowed within sequences such as (?> that introduce various
1264 parenthesized subpatterns, nor within numerical quantifiers such as {1,3}.
1265 Ignorable white space is permitted between an item and a following quantifier
1266 and between a quantifier and a following + that indicates possessiveness.
1267 .P
1268 PCRE2_EXTENDED also causes characters between an unescaped # outside a
1269 character class and the next newline, inclusive, to be ignored, which makes it
1270 possible to include comments inside complicated patterns. Note that the end of
1271 this type of comment is a literal newline sequence in the pattern; escape
1272 sequences that happen to represent a newline do not count. PCRE2_EXTENDED is
1273 equivalent to Perl's /x option, and it can be changed within a pattern by a
1274 (?x) option setting.
1275 .P
1276 Which characters are interpreted as newlines can be specified by a setting in
1277 the compile context that is passed to \fBpcre2_compile()\fP or by a special
1278 sequence at the start of the pattern, as described in the section entitled
1279 .\" HTML <a href="pcre2pattern.html#newlines">
1280 .\" </a>
1281 "Newline conventions"
1282 .\"
1283 in the \fBpcre2pattern\fP documentation. A default is defined when PCRE2 is
1284 built.
1285 .sp
1287 .sp
1288 If this option is set, an unanchored pattern is required to match before or at
1289 the first newline in the subject string, though the matched text may continue
1290 over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
1291 general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a
1292 match must occur in the first line and also within the offset limit. In other
1293 words, whichever limit comes first is used.
1294 .sp
1296 .sp
1297 If this option is set, a back reference to an unset subpattern group matches an
1298 empty string (by default this causes the current matching alternative to fail).
1299 A pattern such as (\e1)(a) succeeds when this option is set (assuming it can
1300 find an "a" in the subject), whereas it fails by default, for Perl
1301 compatibility. Setting this option makes PCRE2 behave more like ECMAscript (aka
1302 JavaScript).
1303 .sp
1305 .sp
1306 By default, for the purposes of matching "start of line" and "end of line",
1307 PCRE2 treats the subject string as consisting of a single line of characters,
1308 even if it actually contains newlines. The "start of line" metacharacter (^)
1309 matches only at the start of the string, and the "end of line" metacharacter
1310 ($) matches only at the end of the string, or before a terminating newline
1311 (except when PCRE2_DOLLAR_ENDONLY is set). Note, however, that unless
1312 PCRE2_DOTALL is set, the "any character" metacharacter (.) does not match at a
1313 newline. This behaviour (for ^, $, and dot) is the same as Perl.
1314 .P
1315 When PCRE2_MULTILINE is set, the "start of line" and "end of line"
1316 constructs match immediately following or immediately before internal newlines
1317 in the subject string, respectively, as well as at the very start and end. This
1318 is equivalent to Perl's /m option, and it can be changed within a pattern by a
1319 (?m) option setting. Note that the "start of line" metacharacter does not match
1320 after a newline at the end of the subject, for compatibility with Perl.
1321 However, you can change this by setting the PCRE2_ALT_CIRCUMFLEX option. If
1322 there are no newlines in a subject string, or no occurrences of ^ or $ in a
1323 pattern, setting PCRE2_MULTILINE has no effect.
1324 .sp
1326 .sp
1327 This option locks out the use of \eC in the pattern that is being compiled.
1328 This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
1329 it may leave the current matching point in the middle of a multi-code-unit
1330 character. This option may be useful in applications that process patterns from
1331 external sources. Note that there is also a build-time option that permanently
1332 locks out the use of \eC.
1333 .sp
1335 .sp
1336 This option locks out the use of Unicode properties for handling \eB, \eb, \eD,
1337 \ed, \eS, \es, \eW, \ew, and some of the POSIX character classes, as described
1338 for the PCRE2_UCP option below. In particular, it prevents the creator of the
1339 pattern from enabling this facility by starting the pattern with (*UCP). This
1340 option may be useful in applications that process patterns from external
1341 sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error.
1342 .sp
1344 .sp
1345 This option locks out interpretation of the pattern as UTF-8, UTF-16, or
1346 UTF-32, depending on which library is in use. In particular, it prevents the
1347 creator of the pattern from switching to UTF interpretation by starting the
1348 pattern with (*UTF). This option may be useful in applications that process
1349 patterns from external sources. The combination of PCRE2_UTF and
1350 PCRE2_NEVER_UTF causes an error.
1351 .sp
1353 .sp
1354 If this option is set, it disables the use of numbered capturing parentheses in
1355 the pattern. Any opening parenthesis that is not followed by ? behaves as if it
1356 were followed by ?: but named parentheses can still be used for capturing (and
1357 they acquire numbers in the usual way). There is no equivalent of this option
1358 in Perl. Note that, if this option is set, references to capturing groups (back
1359 references or recursion/subroutine calls) may only refer to named groups,
1360 though the reference can be by name or by number.
1361 .sp
1363 .sp
1364 If this option is set, it disables "auto-possessification", which is an
1365 optimization that, for example, turns a+b into a++b in order to avoid
1366 backtracks into a+ that can never be successful. However, if callouts are in
1367 use, auto-possessification means that some callouts are never taken. You can
1368 set this option if you want the matching functions to do a full unoptimized
1369 search and run all the callouts, but it is mainly provided for testing
1370 purposes.
1371 .sp
1373 .sp
1374 If this option is set, it disables an optimization that is applied when .* is
1375 the first significant item in a top-level branch of a pattern, and all the
1376 other branches also start with .* or with \eA or \eG or ^. The optimization is
1377 automatically disabled for .* if it is inside an atomic group or a capturing
1378 group that is the subject of a back reference, or if the pattern contains
1379 (*PRUNE) or (*SKIP). When the optimization is not disabled, such a pattern is
1380 automatically anchored if PCRE2_DOTALL is set for all the .* items and
1381 PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match
1382 must start either at the start of the subject or following a newline is
1383 remembered. Like other optimizations, this can cause callouts to be skipped.
1384 .sp
1386 .sp
1387 This is an option whose main effect is at matching time. It does not change
1388 what \fBpcre2_compile()\fP generates, but it does affect the output of the JIT
1389 compiler.
1390 .P
1391 There are a number of optimizations that may occur at the start of a match, in
1392 order to speed up the process. For example, if it is known that an unanchored
1393 match must start with a specific character, the matching code searches the
1394 subject for that character, and fails immediately if it cannot find it, without
1395 actually running the main matching function. This means that a special item
1396 such as (*COMMIT) at the start of a pattern is not considered until after a
1397 suitable starting point for the match has been found. Also, when callouts or
1398 (*MARK) items are in use, these "start-up" optimizations can cause them to be
1399 skipped if the pattern is never actually used. The start-up optimizations are
1400 in effect a pre-scan of the subject that takes place before the pattern is run.
1401 .P
1402 The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
1403 possibly causing performance to suffer, but ensuring that in cases where the
1404 result is "no match", the callouts do occur, and that items such as (*COMMIT)
1405 and (*MARK) are considered at every possible starting position in the subject
1406 string.
1407 .P
1408 Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching operation.
1409 Consider the pattern
1410 .sp
1412 .sp
1413 When this is compiled, PCRE2 records the fact that a match must start with the
1414 character "A". Suppose the subject string is "DEFABC". The start-up
1415 optimization scans along the subject, finds "A" and runs the first match
1416 attempt from there. The (*COMMIT) item means that the pattern must match the
1417 current starting position, which in this case, it does. However, if the same
1418 match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the
1419 subject string does not happen. The first match attempt is run starting from
1420 "D" and when this fails, (*COMMIT) prevents any further matches being tried, so
1421 the overall result is "no match". There are also other start-up optimizations.
1422 For example, a minimum length for the subject may be recorded. Consider the
1423 pattern
1424 .sp
1425 (*MARK:A)(X|Y)
1426 .sp
1427 The minimum length for a match is one character. If the subject is "ABC", there
1428 will be attempts to match "ABC", "BC", and "C". An attempt to match an empty
1429 string at the end of the subject does not take place, because PCRE2 knows that
1430 the subject is now too short, and so the (*MARK) is never encountered. In this
1431 case, the optimization does not affect the overall match result, which is still
1432 "no match", but it does affect the auxiliary information that is returned.
1433 .sp
1435 .sp
1436 When PCRE2_UTF is set, the validity of the pattern as a UTF string is
1437 automatically checked. There are discussions about the validity of
1438 .\" HTML <a href="pcre2unicode.html#utf8strings">
1439 .\" </a>
1440 UTF-8 strings,
1441 .\"
1442 .\" HTML <a href="pcre2unicode.html#utf16strings">
1443 .\" </a>
1444 UTF-16 strings,
1445 .\"
1446 and
1447 .\" HTML <a href="pcre2unicode.html#utf32strings">
1448 .\" </a>
1449 UTF-32 strings
1450 .\"
1451 in the
1452 .\" HREF
1453 \fBpcre2unicode\fP
1454 .\"
1455 document.
1456 If an invalid UTF sequence is found, \fBpcre2_compile()\fP returns a negative
1457 error code.
1458 .P
1459 If you know that your pattern is valid, and you want to skip this check for
1460 performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it is set,
1461 the effect of passing an invalid UTF string as a pattern is undefined. It may
1462 cause your program to crash or loop. Note that this option can also be passed
1463 to \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP, to suppress validity
1464 checking of the subject string.
1465 .sp
1466 PCRE2_UCP
1467 .sp
1468 This option changes the way PCRE2 processes \eB, \eb, \eD, \ed, \eS, \es, \eW,
1469 \ew, and some of the POSIX character classes. By default, only ASCII characters
1470 are recognized, but if PCRE2_UCP is set, Unicode properties are used instead to
1471 classify characters. More details are given in the section on
1472 .\" HTML <a href="pcre2pattern.html#genericchartypes">
1473 .\" </a>
1474 generic character types
1475 .\"
1476 in the
1477 .\" HREF
1478 \fBpcre2pattern\fP
1479 .\"
1480 page. If you set PCRE2_UCP, matching one of the items it affects takes much
1481 longer. The option is available only if PCRE2 has been compiled with Unicode
1482 support.
1483 .sp
1485 .sp
1486 This option inverts the "greediness" of the quantifiers so that they are not
1487 greedy by default, but become greedy if followed by "?". It is not compatible
1488 with Perl. It can also be set by a (?U) option setting within the pattern.
1489 .sp
1491 .sp
1492 This option must be set for \fBpcre2_compile()\fP if
1493 \fBpcre2_set_offset_limit()\fP is going to be used to set a non-default offset
1494 limit in a match context for matches that use this pattern. An error is
1495 generated if an offset limit is set without this option. For more details, see
1496 the description of \fBpcre2_set_offset_limit()\fP in the
1497 .\" HTML <a href="#matchcontext">
1498 .\" </a>
1499 section
1500 .\"
1501 that describes match contexts. See also the PCRE2_FIRSTLINE
1502 option above.
1503 .sp
1504 PCRE2_UTF
1505 .sp
1506 This option causes PCRE2 to regard both the pattern and the subject strings
1507 that are subsequently processed as strings of UTF characters instead of
1508 single-code-unit strings. It is available when PCRE2 is built to include
1509 Unicode support (which is the default). If Unicode support is not available,
1510 the use of this option provokes an error. Details of how this option changes
1511 the behaviour of PCRE2 are given in the
1512 .\" HREF
1513 \fBpcre2unicode\fP
1514 .\"
1515 page.
1516 .
1517 .
1519 .rs
1520 .sp
1521 There are over 80 positive error codes that \fBpcre2_compile()\fP may return
1522 (via \fIerrorcode\fP) if it finds an error in the pattern. There are also some
1523 negative error codes that are used for invalid UTF strings. These are the same
1524 as given by \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP, and are described
1525 in the
1526 .\" HREF
1527 \fBpcre2unicode\fP
1528 .\"
1529 page. The \fBpcre2_get_error_message()\fP function (see "Obtaining a textual
1530 error message"
1531 .\" HTML <a href="#geterrormessage">
1532 .\" </a>
1533 below)
1534 .\"
1535 can be called to obtain a textual error message from any error code.
1536 .
1537 .
1538 .\" HTML <a name="jitcompiling"></a>
1540 .rs
1541 .sp
1542 .nf
1543 .B int pcre2_jit_compile(pcre2_code *\fIcode\fP, uint32_t \fIoptions\fP);
1544 .sp
1545 .B int pcre2_jit_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP,
1546 .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
1547 .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
1548 .B " pcre2_match_context *\fImcontext\fP);"
1549 .sp
1550 .B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP);
1551 .sp
1552 .B pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE \fIstartsize\fP,
1553 .B " PCRE2_SIZE \fImaxsize\fP, pcre2_general_context *\fIgcontext\fP);"
1554 .sp
1555 .B void pcre2_jit_stack_assign(pcre2_match_context *\fImcontext\fP,
1556 .B " pcre2_jit_callback \fIcallback_function\fP, void *\fIcallback_data\fP);"
1557 .sp
1558 .B void pcre2_jit_stack_free(pcre2_jit_stack *\fIjit_stack\fP);
1559 .fi
1560 .P
1561 These functions provide support for JIT compilation, which, if the just-in-time
1562 compiler is available, further processes a compiled pattern into machine code
1563 that executes much faster than the \fBpcre2_match()\fP interpretive matching
1564 function. Full details are given in the
1565 .\" HREF
1566 \fBpcre2jit\fP
1567 .\"
1568 documentation.
1569 .P
1570 JIT compilation is a heavyweight optimization. It can take some time for
1571 patterns to be analyzed, and for one-off matches and simple patterns the
1572 benefit of faster execution might be offset by a much slower compilation time.
1573 Most, but not all patterns can be optimized by the JIT compiler.
1574 .
1575 .
1576 .\" HTML <a name="localesupport"></a>
1578 .rs
1579 .sp
1580 PCRE2 handles caseless matching, and determines whether characters are letters,
1581 digits, or whatever, by reference to a set of tables, indexed by character code
1582 point. This applies only to characters whose code points are less than 256. By
1583 default, higher-valued code points never match escapes such as \ew or \ed.
1584 However, if PCRE2 is built with UTF support, all characters can be tested with
1585 \ep and \eP, or, alternatively, the PCRE2_UCP option can be set when a pattern
1586 is compiled; this causes \ew and friends to use Unicode property support
1587 instead of the built-in tables.
1588 .P
1589 The use of locales with Unicode is discouraged. If you are handling characters
1590 with code points greater than 127, you should either use Unicode support, or
1591 use locales, but not try to mix the two.
1592 .P
1593 PCRE2 contains an internal set of character tables that are used by default.
1594 These are sufficient for many applications. Normally, the internal tables
1595 recognize only ASCII characters. However, when PCRE2 is built, it is possible
1596 to cause the internal tables to be rebuilt in the default "C" locale of the
1597 local system, which may cause them to be different.
1598 .P
1599 The internal tables can be overridden by tables supplied by the application
1600 that calls PCRE2. These may be created in a different locale from the default.
1601 As more and more applications change to using Unicode, the need for this locale
1602 support is expected to die away.
1603 .P
1604 External tables are built by calling the \fBpcre2_maketables()\fP function, in
1605 the relevant locale. The result can be passed to \fBpcre2_compile()\fP as often
1606 as necessary, by creating a compile context and calling
1607 \fBpcre2_set_character_tables()\fP to set the tables pointer therein. For
1608 example, to build and use tables that are appropriate for the French locale
1609 (where accented characters with values greater than 127 are treated as
1610 letters), the following code could be used:
1611 .sp
1612 setlocale(LC_CTYPE, "fr_FR");
1613 tables = pcre2_maketables(NULL);
1614 ccontext = pcre2_compile_context_create(NULL);
1615 pcre2_set_character_tables(ccontext, tables);
1616 re = pcre2_compile(..., ccontext);
1617 .sp
1618 The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
1619 are using Windows, the name for the French locale is "french". It is the
1620 caller's responsibility to ensure that the memory containing the tables remains
1621 available for as long as it is needed.
1622 .P
1623 The pointer that is passed (via the compile context) to \fBpcre2_compile()\fP
1624 is saved with the compiled pattern, and the same tables are used by
1625 \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP. Thus, for any single pattern,
1626 compilation and matching both happen in the same locale, but different patterns
1627 can be processed in different locales.
1628 .
1629 .
1630 .\" HTML <a name="infoaboutpattern"></a>
1632 .rs
1633 .sp
1634 .nf
1635 .B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP);
1636 .fi
1637 .P
1638 The \fBpcre2_pattern_info()\fP function returns general information about a
1639 compiled pattern. For information about callouts, see the
1640 .\" HTML <a href="#infoaboutcallouts">
1641 .\" </a>
1642 next section.
1643 .\"
1644 The first argument for \fBpcre2_pattern_info()\fP is a pointer to the compiled
1645 pattern. The second argument specifies which piece of information is required,
1646 and the third argument is a pointer to a variable to receive the data. If the
1647 third argument is NULL, the first argument is ignored, and the function returns
1648 the size in bytes of the variable that is required for the information
1649 requested. Otherwise, the yield of the function is zero for success, or one of
1650 the following negative numbers:
1651 .sp
1652 PCRE2_ERROR_NULL the argument \fIcode\fP was NULL
1653 PCRE2_ERROR_BADMAGIC the "magic number" was not found
1654 PCRE2_ERROR_BADOPTION the value of \fIwhat\fP was invalid
1655 PCRE2_ERROR_UNSET the requested field is not set
1656 .sp
1657 The "magic number" is placed at the start of each compiled pattern as a simple
1658 check against passing an arbitrary memory pointer. Here is a typical call of
1659 \fBpcre2_pattern_info()\fP, to obtain the length of the compiled pattern:
1660 .sp
1661 int rc;
1662 size_t length;
1663 rc = pcre2_pattern_info(
1664 re, /* result of pcre2_compile() */
1665 PCRE2_INFO_SIZE, /* what is required */
1666 &length); /* where to put the data */
1667 .sp
1668 The possible values for the second argument are defined in \fBpcre2.h\fP, and
1669 are as follows:
1670 .sp
1673 .sp
1674 Return a copy of the pattern's options. The third argument should point to a
1675 \fBuint32_t\fP variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that
1676 were passed to \fBpcre2_compile()\fP, whereas PCRE2_INFO_ALLOPTIONS returns
1677 the compile options as modified by any top-level (*XXX) option settings such as
1678 (*UTF) at the start of the pattern itself.
1679 .P
1680 For example, if the pattern /(*UTF)abc/ is compiled with the PCRE2_EXTENDED
1681 option, the result for PCRE2_INFO_ALLOPTIONS is PCRE2_EXTENDED and PCRE2_UTF.
1682 Option settings such as (?i) that can change within a pattern do not affect the
1683 result of PCRE2_INFO_ALLOPTIONS, even if they appear right at the start of the
1684 pattern. (This was different in some earlier releases.)
1685 .P
1686 A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if
1687 the first significant item in every top-level branch is one of the following:
1688 .sp
1689 ^ unless PCRE2_MULTILINE is set
1690 \eA always
1691 \eG always
1692 .* sometimes - see below
1693 .sp
1694 When .* is the first significant item, anchoring is possible only when all the
1695 following are true:
1696 .sp
1697 .* is not in an atomic group
1698 .\" JOIN
1699 .* is not in a capturing group that is the subject
1700 of a back reference
1701 PCRE2_DOTALL is in force for .*
1702 Neither (*PRUNE) nor (*SKIP) appears in the pattern.
1703 PCRE2_NO_DOTSTAR_ANCHOR is not set.
1704 .sp
1705 For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the
1706 options returned for PCRE2_INFO_ALLOPTIONS.
1707 .sp
1709 .sp
1710 Return the number of the highest back reference in the pattern. The third
1711 argument should point to an \fBuint32_t\fP variable. Named subpatterns acquire
1712 numbers as well as names, and these count towards the highest back reference.
1713 Back references such as \e4 or \eg{12} match the captured characters of the
1714 given group, but in addition, the check that a capturing group is set in a
1715 conditional subpattern such as (?(3)a|b) is also a back reference. Zero is
1716 returned if there are no back references.
1717 .sp
1719 .sp
1720 The output is a uint32_t whose value indicates what character sequences the \eR
1721 escape sequence matches. A value of PCRE2_BSR_UNICODE means that \eR matches
1722 any Unicode line ending sequence; a value of PCRE2_BSR_ANYCRLF means that \eR
1723 matches only CR, LF, or CRLF.
1724 .sp
1726 .sp
1727 Return the highest capturing subpattern number in the pattern. In patterns
1728 where (?| is not used, this is also the total number of capturing subpatterns.
1729 The third argument should point to an \fBuint32_t\fP variable.
1730 .sp
1732 .sp
1733 In the absence of a single first code unit for a non-anchored pattern,
1734 \fBpcre2_compile()\fP may construct a 256-bit table that defines a fixed set of
1735 values for the first code unit in any match. For example, a pattern that starts
1736 with [abc] results in a table with three bits set. When code unit values
1737 greater than 255 are supported, the flag bit for 255 means "any code unit of
1738 value 255 or above". If such a table was constructed, a pointer to it is
1739 returned. Otherwise NULL is returned. The third argument should point to an
1740 \fBconst uint8_t *\fP variable.
1741 .sp
1743 .sp
1744 Return information about the first code unit of any matched string, for a
1745 non-anchored pattern. The third argument should point to an \fBuint32_t\fP
1746 variable. If there is a fixed first value, for example, the letter "c" from a
1747 pattern such as (cat|cow|coyote), 1 is returned, and the character value can be
1748 retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but
1749 it is known that a match can occur only at the start of the subject or
1750 following a newline in the subject, 2 is returned. Otherwise, and for anchored
1751 patterns, 0 is returned.
1752 .sp
1754 .sp
1755 Return the value of the first code unit of any matched string in the situation
1756 where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0. The third
1757 argument should point to an \fBuint32_t\fP variable. In the 8-bit library, the
1758 value is always less than 256. In the 16-bit library the value can be up to
1759 0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff,
1760 and up to 0xffffffff when not using UTF-32 mode.
1761 .sp
1763 .sp
1764 Return 1 if the pattern contains any instances of \eC, otherwise 0. The third
1765 argument should point to an \fBuint32_t\fP variable.
1766 .sp
1768 .sp
1769 Return 1 if the pattern contains any explicit matches for CR or LF characters,
1770 otherwise 0. The third argument should point to an \fBuint32_t\fP variable. An
1771 explicit match is either a literal CR or LF character, or \er or \en.
1772 .sp
1774 .sp
1775 Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
1776 0. The third argument should point to an \fBuint32_t\fP variable. (?J) and
1777 (?-J) set and unset the local PCRE2_DUPNAMES option, respectively.
1778 .sp
1780 .sp
1781 If the compiled pattern was successfully processed by
1782 \fBpcre2_jit_compile()\fP, return the size of the JIT compiled code, otherwise
1783 return zero. The third argument should point to a \fBsize_t\fP variable.
1784 .sp
1786 .sp
1787 Return 1 if there is a rightmost literal code unit that must exist in any
1788 matched string, other than at its start. The third argument should point to an
1789 \fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is
1790 returned, the code unit value itself can be retrieved using
1791 PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
1792 recorded only if it follows something of variable length. For example, for the
1793 pattern /^a\ed+z\ed+/ the returned value is 1 (with "z" returned from
1794 PCRE2_INFO_LASTCODEUNIT), but for /^a\edz\ed/ the returned value is 0.
1795 .sp
1797 .sp
1798 Return the value of the rightmost literal code unit that must exist in any
1799 matched string, other than at its start, if such a value has been recorded. The
1800 third argument should point to an \fBuint32_t\fP variable. If there is no such
1801 value, 0 is returned.
1802 .sp
1804 .sp
1805 Return 1 if the pattern might match an empty string, otherwise 0. The third
1806 argument should point to an \fBuint32_t\fP variable. When a pattern contains
1807 recursive subroutine calls it is not always possible to determine whether or
1808 not it can match an empty string. PCRE2 takes a cautious approach and returns 1
1809 in such cases.
1810 .sp
1812 .sp
1813 If the pattern set a match limit by including an item of the form
1814 (*LIMIT_MATCH=nnnn) at the start, the value is returned. The third argument
1815 should point to an unsigned 32-bit integer. If no such value has been set, the
1816 call to \fBpcre2_pattern_info()\fP returns the error PCRE2_ERROR_UNSET.
1817 .sp
1819 .sp
1820 Return the number of characters (not code units) in the longest lookbehind
1821 assertion in the pattern. The third argument should point to an unsigned 32-bit
1822 integer. This information is useful when doing multi-segment matching using the
1823 partial matching facilities. Note that the simple assertions \eb and \eB
1824 require a one-character lookbehind. \eA also registers a one-character
1825 lookbehind, though it does not actually inspect the previous character. This is
1826 to ensure that at least one character from the old segment is retained when a
1827 new segment is processed. Otherwise, if there are no lookbehinds in the
1828 pattern, \eA might match incorrectly at the start of a new segment.
1829 .sp
1831 .sp
1832 If a minimum length for matching subject strings was computed, its value is
1833 returned. Otherwise the returned value is 0. The value is a number of
1834 characters, which in UTF mode may be different from the number of code units.
1835 The third argument should point to an \fBuint32_t\fP variable. The value is a
1836 lower bound to the length of any matching string. There may not be any strings
1837 of that length that do actually match, but every string that does match is at
1838 least that long.
1839 .sp
1843 .sp
1844 PCRE2 supports the use of named as well as numbered capturing parentheses. The
1845 names are just an additional way of identifying the parentheses, which still
1846 acquire numbers. Several convenience functions such as
1847 \fBpcre2_substring_get_byname()\fP are provided for extracting captured
1848 substrings by name. It is also possible to extract the data directly, by first
1849 converting the name to a number in order to access the correct pointers in the
1850 output vector (described with \fBpcre2_match()\fP below). To do the conversion,
1851 you need to use the name-to-number map, which is described by these three
1852 values.
1853 .P
1854 The map consists of a number of fixed-size entries. PCRE2_INFO_NAMECOUNT gives
1855 the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives the size of each
1856 entry in code units; both of these return a \fBuint32_t\fP value. The entry
1857 size depends on the length of the longest name.
1858 .P
1859 PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table. This is
1860 a PCRE2_SPTR pointer to a block of code units. In the 8-bit library, the first
1861 two bytes of each entry are the number of the capturing parenthesis, most
1862 significant byte first. In the 16-bit library, the pointer points to 16-bit
1863 code units, the first of which contains the parenthesis number. In the 32-bit
1864 library, the pointer points to 32-bit code units, the first of which contains
1865 the parenthesis number. The rest of the entry is the corresponding name, zero
1866 terminated.
1867 .P
1868 The names are in alphabetical order. If (?| is used to create multiple groups
1869 with the same number, as described in the
1870 .\" HTML <a href="pcre2pattern.html#dupsubpatternnumber">
1871 .\" </a>
1872 section on duplicate subpattern numbers
1873 .\"
1874 in the
1875 .\" HREF
1876 \fBpcre2pattern\fP
1877 .\"
1878 page, the groups may be given the same name, but there is only one entry in the
1879 table. Different names for groups of the same number are not permitted.
1880 .P
1881 Duplicate names for subpatterns with different numbers are permitted, but only
1882 if PCRE2_DUPNAMES is set. They appear in the table in the order in which they
1883 were found in the pattern. In the absence of (?| this is the order of
1884 increasing number; when (?| is used this is not necessarily the case because
1885 later subpatterns may have lower numbers.
1886 .P
1887 As a simple example of the name/number table, consider the following pattern
1888 after compilation by the 8-bit library (assume PCRE2_EXTENDED is set, so white
1889 space - including newlines - is ignored):
1890 .sp
1891 .\" JOIN
1892 (?<date> (?<year>(\ed\ed)?\ed\ed) -
1893 (?<month>\ed\ed) - (?<day>\ed\ed) )
1894 .sp
1895 There are four named subpatterns, so the table has four entries, and each entry
1896 in the table is eight bytes long. The table is as follows, with non-printing
1897 bytes shown in hexadecimal, and undefined bytes shown as ??:
1898 .sp
1899 00 01 d a t e 00 ??
1900 00 05 d a y 00 ?? ??
1901 00 04 m o n t h 00
1902 00 02 y e a r 00 ??
1903 .sp
1904 When writing code to extract data from named subpatterns using the
1905 name-to-number map, remember that the length of the entries is likely to be
1906 different for each compiled pattern.
1907 .sp
1909 .sp
1910 The output is a \fBuint32_t\fP with one of the following values:
1911 .sp
1912 PCRE2_NEWLINE_CR Carriage return (CR)
1913 PCRE2_NEWLINE_LF Linefeed (LF)
1914 PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF)
1915 PCRE2_NEWLINE_ANY Any Unicode line ending
1917 .sp
1918 This specifies the default character sequence that will be recognized as
1919 meaning "newline" while matching.
1920 .sp
1922 .sp
1923 If the pattern set a recursion limit by including an item of the form
1924 (*LIMIT_RECURSION=nnnn) at the start, the value is returned. The third
1925 argument should point to an unsigned 32-bit integer. If no such value has been
1926 set, the call to \fBpcre2_pattern_info()\fP returns the error PCRE2_ERROR_UNSET.
1927 .sp
1929 .sp
1930 Return the size of the compiled pattern in bytes (for all three libraries). The
1931 third argument should point to a \fBsize_t\fP variable. This value includes the
1932 size of the general data block that precedes the code units of the compiled
1933 pattern itself. The value that is used when \fBpcre2_compile()\fP is getting
1934 memory in which to place the compiled pattern may be slightly larger than the
1935 value returned by this option, because there are cases where the code that
1936 calculates the size has to over-estimate. Processing a pattern with the JIT
1937 compiler does not alter the value returned by this option.
1938 .
1939 .
1940 .\" HTML <a name="infoaboutcallouts"></a>
1942 .rs
1943 .sp
1944 .nf
1945 .B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
1946 .B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
1947 .B " void *\fIuser_data\fP);"
1948 .fi
1949 .sp
1950 A script language that supports the use of string arguments in callouts might
1951 like to scan all the callouts in a pattern before running the match. This can
1952 be done by calling \fBpcre2_callout_enumerate()\fP. The first argument is a
1953 pointer to a compiled pattern, the second points to a callback function, and
1954 the third is arbitrary user data. The callback function is called for every
1955 callout in the pattern in the order in which they appear. Its first argument is
1956 a pointer to a callout enumeration block, and its second argument is the
1957 \fIuser_data\fP value that was passed to \fBpcre2_callout_enumerate()\fP. The
1958 contents of the callout enumeration block are described in the
1959 .\" HREF
1960 \fBpcre2callout\fP
1961 .\"
1962 documentation, which also gives further details about callouts.
1963 .
1964 .
1966 .rs
1967 .sp
1968 It is possible to save compiled patterns on disc or elsewhere, and reload them
1969 later, subject to a number of restrictions. The functions whose names begin
1970 with \fBpcre2_serialize_\fP are used for this purpose. They are described in
1971 the
1972 .\" HREF
1973 \fBpcre2serialize\fP
1974 .\"
1975 documentation.
1976 .
1977 .
1978 .\" HTML <a name="matchdatablock"></a>
1980 .rs
1981 .sp
1982 .nf
1983 .B pcre2_match_data *pcre2_match_data_create(uint32_t \fIovecsize\fP,
1984 .B " pcre2_general_context *\fIgcontext\fP);"
1985 .sp
1986 .B pcre2_match_data *pcre2_match_data_create_from_pattern(
1987 .B " const pcre2_code *\fIcode\fP, pcre2_general_context *\fIgcontext\fP);"
1988 .sp
1989 .B void pcre2_match_data_free(pcre2_match_data *\fImatch_data\fP);
1990 .fi
1991 .P
1992 Information about a successful or unsuccessful match is placed in a match
1993 data block, which is an opaque structure that is accessed by function calls. In
1994 particular, the match data block contains a vector of offsets into the subject
1995 string that define the matched part of the subject and any substrings that were
1996 captured. This is known as the \fIovector\fP.
1997 .P
1998 Before calling \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or
1999 \fBpcre2_jit_match()\fP you must create a match data block by calling one of
2000 the creation functions above. For \fBpcre2_match_data_create()\fP, the first
2001 argument is the number of pairs of offsets in the \fIovector\fP. One pair of
2002 offsets is required to identify the string that matched the whole pattern, with
2003 another pair for each captured substring. For example, a value of 4 creates
2004 enough space to record the matched portion of the subject plus three captured
2005 substrings. A minimum of 1 pair is imposed by
2006 \fBpcre2_match_data_create()\fP, so it is always possible to return the overall
2007 matched string.
2008 .P
2009 The second argument of \fBpcre2_match_data_create()\fP is a pointer to a
2010 general context, which can specify custom memory management for obtaining the
2011 memory for the match data block. If you are not using custom memory management,
2012 pass NULL, which causes \fBmalloc()\fP to be used.
2013 .P
2014 For \fBpcre2_match_data_create_from_pattern()\fP, the first argument is a
2015 pointer to a compiled pattern. The ovector is created to be exactly the right
2016 size to hold all the substrings a pattern might capture. The second argument is
2017 again a pointer to a general context, but in this case if NULL is passed, the
2018 memory is obtained using the same allocator that was used for the compiled
2019 pattern (custom or default).
2020 .P
2021 A match data block can be used many times, with the same or different compiled
2022 patterns. You can extract information from a match data block after a match
2023 operation has finished, using functions that are described in the sections on
2024 .\" HTML <a href="#matchedstrings">
2025 .\" </a>
2026 matched strings
2027 .\"
2028 and
2029 .\" HTML <a href="#matchotherdata">
2030 .\" </a>
2031 other match data
2032 .\"
2033 below.
2034 .P
2035 When a call of \fBpcre2_match()\fP fails, valid data is available in the match
2036 data block only when the error is PCRE2_ERROR_NOMATCH, PCRE2_ERROR_PARTIAL, or one
2037 of the error codes for an invalid UTF string. Exactly what is available depends
2038 on the error, and is detailed below.
2039 .P
2040 When one of the matching functions is called, pointers to the compiled pattern
2041 and the subject string are set in the match data block so that they can be
2042 referenced by the extraction functions. After running a match, you must not
2043 free a compiled pattern or a subject string until after all operations on the
2044 match data block (for that match) have taken place.
2045 .P
2046 When a match data block itself is no longer needed, it should be freed by
2047 calling \fBpcre2_match_data_free()\fP.
2048 .
2049 .
2051 .rs
2052 .sp
2053 .nf
2054 .B int pcre2_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP,
2055 .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
2056 .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
2057 .B " pcre2_match_context *\fImcontext\fP);"
2058 .fi
2059 .P
2060 The function \fBpcre2_match()\fP is called to match a subject string against a
2061 compiled pattern, which is passed in the \fIcode\fP argument. You can call
2062 \fBpcre2_match()\fP with the same \fIcode\fP argument as many times as you
2063 like, in order to find multiple matches in the subject string or to match
2064 different subject strings with the same pattern.
2065 .P
2066 This function is the main matching facility of the library, and it operates in
2067 a Perl-like manner. For specialist use there is also an alternative matching
2068 function, which is described
2069 .\" HTML <a href="#dfamatch">
2070 .\" </a>
2071 below
2072 .\"
2073 in the section about the \fBpcre2_dfa_match()\fP function.
2074 .P
2075 Here is an example of a simple call to \fBpcre2_match()\fP:
2076 .sp
2077 pcre2_match_data *md = pcre2_match_data_create(4, NULL);
2078 int rc = pcre2_match(
2079 re, /* result of pcre2_compile() */
2080 "some string", /* the subject string */
2081 11, /* the length of the subject string */
2082 0, /* start at offset 0 in the subject */
2083 0, /* default options */
2084 md, /* the match data block */
2085 NULL); /* a match context; NULL means use defaults */
2086 .sp
2087 If the subject string is zero-terminated, the length can be given as
2088 PCRE2_ZERO_TERMINATED. A match context must be provided if certain less common
2089 matching parameters are to be changed. For details, see the section on
2090 .\" HTML <a href="#matchcontext">
2091 .\" </a>
2092 the match context
2093 .\"
2094 above.
2095 .
2096 .
2097 .SS "The string to be matched by \fBpcre2_match()\fP"
2098 .rs
2099 .sp
2100 The subject string is passed to \fBpcre2_match()\fP as a pointer in
2101 \fIsubject\fP, a length in \fIlength\fP, and a starting offset in
2102 \fIstartoffset\fP. The length and offset are in code units, not characters.
2103 That is, they are in bytes for the 8-bit library, 16-bit code units for the
2104 16-bit library, and 32-bit code units for the 32-bit library, whether or not
2105 UTF processing is enabled.
2106 .P
2107 If \fIstartoffset\fP is greater than the length of the subject,
2108 \fBpcre2_match()\fP returns PCRE2_ERROR_BADOFFSET. When the starting offset is
2109 zero, the search for a match starts at the beginning of the subject, and this
2110 is by far the most common case. In UTF-8 or UTF-16 mode, the starting offset
2111 must point to the start of a character, or to the end of the subject (in UTF-32
2112 mode, one code unit equals one character, so all offsets are valid). Like the
2113 pattern string, the subject may contain binary zeroes.
2114 .P
2115 A non-zero starting offset is useful when searching for another match in the
2116 same subject by calling \fBpcre2_match()\fP again after a previous success.
2117 Setting \fIstartoffset\fP differs from passing over a shortened string and
2118 setting PCRE2_NOTBOL in the case of a pattern that begins with any kind of
2119 lookbehind. For example, consider the pattern
2120 .sp
2121 \eBiss\eB
2122 .sp
2123 which finds occurrences of "iss" in the middle of words. (\eB matches only if
2124 the current position in the subject is not a word boundary.) When applied to
2125 the string "Mississippi" the first call to \fBpcre2_match()\fP finds the first
2126 occurrence. If \fBpcre2_match()\fP is called again with just the remainder of
2127 the subject, namely "issippi", it does not match, because \eB is always false at
2128 the start of the subject, which is deemed to be a word boundary. However, if
2129 \fBpcre2_match()\fP is passed the entire string again, but with
2130 \fIstartoffset\fP set to 4, it finds the second occurrence of "iss" because it
2131 is able to look behind the starting point to discover that it is preceded by a
2132 letter.
2133 .P
2134 Finding all the matches in a subject is tricky when the pattern can match an
2135 empty string. It is possible to emulate Perl's /g behaviour by first trying the
2136 match again at the same offset, with the PCRE2_NOTEMPTY_ATSTART and
2137 PCRE2_ANCHORED options, and then if that fails, advancing the starting offset
2138 and trying an ordinary match again. There is some code that demonstrates how to
2139 do this in the
2140 .\" HREF
2141 \fBpcre2demo\fP
2142 .\"
2143 sample program. In the most general case, you have to check to see if the
2144 newline convention recognizes CRLF as a newline, and if so, and the current
2145 character is CR followed by LF, advance the starting offset by two characters
2146 instead of one.
2147 .P
2148 If a non-zero starting offset is passed when the pattern is anchored, one
2149 attempt to match at the given offset is made. This can only succeed if the
2150 pattern does not require the match to be at the start of the subject.
2151 .
2152 .
2153 .\" HTML <a name="matchoptions"></a>
2154 .SS "Option bits for \fBpcre2_match()\fP"
2155 .rs
2156 .sp
2157 The unused bits of the \fIoptions\fP argument for \fBpcre2_match()\fP must be
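2158 zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
2159 PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT,
2160 PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is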
2161 described below.
2162 .P
2163 Setting PCRE2_ANCHORED at match time is not supported by the just-in-time (JIT)
2164 compiler. If it is set, JIT matching is disabled and the normal interpretive
2165 code in \fBpcre2_match()\fP is run. Apart from PCRE2_NO_JIT (obviously), the
2166 remaining options are supported for JIT matching.
2167 .sp
2169 .sp
2170 The PCRE2_ANCHORED option limits \fBpcre2_match()\fP to matching at the first
2171 matching position. If a pattern was compiled with PCRE2_ANCHORED, or turned out
2172 to be anchored by virtue of its contents, it cannot be made unanchored at
2173 matching time. Note that setting the option at match time disables JIT
2174 matching.
2175 .sp
2177 .sp
2178 This option specifies that the first character of the subject string is not the
2179 beginning of a line, so the circumflex metacharacter should not match before
2180 it. Setting this without having set PCRE2_MULTILINE at compile time causes
2181 circumflex never to match. This option affects only the behaviour of the
2182 circumflex metacharacter. It does not affect \eA.
2183 .sp
2185 .sp
2186 This option specifies that the end of the subject string is not the end of a
2187 line, so the dollar metacharacter should not match it nor (except in multiline
2188 mode) a newline immediately before it. Setting this without having set
2189 PCRE2_MULTILINE at compile time causes dollar never to match. This option
2190 affects only the behaviour of the dollar metacharacter. It does not affect \eZ
2191 or \ez.
2192 .sp
2194 .sp
2195 An empty string is not considered to be a valid match if this option is set. If
2196 there are alternatives in the pattern, they are tried. If all the alternatives
2197 match the empty string, the entire match fails. For example, if the pattern
2198 .sp
2199 a?b?
2200 .sp
2201 is applied to a string not beginning with "a" or "b", it matches an empty
2202 string at the start of the subject. With PCRE2_NOTEMPTY set, this match is not
2203 valid, so \fBpcre2_match()\fP searches further into the string for occurrences
2204 of "a" or "b".
2205 .sp
2207 .sp
2208 This is like PCRE2_NOTEMPTY, except that it locks out an empty string match
2209 only at the first matching position, that is, at the start of the subject plus
2210 the starting offset. An empty string match later in the subject is permitted.
2211 If the pattern is anchored, such a match can occur only if the pattern contains
2212 \eK.
2213 .sp
2215 .sp
2216 By default, if a pattern has been successfully processed by
2217 \fBpcre2_jit_compile()\fP, JIT is automatically used when \fBpcre2_match()\fP
2218 is called with options that JIT supports. Setting PCRE2_NO_JIT disables the use
2219 of JIT; it forces matching to be done by the interpreter.
2220 .sp
2222 .sp
2223 When PCRE2_UTF is set at compile time, the validity of the subject as a UTF
2224 string is checked by default when \fBpcre2_match()\fP is subsequently called.
2225 If a non-zero starting offset is given, the check is applied only to that part
2226 of the subject that could be inspected during matching, and there is a check
2227 that the starting offset points to the first code unit of a character or to the
2228 end of the subject. If there are no lookbehind assertions in the pattern, the
2229 check starts at the starting offset. Otherwise, it starts at the length of the
2230 longest lookbehind before the starting offset, or at the start of the subject
2231 if there are not that many characters before the starting offset. Note that the
2232 sequences \eb and \eB are one-character lookbehinds.
2233 .P
2234 The check is carried out before any other processing takes place, and a
2235 negative error code is returned if the check fails. There are several UTF error
2236 codes for each code unit width, corresponding to different problems with the
2237 code unit sequence. There are discussions about the validity of
2238 .\" HTML <a href="pcre2unicode.html#utf8strings">
2239 .\" </a>
2240 UTF-8 strings,
2241 .\"
2242 .\" HTML <a href="pcre2unicode.html#utf16strings">
2243 .\" </a>
2244 UTF-16 strings,
2245 .\"
2246 and
2247 .\" HTML <a href="pcre2unicode.html#utf32strings">
2248 .\" </a>
2249 UTF-32 strings
2250 .\"
2251 in the
2252 .\" HREF
2253 \fBpcre2unicode\fP
2254 .\"
2255 page.
2256 .P
2257 If you know that your subject is valid, and you want to skip these checks for
2258 performance reasons, you can set the PCRE2_NO_UTF_CHECK option when calling
2259 \fBpcre2_match()\fP. You might want to do this for the second and subsequent
2260 calls to \fBpcre2_match()\fP if you are making repeated calls to find all the
2261 matches in a single subject string.
2262 .P
2263 NOTE: When PCRE2_NO_UTF_CHECK is set, the effect of passing an invalid string
2264 as a subject, or an invalid value of \fIstartoffset\fP, is undefined. Your
2265 program may crash or loop indefinitely.
2266 .sp
2269 .sp
2270 These options turn on the partial matching feature. A partial match occurs if
2271 the end of the subject string is reached successfully, but there are not enough
2272 subject characters to complete the match. If this happens when
2273 PCRE2_PARTIAL_SOFT (but not PCRE2_PARTIAL_HARD) is set, matching continues by
2274 testing any remaining alternatives. Only if no complete match can be found is
2275 PCRE2_ERROR_PARTIAL returned instead of PCRE2_ERROR_NOMATCH. In other words,
2276 PCRE2_PARTIAL_SOFT specifies that the caller is prepared to handle a partial
2277 match, but only if no complete match can be found.
2278 .P
2279 If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this case, if
2280 a partial match is found, \fBpcre2_match()\fP immediately returns
2281 PCRE2_ERROR_PARTIAL, without considering any other alternatives. In other
2282 words, when PCRE2_PARTIAL_HARD is set, a partial match is considered to be more
2283 important than an alternative complete match.
2284 .P
2285 There is a more detailed discussion of partial and multi-segment matching, with
2286 examples, in the
2287 .\" HREF
2288 \fBpcre2partial\fP
2289 .\"
2290 documentation.
2291 .
2292 .
2293 .
2295 .rs
2296 .sp
2297 When PCRE2 is built, a default newline convention is set; this is usually the
2298 standard convention for the operating system. The default can be overridden in
2299 a
2300 .\" HTML <a href="#compilecontext">
2301 .\" </a>
2302 compile context
2303 .\"
2304 by calling \fBpcre2_set_newline()\fP. It can also be overridden by starting a
2305 pattern string with, for example, (*CRLF), as described in the
2306 .\" HTML <a href="pcre2pattern.html#newlines">
2307 .\" </a>
2308 section on newline conventions
2309 .\"
2310 in the
2311 .\" HREF
2312 \fBpcre2pattern\fP
2313 .\"
2314 page. During matching, the newline choice affects the behaviour of the dot,
2315 circumflex, and dollar metacharacters. It may also alter the way the match
2316 starting position is advanced after a match failure for an unanchored pattern.
2317 .P
2318 When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is set as
2319 the newline convention, and a match attempt for an unanchored pattern fails
2320 when the current starting position is at a CRLF sequence, and the pattern
2321 contains no explicit matches for CR or LF characters, the match position is
2322 advanced by two characters instead of one, in other words, to after the CRLF.
2323 .P
2324 The above rule is a compromise that makes the most common cases work as
2325 expected. For example, if the pattern is .+A (and the PCRE2_DOTALL option is
2326 not set), it does not match the string "\er\enA" because, after failing at the
2327 start, it skips both the CR and the LF before retrying. However, the pattern
2328 [\er\en]A does match that string, because it contains an explicit CR or LF
2329 reference, and so advances only by one character after the first failure.
2330 .P
2331 An explicit match for CR or LF is either a literal appearance of one of those
2332 characters in the pattern, or one of the \er or \en escape sequences. Implicit
2333 matches such as [^X] do not count, nor does \es, even though it includes CR and
2334 LF in the characters that it matches.
2335 .P
2336 Notwithstanding the above, anomalous effects may still occur when CRLF is a
2337 valid newline sequence and explicit \er or \en escapes appear in the pattern.
2338 .
2339 .
2340 .\" HTML <a name="matchedstrings"></a>
2342 .rs
2343 .sp
2344 .nf
2345 .B uint32_t pcre2_get_ovector_count(pcre2_match_data *\fImatch_data\fP);
2346 .sp
2347 .B PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *\fImatch_data\fP);
2348 .fi
2349 .P
2350 In general, a pattern matches a certain portion of the subject, and in
2351 addition, further substrings from the subject may be picked out by
2352 parenthesized parts of the pattern. Following the usage in Jeffrey Friedl's
2353 book, this is called "capturing" in what follows, and the phrase "capturing
2354 subpattern" or "capturing group" is used for a fragment of a pattern that picks
2355 out a substring. PCRE2 supports several other kinds of parenthesized subpattern
2356 that do not cause substrings to be captured. The \fBpcre2_pattern_info()\fP
2357 function can be used to find out how many capturing subpatterns there are in a
2358 compiled pattern.
2359 .P
2360 You can use auxiliary functions for accessing captured substrings
2361 .\" HTML <a href="#extractbynumber">
2362 .\" </a>
2363 by number
2364 .\"
2365 or
2366 .\" HTML <a href="#extractbyname">
2367 .\" </a>
2368 by name,
2369 .\"
2370 as described in sections below.
2371 .P
2372 Alternatively, you can make direct use of the vector of PCRE2_SIZE values,
2373 called the \fBovector\fP, which contains the offsets of captured strings. It is
2374 part of the
2375 .\" HTML <a href="#matchdatablock">
2376 .\" </a>
2377 match data block.
2378 .\"
2379 The function \fBpcre2_get_ovector_pointer()\fP returns the address of the
2380 ovector, and \fBpcre2_get_ovector_count()\fP returns the number of pairs of
2381 values it contains.
2382 .P
2383 Within the ovector, the first in each pair of values is set to the offset of
2384 the first code unit of a substring, and the second is set to the offset of the
2385 first code unit after the end of a substring. These values are always code unit
2386 offsets, not character offsets. That is, they are byte offsets in the 8-bit
2387 library, 16-bit offsets in the 16-bit library, and 32-bit offsets in the 32-bit
2388 library.
2389 .P
2390 After a partial match (error return PCRE2_ERROR_PARTIAL), only the first pair
2391 of offsets (that is, \fIovector[0]\fP and \fIovector[1]\fP) are set. They
2392 identify the part of the subject that was partially matched. See the
2393 .\" HREF
2394 \fBpcre2partial\fP
2395 .\"
2396 documentation for details of partial matching.
2397 .P
2398 After a successful match, the first pair of offsets identifies the portion of
2399 the subject string that was matched by the entire pattern. The next pair is
2400 used for the first capturing subpattern, and so on. The value returned by
2401 \fBpcre2_match()\fP is one more than the highest numbered pair that has been
2402 set. For example, if two substrings have been captured, the returned value is
2403 3. If there are no capturing subpatterns, the return value from a successful
2404 match is 1, indicating that just the first pair of offsets has been set.
2405 .P
2406 If a pattern uses the \eK escape sequence within a positive assertion, the
2407 reported start of a successful match can be greater than the end of the match.
2408 For example, if the pattern (?=ab\eK) is matched against "ab", the start and
2409 end offset values for the match are 2 and 0.
2410 .P
2411 If a capturing subpattern group is matched repeatedly within a single match
2412 operation, it is the last portion of the subject that it matched that is
2413 returned.
2414 .P
2415 If the ovector is too small to hold all the captured substring offsets, as much
2416 as possible is filled in, and the function returns a value of zero. If captured
2417 substrings are not of interest, \fBpcre2_match()\fP may be called with a match
2418 data block whose ovector is of minimum length (that is, one pair). However, if
2419 the pattern contains back references and the \fIovector\fP is not big enough to
2420 remember the related substrings, PCRE2 has to get additional memory for use
2421 during matching. Thus it is usually advisable to set up a match data block
2422 containing an ovector of reasonable size.
2423 .P
2424 It is possible for capturing subpattern number \fIn+1\fP to match some part of
2425 the subject when subpattern \fIn\fP has not been used at all. For example, if
2426 the string "abc" is matched against the pattern (a|(z))(bc) the return from the
2427 function is 4, and subpatterns 1 and 3 are matched, but 2 is not. When this
2428 happens, both values in the offset pairs corresponding to unused subpatterns
2429 are set to PCRE2_UNSET.
2430 .P
2431 Offset values that correspond to unused subpatterns at the end of the
2432 expression are also set to PCRE2_UNSET. For example, if the string "abc" is
2433 matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not matched.
2434 The return from the function is 2, because the highest used capturing
2435 subpattern number is 1. The offsets for the second and third capturing
2436 subpatterns (assuming the vector is large enough, of course) are set to
2437 PCRE2_UNSET.
2438 .P
2439 Elements in the ovector that do not correspond to capturing parentheses in the
2440 pattern are never changed. That is, if a pattern contains \fIn\fP capturing
2441 parentheses, no more than \fIovector[0]\fP to \fIovector[2n+1]\fP are set by
2442 \fBpcre2_match()\fP. The other elements retain whatever values they previously
2443 had.
2444 .
2445 .
2446 .\" HTML <a name="matchotherdata"></a>
2448 .rs
2449 .sp
2450 .nf
2451 .B PCRE2_SPTR pcre2_get_mark(pcre2_match_data *\fImatch_data\fP);
2452 .sp
2453 .B PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *\fImatch_data\fP);
2454 .fi
2455 .P
2456 As well as the offsets in the ovector, other information about a match is
2457 retained in the match data block and can be retrieved by the above functions in
2458 appropriate circumstances. If they are called at other times, the result is
2459 undefined.
2460 .P
2461 After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure
2462 to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be available, and
2463 \fBpcre2_get_mark()\fP can be called. It returns a pointer to the
2464 zero-terminated name, which is within the compiled pattern. Otherwise NULL is
2465 returned. The length of the (*MARK) name (excluding the terminating zero) is
2466 stored in the code unit that precedes the name. You should use this instead of
2467 relying on the terminating zero if the (*MARK) name might contain a binary
2468 zero.
2469 .P
2470 After a successful match, the (*MARK) name that is returned is the
2471 last one encountered on the matching path through the pattern. After a "no
2472 match" or a partial match, the last encountered (*MARK) name is returned. For
2473 example, consider this pattern:
2474 .sp
2475 ^(*MARK:A)((*MARK:B)a|b)c
2476 .sp
2477 When it matches "bc", the returned mark is A. The B mark is "seen" in the first
2478 branch of the group, but it is not on the matching path. On the other hand,
2479 when this pattern fails to match "bx", the returned mark is B.
2480 .P
2481 After a successful match, a partial match, or one of the invalid UTF errors
2482 (for example, PCRE2_ERROR_UTF8_ERR5), \fBpcre2_get_startchar()\fP can be
2483 called. After a successful or partial match it returns the code unit offset of
2484 the character at which the match started. For a non-partial match, this can be
2485 different to the value of \fIovector[0]\fP if the pattern contains the \eK
2486 escape sequence. After a partial match, however, this value is always the same
2487 as \fIovector[0]\fP because \eK does not affect the result of a partial match.
2488 .P
2489 After a UTF check failure, \fBpcre2_get_startchar()\fP can be used to obtain
2490 the code unit offset of the invalid UTF character. Details are given in the
2491 .\" HREF
2492 \fBpcre2unicode\fP
2493 .\"
2494 page.
2495 .
2496 .
2497 .\" HTML <a name="errorlist"></a>
2498 .SH "ERROR RETURNS FROM \fBpcre2_match()\fP"
2499 .rs
2500 .sp
2501 If \fBpcre2_match()\fP fails, it returns a negative number. This can be
2502 converted to a text string by calling the \fBpcre2_get_error_message()\fP
2503 function (see "Obtaining a textual error message"
2504 .\" HTML <a href="#geterrormessage">
2505 .\" </a>
2506 below).
2507 .\"
2508 Negative error codes are also returned by other functions, and are documented
2509 with them. The codes are given names in the header file. If UTF checking is in
2510 force and an invalid UTF subject string is detected, one of a number of
2511 UTF-specific negative error codes is returned. Details are given in the
2512 .\" HREF
2513 \fBpcre2unicode\fP
2514 .\"
2515 page. The following are the other errors that may be returned by
2516 \fBpcre2_match()\fP:
2517 .sp
2519 .sp
2520 The subject string did not match the pattern.
2521 .sp
2523 .sp
2524 The subject string did not match, but it did match partially. See the
2525 .\" HREF
2526 \fBpcre2partial\fP
2527 .\"
2528 documentation for details of partial matching.
2529 .sp
2531 .sp
2532 PCRE2 stores a 4-byte "magic number" at the start of the compiled code, to
2533 catch the case when it is passed a junk pointer. This is the error that is
2534 returned when the magic number is not present.
2535 .sp
2537 .sp
2538 This error is given when a pattern that was compiled by the 8-bit library is
2539 passed to a 16-bit or 32-bit library function, or vice versa.
2540 .sp
2542 .sp
2543 The value of \fIstartoffset\fP was greater than the length of the subject.
2544 .sp
2546 .sp
2547 An unrecognized bit was set in the \fIoptions\fP argument.
2548 .sp
2550 .sp
2551 The UTF code unit sequence that was passed as a subject was checked and found
2552 to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the value of
2553 \fIstartoffset\fP did not point to the beginning of a UTF character or the end
2554 of the subject.
2555 .sp
2557 .sp
2558 This error is never generated by \fBpcre2_match()\fP itself. It is provided for
2559 use by callout functions that want to cause \fBpcre2_match()\fP or
2560 \fBpcre2_callout_enumerate()\fP to return a distinctive error code. See the
2561 .\" HREF
2562 \fBpcre2callout\fP
2563 .\"
2564 documentation for details.
2565 .sp
2567 .sp
2568 An unexpected internal error has occurred. This error could be caused by a bug
2569 in PCRE2 or by overwriting of the compiled pattern.
2570 .sp
2572 .sp
2573 This error is returned when a pattern that was successfully studied using JIT
2574 is being matched, but the matching mode (partial or complete match) does not
2575 correspond to any JIT compilation mode. When the JIT fast path function is
2576 used, this error may be also given for invalid options. See the
2577 .\" HREF
2578 \fBpcre2jit\fP
2579 .\"
2580 documentation for more details.
2581 .sp
2583 .sp
2584 This error is returned when a pattern that was successfully studied using JIT
2585 is being matched, but the memory available for the just-in-time processing
2586 stack is not large enough. See the
2587 .\" HREF
2588 \fBpcre2jit\fP
2589 .\"
2590 documentation for more details.
2591 .sp
2593 .sp
2594 The backtracking limit was reached.
2595 .sp
2597 .sp
2598 If a pattern contains back references, but the ovector is not big enough to
2599 remember the referenced substrings, PCRE2 gets a block of memory at the start
2600 of matching to use for this purpose. There are some other special cases where
2601 extra memory is needed during matching. This error is given when memory cannot
2602 be obtained.
2603 .sp
2605 .sp
2606 Either the \fIcode\fP, \fIsubject\fP, or \fImatch_data\fP argument was passed
2607 as NULL.
2608 .sp
2610 .sp
2611 This error is returned when \fBpcre2_match()\fP detects a recursion loop within
2612 the pattern. Specifically, it means that either the whole pattern or a
2613 subpattern has been called recursively for the second time at the same position
2614 in the subject string. Some simple patterns that might do this are detected and
2615 faulted at compile time, but more complicated cases, in particular mutual
2616 recursions between two different subpatterns, cannot be detected until matching
2617 is attempted.
2618 .sp
2620 .sp
2621 The internal recursion limit was reached.
2622 .
2623 .
2624 .\" HTML <a name="geterrormessage"></a>
2626 .rs
2627 .sp
2628 .nf
2629 .B int pcre2_get_error_message(int \fIerrorcode\fP, PCRE2_UCHAR *\fIbuffer\fP,
2630 .B " PCRE2_SIZE \fIbufflen\fP);"
2631 .fi
2632 .P
2633 A text message for an error code from any PCRE2 function (compile, match, or
2634 auxiliary) can be obtained by calling \fBpcre2_get_error_message()\fP. The code
2635 is passed as the first argument, with the remaining two arguments specifying a
2636 code unit buffer and its length in code units, into which the text message is
2637 placed. The message is returned in code units of the appropriate width for the
2638 library that is being used.
2639 .P
2640 The returned message is terminated with a trailing zero, and the function
2641 returns the number of code units used, excluding the trailing zero. If the
2642 error number is unknown, the negative error code PCRE2_ERROR_BADDATA is
2643 returned. If the buffer is too small, the message is truncated (but still with
2644 a trailing zero), and the negative error code PCRE2_ERROR_NOMEMORY is returned.
2645 None of the messages are very long; a buffer size of 120 code units is ample.
2646 .
2647 .
2648 .\" HTML <a name="extractbynumber"></a>
2650 .rs
2651 .sp
2652 .nf
2653 .B int pcre2_substring_length_bynumber(pcre2_match_data *\fImatch_data\fP,
2654 .B " uint32_t \fInumber\fP, PCRE2_SIZE *\fIlength\fP);"
2655 .sp
2656 .B int pcre2_substring_copy_bynumber(pcre2_match_data *\fImatch_data\fP,
2657 .B " uint32_t \fInumber\fP, PCRE2_UCHAR *\fIbuffer\fP,"
2658 .B " PCRE2_SIZE *\fIbufflen\fP);"
2659 .sp
2660 .B int pcre2_substring_get_bynumber(pcre2_match_data *\fImatch_data\fP,
2661 .B " uint32_t \fInumber\fP, PCRE2_UCHAR **\fIbufferptr\fP,"
2662 .B " PCRE2_SIZE *\fIbufflen\fP);"
2663 .sp
2664 .B void pcre2_substring_free(PCRE2_UCHAR *\fIbuffer\fP);
2665 .fi
2666 .P
2667 Captured substrings can be accessed directly by using the ovector as described
2668 .\" HTML <a href="#matchedstrings">
2669 .\" </a>
2670 above.
2671 .\"
2672 For convenience, auxiliary functions are provided for extracting captured
2673 substrings as new, separate, zero-terminated strings. A substring that contains
2674 a binary zero is correctly extracted and has a further zero added on the end,
2675 but the result is not, of course, a C string.
2676 .P
2677 The functions in this section identify substrings by number. The number zero
2678 refers to the entire matched substring, with higher numbers referring to
2679 substrings captured by parenthesized groups. After a partial match, only
2680 substring zero is available. An attempt to extract any other substring gives
2681 the error PCRE2_ERROR_PARTIAL. The next section describes similar functions for
2682 extracting captured substrings by name.
2683 .P
2684 If a pattern uses the \eK escape sequence within a positive assertion, the
2685 reported start of a successful match can be greater than the end of the match.
2686 For example, if the pattern (?=ab\eK) is matched against "ab", the start and
2687 end offset values for the match are 2 and 0. In this situation, calling these
2688 functions with a zero substring number extracts a zero-length empty string.
2689 .P
2690 You can find the length in code units of a captured substring without
2691 extracting it by calling \fBpcre2_substring_length_bynumber()\fP. The first
2692 argument is a pointer to the match data block, the second is the group number,
2693 and the third is a pointer to a variable into which the length is placed. If
2694 you just want to know whether or not the substring has been captured, you can
2695 pass the third argument as NULL.
2696 .P
2697 The \fBpcre2_substring_copy_bynumber()\fP function copies a captured substring
2698 into a supplied buffer, whereas \fBpcre2_substring_get_bynumber()\fP copies it
2699 into new memory, obtained using the same memory allocation function that was
2700 used for the match data block. The first two arguments of these functions are a
2701 pointer to the match data block and a capturing group number.
2702 .P
2703 The final arguments of \fBpcre2_substring_copy_bynumber()\fP are a pointer to
2704 the buffer and a pointer to a variable that contains its length in code units.
2705 This is updated to contain the actual number of code units used for the
2706 extracted substring, excluding the terminating zero.
2707 .P
2708 For \fBpcre2_substring_get_bynumber()\fP the third and fourth arguments point
2709 to variables that are updated with a pointer to the new memory and the number
2710 of code units that comprise the substring, again excluding the terminating
2711 zero. When the substring is no longer needed, the memory should be freed by
2712 calling \fBpcre2_substring_free()\fP.
2713 .P
2714 The return value from all these functions is zero for success, or a negative
2715 error code. If the pattern match failed, the match failure code is returned.
2716 If a substring number greater than zero is used after a partial match,
2717 PCRE2_ERROR_PARTIAL is returned. Other possible error codes are:
2718 .sp
2720 .sp
2721 The buffer was too small for \fBpcre2_substring_copy_bynumber()\fP, or the
2722 attempt to get memory failed for \fBpcre2_substring_get_bynumber()\fP.
2723 .sp
2725 .sp
2726 There is no substring with that number in the pattern, that is, the number is
2727 greater than the number of capturing parentheses.
2728 .sp
2730 .sp
2731 The substring number, though not greater than the number of captures in the
2732 pattern, is greater than the number of slots in the ovector, so the substring
2733 could not be captured.
2734 .sp
2736 .sp
2737 The substring did not participate in the match. For example, if the pattern is
2738 (abc)|(def) and the subject is "def", and the ovector contains at least two
2739 capturing slots, substring number 1 is unset.
2740 .
2741 .
2743 .rs
2744 .sp
2745 .nf
2746 .B int pcre2_substring_list_get(pcre2_match_data *\fImatch_data\fP,
2747 .B " PCRE2_UCHAR ***\fIlistptr\fP, PCRE2_SIZE **\fIlengthsptr\fP);"
2748 .sp
2749 .B void pcre2_substring_list_free(PCRE2_SPTR *\fIlist\fP);
2750 .fi
2751 .P
2752 The \fBpcre2_substring_list_get()\fP function extracts all available substrings
2753 and builds a list of pointers to them. It also (optionally) builds a second
2754 list that contains their lengths (in code units), excluding a terminating zero
2755 that is added to each of them. All this is done in a single block of memory
2756 that is obtained using the same memory allocation function that was used to get
2757 the match data block.
2758 .P
2759 This function must be called only after a successful match. If called after a
2760 partial match, the error code PCRE2_ERROR_PARTIAL is returned.
2761 .P
2762 The address of the memory block is returned via \fIlistptr\fP, which is also
2763 the start of the list of string pointers. The end of the list is marked by a
2764 NULL pointer. The address of the list of lengths is returned via
2765 \fIlengthsptr\fP. If your strings do not contain binary zeros and you do not
2766 therefore need the lengths, you may supply NULL as the \fIlengthsptr\fP
2767 argument to disable the creation of a list of lengths. The yield of the
2768 function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the memory block
2769 could not be obtained. When the list is no longer needed, it should be freed by
2770 calling \fBpcre2_substring_list_free()\fP.
2771 .P
2772 If this function encounters a substring that is unset, which can happen when
2773 capturing subpattern number \fIn+1\fP matches some part of the subject, but
2774 subpattern \fIn\fP has not been used at all, it returns an empty string. This
2775 can be distinguished from a genuine zero-length substring by inspecting the
2776 appropriate offset in the ovector, which contains PCRE2_UNSET for unset
2777 substrings, or by calling \fBpcre2_substring_length_bynumber()\fP.
2778 .
2779 .
2780 .\" HTML <a name="extractbyname"></a>
2782 .rs
2783 .sp
2784 .nf
2785 .B int pcre2_substring_number_from_name(const pcre2_code *\fIcode\fP,
2786 .B " PCRE2_SPTR \fIname\fP);"
2787 .sp
2788 .B int pcre2_substring_length_byname(pcre2_match_data *\fImatch_data\fP,
2789 .B " PCRE2_SPTR \fIname\fP, PCRE2_SIZE *\fIlength\fP);"
2790 .sp
2791 .B int pcre2_substring_copy_byname(pcre2_match_data *\fImatch_data\fP,
2792 .B " PCRE2_SPTR \fIname\fP, PCRE2_UCHAR *\fIbuffer\fP, PCRE2_SIZE *\fIbufflen\fP);"
2793 .sp
2794 .B int pcre2_substring_get_byname(pcre2_match_data *\fImatch_data\fP,
2795 .B " PCRE2_SPTR \fIname\fP, PCRE2_UCHAR **\fIbufferptr\fP, PCRE2_SIZE *\fIbufflen\fP);"
2796 .sp
2797 .B void pcre2_substring_free(PCRE2_UCHAR *\fIbuffer\fP);
2798 .fi
2799 .P
2800 To extract a substring by name, you first have to find the associated number.
2801 For example, for this pattern:
2802 .sp
2803 (a+)b(?<xxx>\ed+)...
2804 .sp
2805 the number of the subpattern called "xxx" is 2. If the name is known to be
2806 unique (PCRE2_DUPNAMES was not set), you can find the number from the name by
2807 calling \fBpcre2_substring_number_from_name()\fP. The first argument is the
2808 compiled pattern, and the second is the name. The yield of the function is the
2809 subpattern number, PCRE2_ERROR_NOSUBSTRING if there is no subpattern of that
2810 name, or PCRE2_ERROR_NOUNIQUESUBSTRING if there is more than one subpattern of
2811 that name. Given the number, you can extract the substring directly, or use one
2812 of the functions described above.
2813 .P
2814 For convenience, there are also "byname" functions that correspond to the
2815 "bynumber" functions, the only difference being that the second argument is a
2816 name instead of a number. If PCRE2_DUPNAMES is set and there are duplicate
2817 names, these functions scan all the groups with the given name, and return the
2818 first named string that is set.
2819 .P
2820 If there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is
2821 returned. If all groups with the name have numbers that are greater than the
2822 number of slots in the ovector, PCRE2_ERROR_UNAVAILABLE is returned. If there
2823 is at least one group with a slot in the ovector, but no group is found to be
2824 set, PCRE2_ERROR_UNSET is returned.
2825 .P
2826 \fBWarning:\fP If the pattern uses the (?| feature to set up multiple
2827 subpatterns with the same number, as described in the
2828 .\" HTML <a href="pcre2pattern.html#dupsubpatternnumber">
2829 .\" </a>
2830 section on duplicate subpattern numbers
2831 .\"
2832 in the
2833 .\" HREF
2834 \fBpcre2pattern\fP
2835 .\"
2836 page, you cannot use names to distinguish the different subpatterns, because
2837 names are not included in the compiled code. The matching process uses only
2838 numbers. For this reason, the use of different names for subpatterns of the
2839 same number causes an error at compile time.
2840 .
2841 .
2843 .rs
2844 .sp
2845 .nf
2846 .B int pcre2_substitute(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP,
2847 .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
2848 .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
2849 .B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacement\fP,"
2850 .B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\fP,"
2851 .B " PCRE2_SIZE *\fIoutlengthptr\fP);"
2852 .fi
2853 .P
2854 This function calls \fBpcre2_match()\fP and then makes a copy of the subject
2855 string in \fIoutputbuffer\fP, replacing the part that was matched with the
2856 \fIreplacement\fP string, whose length is supplied in \fIrlength\fP. This can
2857 be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
2858 which a \eK item in a lookahead in the pattern causes the match to end before
2859 it starts are not supported, and give rise to an error return.
2860 .P
2861 The first seven arguments of \fBpcre2_substitute()\fP are the same as for
2862 \fBpcre2_match()\fP, except that the partial matching options are not
2863 permitted, and \fImatch_data\fP may be passed as NULL, in which case a match
2864 data block is obtained and freed within this function, using memory management
2865 functions from the match context, if provided, or else those that were used to
2866 allocate memory for the compiled code.
2867 .P
2868 The \fIoutlengthptr\fP argument must point to a variable that contains the
2869 length, in code units, of the output buffer. If the function is successful, the
2870 value is updated to contain the length of the new string, excluding the
2871 trailing zero that is automatically added.
2872 .P
2873 If the function is not successful, the value set via \fIoutlengthptr\fP depends
2874 on the type of error. For syntax errors in the replacement string, the value is
2875 the offset in the replacement string where the error was detected. For other
2876 errors, the value is PCRE2_UNSET by default. This includes the case of the
2877 output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set
2878 (see below), in which case the value is the minimum length needed, including
2879 space for the trailing zero. Note that in order to compute the required length,
2880 \fBpcre2_substitute()\fP has to simulate all the matching and copying, instead
2881 of giving an error return as soon as the buffer overflows. Note also that the
2882 length is in code units, not bytes.
2883 .P
2884 In the replacement string, which is interpreted as a UTF string in UTF mode,
2885 and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
2886 dollar character is an escape character that can specify the insertion of
2887 characters from capturing groups or (*MARK) items in the pattern. The following
2888 forms are always recognized:
2889 .sp
2890 $$ insert a dollar character
2891 $<n> or ${<n>} insert the contents of group <n>
2892 $*MARK or ${*MARK} insert the name of the last (*MARK) encountered
2893 .sp
2894 Either a group number or a group name can be given for <n>. Curly brackets are
2895 required only if the following character would be interpreted as part of the
2896 number or name. The number may be zero to include the entire matched string.
2897 For example, if the pattern a(b)c is matched with "=abc=" and the replacement
2898 string "+$1$0$1+", the result is "=+babcb+=".
2899 .P
2900 The facility for inserting a (*MARK) name can be used to perform simple
2901 simultaneous substitutions, as this \fBpcre2test\fP example shows:
2902 .sp
2903 /(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
2904 apple lemon
2905 2: pear orange
2906 .sp
2907 As well as the usual options for \fBpcre2_match()\fP, a number of additional
2908 options can be set in the \fIoptions\fP argument.
2909 .P
2910 PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string,
2911 replacing every matching substring. If this is not set, only the first matching
2912 substring is replaced. If any matched substring has zero length, after the
2913 substitution has happened, an attempt to find a non-empty match at the same
2914 position is performed. If this is not successful, the current position is
2915 advanced by one character except when CRLF is a valid newline sequence and the
2916 next two characters are CR, LF. In this case, the current position is advanced
2917 by two characters.
2918 .P
2919 PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
2920 too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
2921 this option is set, however, \fBpcre2_substitute()\fP continues to go through
2922 the motions of matching and substituting (without, of course, writing anything)
2923 in order to compute the size of buffer that is needed. This value is passed
2924 back via the \fIoutlengthptr\fP variable, with the result of the function still
2925 being PCRE2_ERROR_NOMEMORY.
2926 .P
2927 Passing a buffer size of zero is a permitted way of finding out how much memory
2928 is needed for a given substitution. However, this does mean that the entire
2929 operation is carried out twice. Depending on the application, it may be more
2930 efficient to allocate a large buffer and free the excess afterwards, instead of
2931 using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.
2932 .P
2933 PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups that do
2934 not appear in the pattern to be treated as unset groups. This option should be
2935 used with care, because it means that a typo in a group name or number no
2936 longer causes the PCRE2_ERROR_NOSUBSTRING error.
2937 .P
2938 PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including unknown
2939 groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty
2940 strings when inserted as described above. If this option is not set, an attempt
2941 to insert an unset group causes the PCRE2_ERROR_UNSET error. This option does
2942 not influence the extended substitution syntax described below.
2943 .P
2944 PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the
2945 replacement string. Without this option, only the dollar character is special,
2946 and only the group insertion forms listed above are valid. When
2947 PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
2948 .P
2949 Firstly, backslash in a replacement string is interpreted as an escape
2950 character. The usual forms such as \en or \ex{ddd} can be used to specify
2951 particular character codes, and backslash followed by any non-alphanumeric
2952 character quotes that character. Extended quoting can be coded using \eQ...\eE,
2953 exactly as in pattern strings.
2954 .P
2955 There are also four escape sequences for forcing the case of inserted letters.
2956 The insertion mechanism has three states: no case forcing, force upper case,
2957 and force lower case. The escape sequences change the current state: \eU and
2958 \eL change to upper or lower case forcing, respectively, and \eE (when not
2959 terminating a \eQ quoted sequence) reverts to no case forcing. The sequences
2960 \eu and \el force the next character (if it is a letter) to upper or lower
2961 case, respectively, and then the state automatically reverts to no case
2962 forcing. Case forcing applies to all inserted characters, including those from
2963 captured groups and letters within \eQ...\eE quoted sequences.
2964 .P
2965 Note that case forcing sequences such as \eU...\eE do not nest. For example,
2966 the result of processing "\eUaa\eLBB\eEcc\eE" is "AAbbcc"; the final \eE has no
2967 effect.
2968 .P
2969 The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
2970 flexibility to group substitution. The syntax is similar to that used by Bash:
2971 .sp
2972 ${<n>:-<string>}
2973 ${<n>:+<string1>:<string2>}
2974 .sp
2975 As before, <n> may be a group number or a name. The first form specifies a
2976 default value. If group <n> is set, its value is inserted; if not, <string> is
2977 expanded and the result inserted. The second form specifies strings that are
2978 expanded and inserted when group <n> is set or unset, respectively. The first
2979 form is just a convenient shorthand for
2980 .sp
2981 ${<n>:+${<n>}:<string>}
2982 .sp
2983 Backslash can be used to escape colons and closing curly brackets in the
2984 replacement strings. A change of the case forcing state within a replacement
2985 string remains in force afterwards, as shown in this \fBpcre2test\fP example:
2986 .sp
2987 /(some)?(body)/substitute_extended,replace=${1:+\eU:\eL}HeLLo
2988 body
2989 1: hello
2990 somebody
2991 1: HELLO
2992 .sp
2993 The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
2994 substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
2995 groups in the extended syntax forms to be treated as unset.
2996 .P
2997 If successful, \fBpcre2_substitute()\fP returns the number of replacements that
2998 were made. This may be zero if no matches were found, and is never greater than
2999 1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
3000 .P
3001 In the event of an error, a negative error code is returned. Except for
3002 PCRE2_ERROR_NOMATCH (which is never returned), errors from \fBpcre2_match()\fP
3003 are passed straight back.
3004 .P
3005 PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion,
3006 unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
3007 .P
3008 PCRE2_ERROR_UNSET is returned for an unset substring insertion (including an
3009 unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple
3010 (non-extended) syntax is used and PCRE2_SUBSTITUTE_UNSET_EMPTY is not set.
3011 .P
3012 PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the
3013 PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is
3014 needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
3015 default.
3016 .P
3017 PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
3018 replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
3019 (invalid escape sequence), PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket
3020 not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group
3021 substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it
3022 started, which can happen if \eK is used in an assertion).
3023 .P
3024 As for all PCRE2 errors, a text message that describes the error can be
3025 obtained by calling the \fBpcre2_get_error_message()\fP function (see
3026 "Obtaining a textual error message"
3027 .\" HTML <a href="#geterrormessage">
3028 .\" </a>
3029 above).
3030 .\"
3031 .
3032 .
3034 .rs
3035 .sp
3036 .nf
3037 .B int pcre2_substring_nametable_scan(const pcre2_code *\fIcode\fP,
3038 .B " PCRE2_SPTR \fIname\fP, PCRE2_SPTR *\fIfirst\fP, PCRE2_SPTR *\fIlast\fP);"
3039 .fi
3040 .P
3041 When a pattern is compiled with the PCRE2_DUPNAMES option, names for
3042 subpatterns are not required to be unique. Duplicate names are always allowed
3043 for subpatterns with the same number, created by using the (?| feature. Indeed,
3044 if such subpatterns are named, they are required to use the same names.
3045 .P
3046 Normally, patterns with duplicate names are such that in any one match, only
3047 one of the named subpatterns participates. An example is shown in the
3048 .\" HREF
3049 \fBpcre2pattern\fP
3050 .\"
3051 documentation.
3052 .P
3053 When duplicates are present, \fBpcre2_substring_copy_byname()\fP and
3054 \fBpcre2_substring_get_byname()\fP return the first substring corresponding to
3055 the given name that is set. Only if none are set is PCRE2_ERROR_UNSET
3056 returned. The \fBpcre2_substring_number_from_name()\fP function returns the
3057 error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate names.
3058 .P
3059 If you want to get full details of all captured substrings for a given name,
3060 you must use the \fBpcre2_substring_nametable_scan()\fP function. The first
3061 argument is the compiled pattern, and the second is the name. If the third and
3062 fourth arguments are NULL, the function returns a group number for a unique
3063 name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
3064 .P
3065 When the third and fourth arguments are not NULL, they must be pointers to
3066 variables that are updated by the function. After it has run, they point to the
3067 first and last entries in the name-to-number table for the given name, and the
3068 function returns the length of each entry in code units. In both cases,
3069 PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name.
3070 .P
3071 The format of the name table is described
3072 .\" HTML <a href="#infoaboutpattern">
3073 .\" </a>
3074 above
3075 .\"
3076 in the section entitled \fIInformation about a pattern\fP. Given all the
3077 relevant entries for the name, you can extract each of their numbers, and hence
3078 the captured data.
3079 .
3080 .
3082 .rs
3083 .sp
3084 The traditional matching function uses a similar algorithm to Perl, which stops
3085 when it finds the first match at a given point in the subject. If you want to
3086 find all possible matches, or the longest possible match at a given position,
3087 consider using the alternative matching function (see below) instead. If you
3088 cannot use the alternative function, you can kludge it up by making use of the
3089 callout facility, which is described in the
3090 .\" HREF
3091 \fBpcre2callout\fP
3092 .\"
3093 documentation.
3094 .P
3095 What you have to do is to insert a callout right at the end of the pattern.
3096 When your callout function is called, extract and save the current matched
3097 substring. Then return 1, which forces \fBpcre2_match()\fP to backtrack and try
3098 other alternatives. Ultimately, when it runs out of matches,
3099 \fBpcre2_match()\fP will yield PCRE2_ERROR_NOMATCH.
3100 .
3101 .
3102 .\" HTML <a name="dfamatch"></a>
3104 .rs
3105 .sp
3106 .nf
3107 .B int pcre2_dfa_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP,
3108 .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
3109 .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
3110 .B " pcre2_match_context *\fImcontext\fP,"
3111 .B " int *\fIworkspace\fP, PCRE2_SIZE \fIwscount\fP);"
3112 .fi
3113 .P
3114 The function \fBpcre2_dfa_match()\fP is called to match a subject string
3115 against a compiled pattern, using a matching algorithm that scans the subject
3116 string just once, and does not backtrack. This has different characteristics to
3117 the normal algorithm, and is not compatible with Perl. Some of the features of
3118 PCRE2 patterns are not supported. Nevertheless, there are times when this kind
3119 of matching can be useful. For a discussion of the two matching algorithms, and
3120 a list of features that \fBpcre2_dfa_match()\fP does not support, see the
3121 .\" HREF
3122 \fBpcre2matching\fP
3123 .\"
3124 documentation.
3125 .P
3126 The arguments for the \fBpcre2_dfa_match()\fP function are the same as for
3127 \fBpcre2_match()\fP, plus two extras. The ovector within the match data block
3128 is used in a different way, and this is described below. The other common
3129 arguments are used in the same way as for \fBpcre2_match()\fP, so their
3130 description is not repeated here.
3131 .P
3132 The two additional arguments provide workspace for the function. The workspace
3133 vector should contain at least 20 elements. It is used for keeping track of
3134 multiple paths through the pattern tree. More workspace is needed for patterns
3135 and subjects where there are a lot of potential matches.
3136 .P
3137 Here is an example of a simple call to \fBpcre2_dfa_match()\fP:
3138 .sp
3139 int wspace[20];
3140 pcre2_match_data *match_data = pcre2_match_data_create(4, NULL);
3141 int rc = pcre2_dfa_match(
3142 re, /* result of pcre2_compile() */
3143 "some string", /* the subject string */
3144 11, /* the length of the subject string */
3145 0, /* start at offset 0 in the subject */
3146 0, /* default options */
3147 match_data, /* the match data block */
3148 NULL, /* a match context; NULL means use defaults */
3149 wspace, /* working space vector */
3150 20); /* number of elements (NOT size in bytes) */
3151 .
3152 .SS "Option bits for \fBpcre2_dfa_match()\fP"
3153 .rs
3154 .sp
3155 The unused bits of the \fIoptions\fP argument for \fBpcre2_dfa_match()\fP must
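3156 be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
3157 PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK,
3158 PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and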
3159 PCRE2_DFA_RESTART. All but the last four of these are exactly the same as for
3160 \fBpcre2_match()\fP, so their description is not repeated here.
3161 .sp
3162 PCRE2_PARTIAL_HARD
3163 PCRE2_PARTIAL_SOFT
3164 .sp
3165 These have the same general effect as they do for \fBpcre2_match()\fP, but the
3166 details are slightly different. When PCRE2_PARTIAL_HARD is set for
3167 \fBpcre2_dfa_match()\fP, it returns PCRE2_ERROR_PARTIAL if the end of the
3168 subject is reached and there is still at least one matching possibility that
3169 requires additional characters. This happens even if some complete matches have
3170 already been found. When PCRE2_PARTIAL_SOFT is set, the return code
3171 PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL if the end of the
3172 subject is reached, there have been no complete matches, but there is still at
3173 least one matching possibility. The portion of the string that was inspected
3174 when the longest partial match was found is set as the first matching string in
3175 both cases. There is a more detailed discussion of partial and multi-segment
3176 matching, with examples, in the
3177 .\" HREF
3178 \fBpcre2partial\fP
3179 .\"
3180 documentation.
3181 .sp
3182 PCRE2_DFA_SHORTEST
3183 .sp
3184 Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to stop as
3185 soon as it has found one match. Because of the way the alternative algorithm
3186 works, this is necessarily the shortest possible match at the first possible
3187 matching point in the subject string.
3188 .sp
3189 PCRE2_DFA_RESTART
3190 .sp
3191 When \fBpcre2_dfa_match()\fP returns a partial match, it is possible to call it
3192 again, with additional subject characters, and have it continue with the same
3193 match. The PCRE2_DFA_RESTART option requests this action; when it is set, the
3194 \fIworkspace\fP and \fIwscount\fP arguments must reference the same vector as
3195 before because data about the match so far is left in them after a partial
3196 match. There is more discussion of this facility in the
3197 .\" HREF
3198 \fBpcre2partial\fP
3199 .\"
3200 documentation.
3201 .
3202 .
3203 .SS "Successful returns from \fBpcre2_dfa_match()\fP"
3204 .rs
3205 .sp
3206 When \fBpcre2_dfa_match()\fP succeeds, it may have matched more than one
3207 substring in the subject. Note, however, that all the matches from one run of
3208 the function start at the same point in the subject. The shorter matches are
3209 all initial substrings of the longer matches. For example, if the pattern
3210 .sp
3211 <.*>
3212 .sp
3213 is matched against the string
3214 .sp
3215 This is <something> <something else> <something further> no more
3216 .sp
3217 the three matched strings are
3218 .sp
3219 <something> <something else> <something further>
3220 <something> <something else>
3221 <something>
3222 .sp
3223 On success, the yield of the function is a number greater than zero, which is
3224 the number of matched substrings. The offsets of the substrings are returned in
3225 the ovector, and can be extracted by number in the same way as for
3226 \fBpcre2_match()\fP, but the numbers bear no relation to any capturing groups
3227 that may exist in the pattern, because DFA matching does not support group
3228 capture.
3229 .P
3230 Calls to the convenience functions that extract substrings by name
3231 return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used after a
3232 DFA match. The convenience functions that extract substrings by number never
3233 return PCRE2_ERROR_NOSUBSTRING, and the meanings of some other errors are
3234 slightly different:
3235 .sp
3236 PCRE2_ERROR_UNAVAILABLE
3237 .sp
3238 The ovector is not big enough to include a slot for the given substring number.
3239 .sp
3240 PCRE2_ERROR_UNSET
3241 .sp
3242 There is a slot in the ovector for this substring, but there were insufficient
3243 matches to fill it.
3244 .P
3245 The matched strings are stored in the ovector in reverse order of length; that
3246 is, the longest matching string is first. If there were too many matches to fit
3247 into the ovector, the yield of the function is zero, and the vector is filled
3248 with the longest matches.
3249 .P
3250 NOTE: PCRE2's "auto-possessification" optimization usually applies to character
3251 repeats at the end of a pattern (as well as internally). For example, the
3252 pattern "a\ed+" is compiled as if it were "a\ed++". For DFA matching, this
3253 means that only one possible match is found. If you really do want multiple
3254 matches in such cases, either use an ungreedy repeat such as "a\ed+?" or set
3255 the PCRE2_NO_AUTO_POSSESS option when compiling.
3256 .
3257 .
3258 .SS "Error returns from \fBpcre2_dfa_match()\fP"
3259 .rs
3260 .sp
3261 The \fBpcre2_dfa_match()\fP function returns a negative number when it fails.
3262 Many of the errors are the same as for \fBpcre2_match()\fP, as described
3263 .\" HTML <a href="#errorlist">
3264 .\" </a>
3265 above.
3266 .\"
3267 There are in addition the following errors that are specific to
3268 \fBpcre2_dfa_match()\fP:
3269 .sp
3270 PCRE2_ERROR_DFA_UITEM
3271 .sp
3272 This return is given if \fBpcre2_dfa_match()\fP encounters an item in the
3273 pattern that it does not support, for instance, the use of \eC in a UTF mode or
3274 a back reference.
3275 .sp
3276 PCRE2_ERROR_DFA_UCOND
3277 .sp
3278 This return is given if \fBpcre2_dfa_match()\fP encounters a condition item
3279 that uses a back reference for the condition, or a test for recursion in a
3280 specific group. These are not supported.
3281 .sp
3282 PCRE2_ERROR_DFA_WSSIZE
3283 .sp
3284 This return is given if \fBpcre2_dfa_match()\fP runs out of space in the
3285 \fIworkspace\fP vector.
3286 .sp
3287 PCRE2_ERROR_DFA_RECURSE
3288 .sp
3289 When a recursive subpattern is processed, the matching function calls itself
3290 recursively, using private memory for the ovector and \fIworkspace\fP. This
3291 error is given if the internal ovector is not large enough. This should be
3292 extremely rare, as a vector of size 1000 is used.
3293 .sp
3294 PCRE2_ERROR_DFA_BADRESTART
3295 .sp
3296 When \fBpcre2_dfa_match()\fP is called with the \fBPCRE2_DFA_RESTART\fP option,
3297 some plausibility checks are made on the contents of the workspace, which
3298 should contain data about the previous partial match. If any of these checks
3299 fail, this error is given.
3300 .
3301 .
3302 .SH "SEE ALSO"
3303 .rs
3304 .sp
3305 \fBpcre2build\fP(3), \fBpcre2callout\fP(3), \fBpcre2demo\fP(3),
3306 \fBpcre2matching\fP(3), \fBpcre2partial\fP(3), \fBpcre2posix\fP(3),
3307 \fBpcre2sample\fP(3), \fBpcre2stack\fP(3), \fBpcre2unicode\fP(3).
3308 .
3309 .
3311 .rs
3312 .sp
3313 .nf
3314 Philip Hazel
3315 University Computing Service
3316 Cambridge, England.
3317 .fi
3318 .
3319 .
3321 .rs
3322 .sp
3323 .nf
3324 Last updated: 21 March 2017
3325 Copyright (c) 1997-2017 University of Cambridge.
3326 .fi

  ViewVC Help
Powered by ViewVC 1.1.5