/[pcre]/code/tags/pcre-5.0/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-5.0/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 76 - (show annotations)
Sat Feb 24 21:40:39 2007 UTC (12 years, 7 months ago) by nigel
File MIME type: text/plain
File size: 282536 byte(s)
Tag code/trunk as code/tags/pcre-5.0.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2004 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Redistribution and use in source and binary forms, with or without
16 modification, are permitted provided that the following conditions are met:
17
18 * Redistributions of source code must retain the above copyright notice,
19 this list of conditions and the following disclaimer.
20
21 * Redistributions in binary form must reproduce the above copyright
22 notice, this list of conditions and the following disclaimer in the
23 documentation and/or other materials provided with the distribution.
24
25 * Neither the name of the University of Cambridge nor the names of its
26 contributors may be used to endorse or promote products derived from
27 this software without specific prior written permission.
28
29 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 POSSIBILITY OF SUCH DAMAGE.
40 -----------------------------------------------------------------------------
41 */
42
43
44 /* Define DEBUG to get debugging output on stdout. */
45 /* #define DEBUG */
46
/* Debug tracing goes through DPRINTF so that call sites need no inline #ifdef
(some compilers still dislike indented pre-processor statements). The argument
is a complete, parenthesised printf() argument list, for example
DPRINTF(("value=%d\n", v)). When DEBUG is not defined the macro expands to
nothing, so its arguments are not evaluated. */

#ifndef DEBUG
#define DPRINTF(p)
#else
#define DPRINTF(p) printf p
#endif
56
57 /* Include the internals header, which itself includes "config.h", the Standard
58 C headers, and the external pcre header. */
59
60 #include "internal.h"
61
62 /* If Unicode Property support is wanted, include a private copy of the
63 function that does it, and the table that translates names to numbers. */
64
65 #ifdef SUPPORT_UCP
66 #include "ucp.c"
67 #include "ucptypetable.c"
68 #endif
69
/* Maximum number of items on the nested bracket stacks at compile time. This
applies to the nesting of all kinds of parentheses. It does not limit
un-nested, non-capturing parentheses. The value is used to dimension one int
vector and one unsigned char vector at compile time, so it can be made bigger
if deeper nesting has to be supported. */

#define BRASTACK_SIZE 200
77
78
/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger than this, malloc is used instead. This should
be a multiple of 3, because the offset vector is always a multiple of 3 long
(the current value, 30, satisfies that). */

#define REC_STACK_SAVE_MAX 30
84
85
/* The maximum remaining length of subject (in bytes) that we are prepared to
search for a req_byte match.
NOTE(review): presumably longer remainders skip the req_byte search entirely -
confirm in the matching code. */

#define REQ_BYTE_MAX 1000
90
91
/* Table of sizes for the fixed-length opcodes. The initializer comes from the
OP_LENGTHS macro so that the list of lengths is next to the definition of the
opcodes in internal.h.
NOTE(review): presumably indexed by opcode value - confirm against internal.h. */

static const uschar OP_lengths[] = { OP_LENGTHS };
96
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry i of rep_min and entry i of rep_max describe
the same repeat.
NOTE(review): the six entries look like *, *?, +, +?, ?, ?? in that order -
confirm against the code that indexes these tables. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
are simple data values; negative values are for special things like \d and so
on. Zero means further processing is needed (for things like \x), or the escape
is invalid.

The ASCII table has one entry per character from '0' to 'z' inclusive; the
trailing comment on each row names the characters that the row covers. The
EBCDIC table is laid out by code point instead: the leading comment on each row
gives the hexadecimal code of its first entry, starting at 0x48. */

#if !EBCDIC /* This is the "normal" table for ASCII systems */
static const short int escapes[] = {
  0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
  0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
  '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
  0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
  -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
  -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
  '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
  0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
  -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
  0, 0, -ESC_z /* x - z */
};

#else /* This is the "abnormal" table for EBCDIC systems */
static const short int escapes[] = {
  /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
  /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
  /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
  /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
  /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
  /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
  /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
  /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
  /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
  /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
  /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
  /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
  /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
  /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
  /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
  /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
  /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
  /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
  /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
  /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
  /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
  /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
  /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
};
#endif
148
149
/* Tables of names of POSIX character classes and their lengths. The list of
lengths is terminated by a zero length entry. The first three names must be
alpha, lower, upper (in that order), as this is assumed for handling case
independence. The names appear in the same order as the rows of
posix_class_maps. */

static const char *const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word", "xdigit" };

/* One length per name above, in the same order, followed by the terminating
zero entry. */

static const uschar posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
161
/* Table of class bit maps for each POSIX class; up to three may be combined
to form the class. Each class has a row of three entries, in the same order as
the names in posix_names; rows that need fewer than three bit maps are padded
with -1. The table for [:blank:] is dynamically modified to remove the vertical
space characters.
NOTE(review): this array is const, so that modification presumably happens on
the bit map built from cbit_space rather than on this table - confirm in the
class-compiling code. */

static const int posix_class_maps[] = {
  cbit_lower, cbit_upper, -1, /* alpha */
  cbit_lower, -1, -1, /* lower */
  cbit_upper, -1, -1, /* upper */
  cbit_digit, cbit_lower, cbit_upper, /* alnum */
  cbit_print, cbit_cntrl, -1, /* ascii */
  cbit_space, -1, -1, /* blank - a GNU extension */
  cbit_cntrl, -1, -1, /* cntrl */
  cbit_digit, -1, -1, /* digit */
  cbit_graph, -1, -1, /* graph */
  cbit_print, -1, -1, /* print */
  cbit_punct, -1, -1, /* punct */
  cbit_space, -1, -1, /* space */
  cbit_word, -1, -1, /* word - a Perl extension */
  cbit_xdigit,-1, -1 /* xdigit */
};
182
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code.

The table has one entry per byte value (256 in all). In both variants the
decimal digits 0-9 carry 0x0c (both bits), the letters a-f and A-F carry 0x08,
and every other entry is zero. */

#if !EBCDIC /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */

/* EBCDIC-only table, again with one entry per byte value.
NOTE(review): described by its trailing comment as a partial duplicate of the
character-type table in chartables; the meaning of the individual bits is not
defined in this part of the file - see chartables/internal.h. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
#endif
305
306
/* Forward declaration of compile_regex(), needed to allow mutual recursion:
code that appears earlier in the file than its definition has to be able to
call it. The parameters are unnamed here. */

static BOOL
  compile_regex(int, int, int *, uschar **, const uschar **, const char **,
    BOOL, int, int *, int *, branch_chain *, compile_data *);
312
/* Structure for building a chain of data that actually lives on the
stack, for holding the values of the subject pointer at the start of each
subpattern, so as to detect when an empty string has been matched by a
subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
are on the heap, not on the stack. */

typedef struct eptrblock {
  struct eptrblock *epb_prev;     /* Link to the previous block in the chain */
  const uschar *epb_saved_eptr;   /* Saved subject pointer for this block */
} eptrblock;
323
/* Flag bits for the match() function. These are distinct single-bit values, so
they can be combined with a bitwise OR. */

#define match_condassert 0x01 /* Called to check a condition assertion */
#define match_isgroup 0x02 /* Set if start of bracketed group */
328
/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative, so neither value below
can be confused with an error. */

#define MATCH_MATCH 1
#define MATCH_NOMATCH 0
334
335
336
337 /*************************************************
338 * Global variables *
339 *************************************************/
340
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the four
indirections below, and it can optionally do callouts. These values can be
changed by the caller, but are shared between all threads. However, when
compiling for Virtual Pascal, things are done differently (see pcre.in).

The defaults are the C library's malloc and free for both pairs of allocation
hooks, and NULL for pcre_callout. When compiled as C++, the same definitions
are given C linkage. */

#ifndef VPCOMPAT
#ifdef __cplusplus
extern "C" void *(*pcre_malloc)(size_t) = malloc;
extern "C" void (*pcre_free)(void *) = free;
extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
extern "C" void (*pcre_stack_free)(void *) = free;
extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
#else
void *(*pcre_malloc)(size_t) = malloc;
void (*pcre_free)(void *) = free;
void *(*pcre_stack_malloc)(size_t) = malloc;
void (*pcre_stack_free)(void *) = free;
int (*pcre_callout)(pcre_callout_block *) = NULL;
#endif
#endif
362
363
364 /*************************************************
365 * Macros and tables for character handling *
366 *************************************************/
367
/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
byte-mode, and more complicated ones for UTF-8 characters.

Each macro below expands to one or more complete statements, not to an
expression, so an invocation cannot safely be used as the sole body of an
unbraced if, else or loop. In the UTF-8 forms the eptr argument is expanded
more than once, so it should be a plain pointer variable. */

#ifndef SUPPORT_UTF8

/* Byte-only build: every character is exactly one byte. GETCHARLEN leaves len
untouched and BACKCHAR expands to nothing. */

#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else /* SUPPORT_UTF8 */

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. A first byte with its two top bits set starts a
multi-byte character: utf8_table4 gives the number of bytes that follow and
utf8_table3 the mask for the data bits in the first byte. Each following byte
contributes its low six bits; those bytes are not checked for validity. */

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. On exit eptr points just past the last byte of the
character. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
The test reads md->utf8, so a variable called md must be in scope wherever this
macro is used. */

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. The
number of additional bytes is added to len; for a one-byte character len is
left unchanged. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    len += gcaa; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. Called only in UTF-8 mode. A byte whose top two bits are 10 is treated
as being inside a character, so the pointer steps back over such bytes. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#endif
457
458
459
460 /*************************************************
461 * Default character tables *
462 *************************************************/
463
464 /* A default set of character tables is included in the PCRE binary. Its source
465 is built by the maketables auxiliary program, which uses the default C ctypes
466 functions, and put in the file chartables.c. These tables are used by PCRE
467 whenever the caller of pcre_compile() does not provide an alternate set of
468 tables. */
469
470 #include "chartables.c"
471
472
473
474 #ifdef SUPPORT_UTF8
475 /*************************************************
476 * Tables for UTF-8 support *
477 *************************************************/
478
/* These are the breakpoints for different numbers of bytes in a UTF-8
character: entry i is the largest value that can be encoded using i additional
bytes (that is, i+1 bytes in total). */

static const int utf8_table1[] =
  { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes. */

static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};

/* Table of the number of extra bytes, indexed by the first byte masked with
0x3f (64 entries). The highest number for a valid UTF-8 character is in fact
0x3d. */

static const uschar utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
500
501
502 /*************************************************
503 * Convert character value to UTF-8 *
504 *************************************************/
505
506 /* This function takes an integer value in the range 0 - 0x7fffffff
507 and encodes it as a UTF-8 character in 0 to 6 bytes.
508
509 Arguments:
510 cvalue the character value
511 buffer pointer to buffer for result - at least 6 bytes long
512
513 Returns: number of characters placed in the buffer
514 */
515
516 static int
517 ord2utf8(int cvalue, uschar *buffer)
518 {
519 register int i, j;
520 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
521 if (cvalue <= utf8_table1[i]) break;
522 buffer += i;
523 for (j = i; j > 0; j--)
524 {
525 *buffer-- = 0x80 | (cvalue & 0x3f);
526 cvalue >>= 6;
527 }
528 *buffer = utf8_table2[i] | cvalue;
529 return i + 1;
530 }
531 #endif
532
533
534
535 /*************************************************
536 * Print compiled regex *
537 *************************************************/
538
539 /* The code for doing this is held in a separate file that is also included in
540 pcretest.c. It defines a function called print_internals(). */
541
542 #ifdef DEBUG
543 #include "printint.c"
544 #endif
545
546
547
548 /*************************************************
549 * Return version string *
550 *************************************************/
551
552 #define STRING(a) # a
553 #define XSTRING(s) STRING(s)
554
555 EXPORT const char *
556 pcre_version(void)
557 {
558 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
559 }
560
561
562
563
564 /*************************************************
565 * Flip bytes in an integer *
566 *************************************************/
567
/* This function is called when the magic number in a regex doesn't match, in
order to flip its bytes to see if we are dealing with a pattern that was
compiled on a host of different endianness. If so, this function is used to
flip other byte values.

The low-order n bytes of the value are reversed and any higher-order bytes are
discarded. The work is done in unsigned arithmetic, so no signed shift can
overflow, and n is not restricted to 2 or 4: a field as wide as a long int
(passed as sizeof(field)) is reversed in full.

Arguments:
  value     the number to flip
  n         the number of bytes to flip (normally the sizeof() of the field
              the value came from)

Returns:    the flipped value
*/

static long int
byteflip(long int value, int n)
{
  unsigned long int remaining = (unsigned long int)value;
  unsigned long int flipped = 0;
  int i;

  /* Peel bytes off the low end of the input and push them onto the low end of
  the result, so the first byte taken ends up as the most significant. */

  for (i = 0; i < n; i++)
    {
    flipped = (flipped << 8) | (remaining & 0xff);
    remaining >>= 8;
    }

  return (long int)flipped;
}
589
590
591
592 /*************************************************
593 * Test for a byte-flipped compiled regex *
594 *************************************************/
595
596 /* This function is called from pce_exec() and also from pcre_fullinfo(). Its
597 job is to test whether the regex is byte-flipped - that is, it was compiled on
598 a system of opposite endianness. The function is called only when the native
599 MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
600 relevant values into a different data block, and return it.
601
602 Arguments:
603 re points to the regex
604 study points to study data, or NULL
605 internal_re points to a new regex block
606 internal_study points to a new study block
607
608 Returns: the new block if is is indeed a byte-flipped regex
609 NULL if it is not
610 */
611
612 static real_pcre *
613 try_flipped(const real_pcre *re, real_pcre *internal_re,
614 const pcre_study_data *study, pcre_study_data *internal_study)
615 {
616 if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
617 return NULL;
618
619 *internal_re = *re; /* To copy other fields */
620 internal_re->size = byteflip(re->size, sizeof(re->size));
621 internal_re->options = byteflip(re->options, sizeof(re->options));
622 internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
623 internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
624 internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
625 internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
626 internal_re->name_table_offset = byteflip(re->name_table_offset,
627 sizeof(re->name_table_offset));
628 internal_re->name_entry_size = byteflip(re->name_entry_size,
629 sizeof(re->name_entry_size));
630 internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
631
632 if (study != NULL)
633 {
634 *internal_study = *study; /* To copy other fields */
635 internal_study->size = byteflip(study->size, sizeof(study->size));
636 internal_study->options = byteflip(study->options, sizeof(study->options));
637 }
638
639 return internal_re;
640 }
641
642
643
644 /*************************************************
645 * (Obsolete) Return info about compiled pattern *
646 *************************************************/
647
648 /* This is the original "info" function. It picks potentially useful data out
649 of the private structure, but its interface was too rigid. It remains for
650 backwards compatibility. The public options are passed back in an int - though
651 the re->options field has been expanded to a long int, all the public options
652 at the low end of it, and so even on 16-bit systems this will still be OK.
653 Therefore, I haven't changed the API for pcre_info().
654
655 Arguments:
656 argument_re points to compiled code
657 optptr where to pass back the options
658 first_byte where to pass back the first character,
659 or -1 if multiline and all branches start ^,
660 or -2 otherwise
661
662 Returns: number of capturing subpatterns
663 or negative values on error
664 */
665
666 EXPORT int
667 pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
668 {
669 real_pcre internal_re;
670 const real_pcre *re = (const real_pcre *)argument_re;
671 if (re == NULL) return PCRE_ERROR_NULL;
672 if (re->magic_number != MAGIC_NUMBER)
673 {
674 re = try_flipped(re, &internal_re, NULL, NULL);
675 if (re == NULL) return PCRE_ERROR_BADMAGIC;
676 }
677 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
678 if (first_byte != NULL)
679 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
680 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
681 return re->top_bracket;
682 }
683
684
685
686 /*************************************************
687 * Return info about compiled pattern *
688 *************************************************/
689
690 /* This is a newer "info" function which has an extensible interface so
691 that additional items can be added compatibly.
692
693 Arguments:
694 argument_re points to compiled code
695 extra_data points extra data, or NULL
696 what what information is required
697 where where to put the information
698
699 Returns: 0 if data returned, negative on error
700 */
701
702 EXPORT int
703 pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
704 void *where)
705 {
706 real_pcre internal_re;
707 pcre_study_data internal_study;
708 const real_pcre *re = (const real_pcre *)argument_re;
709 const pcre_study_data *study = NULL;
710
711 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
712
713 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
714 study = (const pcre_study_data *)extra_data->study_data;
715
716 if (re->magic_number != MAGIC_NUMBER)
717 {
718 re = try_flipped(re, &internal_re, study, &internal_study);
719 if (re == NULL) return PCRE_ERROR_BADMAGIC;
720 if (study != NULL) study = &internal_study;
721 }
722
723 switch (what)
724 {
725 case PCRE_INFO_OPTIONS:
726 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
727 break;
728
729 case PCRE_INFO_SIZE:
730 *((size_t *)where) = re->size;
731 break;
732
733 case PCRE_INFO_STUDYSIZE:
734 *((size_t *)where) = (study == NULL)? 0 : study->size;
735 break;
736
737 case PCRE_INFO_CAPTURECOUNT:
738 *((int *)where) = re->top_bracket;
739 break;
740
741 case PCRE_INFO_BACKREFMAX:
742 *((int *)where) = re->top_backref;
743 break;
744
745 case PCRE_INFO_FIRSTBYTE:
746 *((int *)where) =
747 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
748 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
749 break;
750
751 /* Make sure we pass back the pointer to the bit vector in the external
752 block, not the internal copy (with flipped integer fields). */
753
754 case PCRE_INFO_FIRSTTABLE:
755 *((const uschar **)where) =
756 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
757 ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
758 break;
759
760 case PCRE_INFO_LASTLITERAL:
761 *((int *)where) =
762 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
763 break;
764
765 case PCRE_INFO_NAMEENTRYSIZE:
766 *((int *)where) = re->name_entry_size;
767 break;
768
769 case PCRE_INFO_NAMECOUNT:
770 *((int *)where) = re->name_count;
771 break;
772
773 case PCRE_INFO_NAMETABLE:
774 *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
775 break;
776
777 case PCRE_INFO_DEFAULT_TABLES:
778 *((const uschar **)where) = (const uschar *)pcre_default_tables;
779 break;
780
781 default: return PCRE_ERROR_BADOPTION;
782 }
783
784 return 0;
785 }
786
787
788
789 /*************************************************
790 * Return info about what features are configured *
791 *************************************************/
792
793 /* This is function which has an extensible interface so that additional items
794 can be added compatibly.
795
796 Arguments:
797 what what information is required
798 where where to put the information
799
800 Returns: 0 if data returned, negative on error
801 */
802
803 EXPORT int
804 pcre_config(int what, void *where)
805 {
806 switch (what)
807 {
808 case PCRE_CONFIG_UTF8:
809 #ifdef SUPPORT_UTF8
810 *((int *)where) = 1;
811 #else
812 *((int *)where) = 0;
813 #endif
814 break;
815
816 case PCRE_CONFIG_UNICODE_PROPERTIES:
817 #ifdef SUPPORT_UCP
818 *((int *)where) = 1;
819 #else
820 *((int *)where) = 0;
821 #endif
822 break;
823
824 case PCRE_CONFIG_NEWLINE:
825 *((int *)where) = NEWLINE;
826 break;
827
828 case PCRE_CONFIG_LINK_SIZE:
829 *((int *)where) = LINK_SIZE;
830 break;
831
832 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
833 *((int *)where) = POSIX_MALLOC_THRESHOLD;
834 break;
835
836 case PCRE_CONFIG_MATCH_LIMIT:
837 *((unsigned int *)where) = MATCH_LIMIT;
838 break;
839
840 case PCRE_CONFIG_STACKRECURSE:
841 #ifdef NO_RECURSE
842 *((int *)where) = 0;
843 #else
844 *((int *)where) = 1;
845 #endif
846 break;
847
848 default: return PCRE_ERROR_BADOPTION;
849 }
850
851 return 0;
852 }
853
854
855
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Write a run of characters to stdout in a readable form: printable
characters appear as themselves and everything else as a \xhh escape. When the
characters come from the subject string, the run is cut short at the end of
the subject.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
/* Never run past the end of the subject. */

if (is_subject && length > md->end_subject - p) length = md->end_subject - p;

for (; length > 0; length--)
  {
  int ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
882
883
884
885
886 /*************************************************
887 * Handle escapes *
888 *************************************************/
889
890 /* This function is called when a \ has been encountered. It either returns a
891 positive value for a simple escape such as \n, or a negative value which
892 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
893 a positive value greater than 255 may be returned. On entry, ptr is pointing at
894 the \. On exit, it is on the final character of the escape sequence.
895
896 Arguments:
897 ptrptr points to the pattern position pointer
898 errorptr points to the pointer to the error message
899 bracount number of previous extracting brackets
900 options the options bits
901 isclass TRUE if inside a character class
902
903 Returns: zero or positive => a data character
904 negative => a special escape sequence
905 on error, errorptr is set
906 */
907
908 static int
909 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
910 int options, BOOL isclass)
911 {
912 const uschar *ptr = *ptrptr;
913 int c, i;
914
915 /* If backslash is at the end of the pattern, it's an error. */
916
917 c = *(++ptr);
918 if (c == 0) *errorptr = ERR1;
919
920 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
921 a table. A non-zero result is something that can be returned immediately.
922 Otherwise further processing may be required. */
923
924 #if !EBCDIC /* ASCII coding */
925 else if (c < '0' || c > 'z') {} /* Not alphameric */
926 else if ((i = escapes[c - '0']) != 0) c = i;
927
928 #else /* EBCDIC coding */
929 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
930 else if ((i = escapes[c - 0x48]) != 0) c = i;
931 #endif
932
933 /* Escapes that need further processing, or are illegal. */
934
935 else
936 {
937 const uschar *oldptr;
938 switch (c)
939 {
940 /* A number of Perl escapes are not handled by PCRE. We give an explicit
941 error. */
942
943 case 'l':
944 case 'L':
945 case 'N':
946 case 'u':
947 case 'U':
948 *errorptr = ERR37;
949 break;
950
951 /* The handling of escape sequences consisting of a string of digits
952 starting with one that is not zero is not straightforward. By experiment,
953 the way Perl works seems to be as follows:
954
955 Outside a character class, the digits are read as a decimal number. If the
956 number is less than 10, or if there are that many previous extracting
957 left brackets, then it is a back reference. Otherwise, up to three octal
958 digits are read to form an escaped byte. Thus \123 is likely to be octal
959 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
960 value is greater than 377, the least significant 8 bits are taken. Inside a
961 character class, \ followed by a digit is always an octal number. */
962
963 case '1': case '2': case '3': case '4': case '5':
964 case '6': case '7': case '8': case '9':
965
966 if (!isclass)
967 {
968 oldptr = ptr;
969 c -= '0';
970 while ((digitab[ptr[1]] & ctype_digit) != 0)
971 c = c * 10 + *(++ptr) - '0';
972 if (c < 10 || c <= bracount)
973 {
974 c = -(ESC_REF + c);
975 break;
976 }
977 ptr = oldptr; /* Put the pointer back and fall through */
978 }
979
980 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
981 generates a binary zero byte and treats the digit as a following literal.
982 Thus we have to pull back the pointer by one. */
983
984 if ((c = *ptr) >= '8')
985 {
986 ptr--;
987 c = 0;
988 break;
989 }
990
991 /* \0 always starts an octal number, but we may drop through to here with a
992 larger first octal digit. */
993
994 case '0':
995 c -= '0';
996 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
997 c = c * 8 + *(++ptr) - '0';
998 c &= 255; /* Take least significant 8 bits */
999 break;
1000
1001 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1002 which can be greater than 0xff, but only if the ddd are hex digits. */
1003
1004 case 'x':
1005 #ifdef SUPPORT_UTF8
1006 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1007 {
1008 const uschar *pt = ptr + 2;
1009 register int count = 0;
1010 c = 0;
1011 while ((digitab[*pt] & ctype_xdigit) != 0)
1012 {
1013 int cc = *pt++;
1014 count++;
1015 #if !EBCDIC /* ASCII coding */
1016 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1017 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1018 #else /* EBCDIC coding */
1019 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
1020 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1021 #endif
1022 }
1023 if (*pt == '}')
1024 {
1025 if (c < 0 || count > 8) *errorptr = ERR34;
1026 ptr = pt;
1027 break;
1028 }
1029 /* If the sequence of hex digits does not end with '}', then we don't
1030 recognize this construct; fall through to the normal \x handling. */
1031 }
1032 #endif
1033
1034 /* Read just a single hex char */
1035
1036 c = 0;
1037 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1038 {
1039 int cc; /* Some compilers don't like ++ */
1040 cc = *(++ptr); /* in initializers */
1041 #if !EBCDIC /* ASCII coding */
1042 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1043 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1044 #else /* EBCDIC coding */
1045 if (cc <= 'z') cc += 64; /* Convert to upper case */
1046 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1047 #endif
1048 }
1049 break;
1050
1051 /* Other special escapes not starting with a digit are straightforward */
1052
1053 case 'c':
1054 c = *(++ptr);
1055 if (c == 0)
1056 {
1057 *errorptr = ERR2;
1058 return 0;
1059 }
1060
1061 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1062 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1063 (However, an EBCDIC equivalent has now been added.) */
1064
1065 #if !EBCDIC /* ASCII coding */
1066 if (c >= 'a' && c <= 'z') c -= 32;
1067 c ^= 0x40;
1068 #else /* EBCDIC coding */
1069 if (c >= 'a' && c <= 'z') c += 64;
1070 c ^= 0xC0;
1071 #endif
1072 break;
1073
1074 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1075 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1076 for Perl compatibility, it is a literal. This code looks a bit odd, but
1077 there used to be some cases other than the default, and there may be again
1078 in future, so I haven't "optimized" it. */
1079
1080 default:
1081 if ((options & PCRE_EXTRA) != 0) switch(c)
1082 {
1083 default:
1084 *errorptr = ERR3;
1085 break;
1086 }
1087 break;
1088 }
1089 }
1090
1091 *ptrptr = ptr;
1092 return c;
1093 }
1094
1095
1096
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

Argument:
  ptrptr     points to the pattern position pointer
  negptr     points to a boolean that is set TRUE for negation else FALSE;
             it is always written, including on the error paths
  errorptr   points to the pointer to the error message

Returns:     value from ucp_type_table, or -1 for an invalid type
             (with *errorptr set to ERR46 for a malformed sequence or ERR47
             for a well-formed but unrecognized name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[4];

/* Give the negation flag a defined value before anything can fail, so that
the caller never sees an unset boolean. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a one- or two-character name in {}, optionally
preceded by ^ for negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Up to three characters are examined: at most two name characters and
  the closing brace. name[] has room for these plus a terminating zero. */

  for (i = 0; i <= 2; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }

  if (c !='}')   /* Try to distinguish error cases */
    {
    while (*(++ptr) != 0 && *ptr != '}');
    if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
    }
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = sizeof(utt)/sizeof(ucp_type_table);

while (bot < top)
  {
  i = (bot + top)/2;
  c = strcmp(name, utt[i].name);
  if (c == 0) return utt[i].value;
  if (c > 0) bot = i + 1; else top = i;
  }

/* The name is syntactically acceptable but not in the table. */

UNKNOWN_RETURN:
*errorptr = ERR47;
*ptrptr = ptr;
return -1;

/* The sequence itself is malformed. */

ERROR_RETURN:
*errorptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
1186
1187
1188
1189
1190 /*************************************************
1191 * Check for counted repeat *
1192 *************************************************/
1193
1194 /* This function is called when a '{' is encountered in a place where it might
1195 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1196 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1197 where the ddds are digits.
1198
1199 Arguments:
1200 p pointer to the first char after '{'
1201
1202 Returns: TRUE or FALSE
1203 */
1204
1205 static BOOL
1206 is_counted_repeat(const uschar *p)
1207 {
1208 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1209 while ((digitab[*p] & ctype_digit) != 0) p++;
1210 if (*p == '}') return TRUE;
1211
1212 if (*p++ != ',') return FALSE;
1213 if (*p == '}') return TRUE;
1214
1215 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1216 while ((digitab[*p] & ctype_digit) != 0) p++;
1217
1218 return (*p == '}');
1219 }
1220
1221
1222
1223 /*************************************************
1224 * Read repeat counts *
1225 *************************************************/
1226
1227 /* Read an item of the form {n,m} and return the values. This is called only
1228 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1229 so the syntax is guaranteed to be correct, but we need to check the values.
1230
1231 Arguments:
1232 p pointer to first char after '{'
1233 minp pointer to int for min
1234 maxp pointer to int for max
1235 returned as -1 if no max
1236 errorptr points to pointer to error message
1237
1238 Returns: pointer to '}' on success;
1239 current ptr on error, with errorptr set
1240 */
1241
1242 static const uschar *
1243 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1244 {
1245 int min = 0;
1246 int max = -1;
1247
1248 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1249
1250 if (*p == '}') max = min; else
1251 {
1252 if (*(++p) != '}')
1253 {
1254 max = 0;
1255 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1256 if (max < min)
1257 {
1258 *errorptr = ERR4;
1259 return p;
1260 }
1261 }
1262 }
1263
1264 /* Do paranoid checks, then fill in the required variables, and pass back the
1265 pointer to the terminating '}'. */
1266
1267 if (min > 65535 || max > 65535)
1268 *errorptr = ERR5;
1269 else
1270 {
1271 *minp = min;
1272 *maxp = max;
1273 }
1274 return p;
1275 }
1276
1277
1278
1279 /*************************************************
1280 * Find first significant op code *
1281 *************************************************/
1282
1283 /* This is called by several functions that scan a compiled expression looking
1284 for a fixed first character, or an anchoring op code etc. It skips over things
1285 that do not influence this. For some calls, a change of option is important.
1286 For some calls, it makes sense to skip negative forward and all backward
1287 assertions, and also the \b assertion; for others it does not.
1288
1289 Arguments:
1290 code pointer to the start of the group
1291 options pointer to external options
1292 optbit the option bit whose changing is significant, or
1293 zero if none are
1294 skipassert TRUE if certain assertions are to be skipped
1295
1296 Returns: pointer to the first significant opcode
1297 */
1298
1299 static const uschar*
1300 first_significant_code(const uschar *code, int *options, int optbit,
1301 BOOL skipassert)
1302 {
1303 for (;;)
1304 {
1305 switch ((int)*code)
1306 {
1307 case OP_OPT:
1308 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1309 *options = (int)code[1];
1310 code += 2;
1311 break;
1312
1313 case OP_ASSERT_NOT:
1314 case OP_ASSERTBACK:
1315 case OP_ASSERTBACK_NOT:
1316 if (!skipassert) return code;
1317 do code += GET(code, 1); while (*code == OP_ALT);
1318 code += OP_lengths[*code];
1319 break;
1320
1321 case OP_WORD_BOUNDARY:
1322 case OP_NOT_WORD_BOUNDARY:
1323 if (!skipassert) return code;
1324 /* Fall through */
1325
1326 case OP_CALLOUT:
1327 case OP_CREF:
1328 case OP_BRANUMBER:
1329 code += OP_lengths[*code];
1330 break;
1331
1332 default:
1333 return code;
1334 }
1335 }
1336 /* Control never reaches here */
1337 }
1338
1339
1340
1341
1342 /*************************************************
1343 * Find the fixed length of a pattern *
1344 *************************************************/
1345
1346 /* Scan a pattern and compute the fixed length of subject that will match it,
1347 if the length is fixed. This is needed for dealing with backward assertions.
1348 In UTF8 mode, the result is in characters rather than bytes.
1349
1350 Arguments:
1351 code points to the start of the pattern (the bracket)
1352 options the compiling options
1353
1354 Returns: the fixed length, or -1 if there is no fixed length,
1355 or -2 if \C was encountered
1356 */
1357
1358 static int
1359 find_fixedlength(uschar *code, int options)
1360 {
1361 int length = -1;
1362
1363 register int branchlength = 0;
1364 register uschar *cc = code + 1 + LINK_SIZE;
1365
1366 /* Scan along the opcodes for this branch. If we get to the end of the
1367 branch, check the length against that of the other branches. */
1368
1369 for (;;)
1370 {
1371 int d;
1372 register int op = *cc;
1373 if (op >= OP_BRA) op = OP_BRA;
1374
1375 switch (op)
1376 {
1377 case OP_BRA:
1378 case OP_ONCE:
1379 case OP_COND:
1380 d = find_fixedlength(cc, options);
1381 if (d < 0) return d;
1382 branchlength += d;
1383 do cc += GET(cc, 1); while (*cc == OP_ALT);
1384 cc += 1 + LINK_SIZE;
1385 break;
1386
1387 /* Reached end of a branch; if it's a ket it is the end of a nested
1388 call. If it's ALT it is an alternation in a nested call. If it is
1389 END it's the end of the outer call. All can be handled by the same code. */
1390
1391 case OP_ALT:
1392 case OP_KET:
1393 case OP_KETRMAX:
1394 case OP_KETRMIN:
1395 case OP_END:
1396 if (length < 0) length = branchlength;
1397 else if (length != branchlength) return -1;
1398 if (*cc != OP_ALT) return length;
1399 cc += 1 + LINK_SIZE;
1400 branchlength = 0;
1401 break;
1402
1403 /* Skip over assertive subpatterns */
1404
1405 case OP_ASSERT:
1406 case OP_ASSERT_NOT:
1407 case OP_ASSERTBACK:
1408 case OP_ASSERTBACK_NOT:
1409 do cc += GET(cc, 1); while (*cc == OP_ALT);
1410 /* Fall through */
1411
1412 /* Skip over things that don't match chars */
1413
1414 case OP_REVERSE:
1415 case OP_BRANUMBER:
1416 case OP_CREF:
1417 case OP_OPT:
1418 case OP_CALLOUT:
1419 case OP_SOD:
1420 case OP_SOM:
1421 case OP_EOD:
1422 case OP_EODN:
1423 case OP_CIRC:
1424 case OP_DOLL:
1425 case OP_NOT_WORD_BOUNDARY:
1426 case OP_WORD_BOUNDARY:
1427 cc += OP_lengths[*cc];
1428 break;
1429
1430 /* Handle literal characters */
1431
1432 case OP_CHAR:
1433 case OP_CHARNC:
1434 branchlength++;
1435 cc += 2;
1436 #ifdef SUPPORT_UTF8
1437 if ((options & PCRE_UTF8) != 0)
1438 {
1439 while ((*cc & 0xc0) == 0x80) cc++;
1440 }
1441 #endif
1442 break;
1443
1444 /* Handle exact repetitions. The count is already in characters, but we
1445 need to skip over a multibyte character in UTF8 mode. */
1446
1447 case OP_EXACT:
1448 branchlength += GET2(cc,1);
1449 cc += 4;
1450 #ifdef SUPPORT_UTF8
1451 if ((options & PCRE_UTF8) != 0)
1452 {
1453 while((*cc & 0x80) == 0x80) cc++;
1454 }
1455 #endif
1456 break;
1457
1458 case OP_TYPEEXACT:
1459 branchlength += GET2(cc,1);
1460 cc += 4;
1461 break;
1462
1463 /* Handle single-char matchers */
1464
1465 case OP_PROP:
1466 case OP_NOTPROP:
1467 cc++;
1468 /* Fall through */
1469
1470 case OP_NOT_DIGIT:
1471 case OP_DIGIT:
1472 case OP_NOT_WHITESPACE:
1473 case OP_WHITESPACE:
1474 case OP_NOT_WORDCHAR:
1475 case OP_WORDCHAR:
1476 case OP_ANY:
1477 branchlength++;
1478 cc++;
1479 break;
1480
1481 /* The single-byte matcher isn't allowed */
1482
1483 case OP_ANYBYTE:
1484 return -2;
1485
1486 /* Check a class for variable quantification */
1487
1488 #ifdef SUPPORT_UTF8
1489 case OP_XCLASS:
1490 cc += GET(cc, 1) - 33;
1491 /* Fall through */
1492 #endif
1493
1494 case OP_CLASS:
1495 case OP_NCLASS:
1496 cc += 33;
1497
1498 switch (*cc)
1499 {
1500 case OP_CRSTAR:
1501 case OP_CRMINSTAR:
1502 case OP_CRQUERY:
1503 case OP_CRMINQUERY:
1504 return -1;
1505
1506 case OP_CRRANGE:
1507 case OP_CRMINRANGE:
1508 if (GET2(cc,1) != GET2(cc,3)) return -1;
1509 branchlength += GET2(cc,1);
1510 cc += 5;
1511 break;
1512
1513 default:
1514 branchlength++;
1515 }
1516 break;
1517
1518 /* Anything else is variable length */
1519
1520 default:
1521 return -1;
1522 }
1523 }
1524 /* Control never gets here */
1525 }
1526
1527
1528
1529
1530 /*************************************************
1531 * Scan compiled regex for numbered bracket *
1532 *************************************************/
1533
1534 /* This little function scans through a compiled pattern until it finds a
1535 capturing bracket with the given number.
1536
1537 Arguments:
1538 code points to start of expression
1539 utf8 TRUE in UTF-8 mode
1540 number the required bracket number
1541
1542 Returns: pointer to the opcode for the bracket, or NULL if not found
1543 */
1544
1545 static const uschar *
1546 find_bracket(const uschar *code, BOOL utf8, int number)
1547 {
1548 #ifndef SUPPORT_UTF8
1549 utf8 = utf8; /* Stop pedantic compilers complaining */
1550 #endif
1551
1552 for (;;)
1553 {
1554 register int c = *code;
1555 if (c == OP_END) return NULL;
1556 else if (c > OP_BRA)
1557 {
1558 int n = c - OP_BRA;
1559 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1560 if (n == number) return (uschar *)code;
1561 code += OP_lengths[OP_BRA];
1562 }
1563 else
1564 {
1565 code += OP_lengths[c];
1566
1567 #ifdef SUPPORT_UTF8
1568
1569 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1570 by a multi-byte character. The length in the table is a minimum, so we have
1571 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1572 can use relatively efficient code. */
1573
1574 if (utf8) switch(c)
1575 {
1576 case OP_CHAR:
1577 case OP_CHARNC:
1578 case OP_EXACT:
1579 case OP_UPTO:
1580 case OP_MINUPTO:
1581 case OP_STAR:
1582 case OP_MINSTAR:
1583 case OP_PLUS:
1584 case OP_MINPLUS:
1585 case OP_QUERY:
1586 case OP_MINQUERY:
1587 while ((*code & 0xc0) == 0x80) code++;
1588 break;
1589
1590 /* XCLASS is used for classes that cannot be represented just by a bit
1591 map. This includes negated single high-valued characters. The length in
1592 the table is zero; the actual length is stored in the compiled code. */
1593
1594 case OP_XCLASS:
1595 code += GET(code, 1) + 1;
1596 break;
1597 }
1598 #endif
1599 }
1600 }
1601 }
1602
1603
1604
1605 /*************************************************
1606 * Scan compiled regex for recursion reference *
1607 *************************************************/
1608
1609 /* This little function scans through a compiled pattern until it finds an
1610 instance of OP_RECURSE.
1611
1612 Arguments:
1613 code points to start of expression
1614 utf8 TRUE in UTF-8 mode
1615
1616 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1617 */
1618
1619 static const uschar *
1620 find_recurse(const uschar *code, BOOL utf8)
1621 {
1622 #ifndef SUPPORT_UTF8
1623 utf8 = utf8; /* Stop pedantic compilers complaining */
1624 #endif
1625
1626 for (;;)
1627 {
1628 register int c = *code;
1629 if (c == OP_END) return NULL;
1630 else if (c == OP_RECURSE) return code;
1631 else if (c > OP_BRA)
1632 {
1633 code += OP_lengths[OP_BRA];
1634 }
1635 else
1636 {
1637 code += OP_lengths[c];
1638
1639 #ifdef SUPPORT_UTF8
1640
1641 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1642 by a multi-byte character. The length in the table is a minimum, so we have
1643 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1644 can use relatively efficient code. */
1645
1646 if (utf8) switch(c)
1647 {
1648 case OP_CHAR:
1649 case OP_CHARNC:
1650 case OP_EXACT:
1651 case OP_UPTO:
1652 case OP_MINUPTO:
1653 case OP_STAR:
1654 case OP_MINSTAR:
1655 case OP_PLUS:
1656 case OP_MINPLUS:
1657 case OP_QUERY:
1658 case OP_MINQUERY:
1659 while ((*code & 0xc0) == 0x80) code++;
1660 break;
1661
1662 /* XCLASS is used for classes that cannot be represented just by a bit
1663 map. This includes negated single high-valued characters. The length in
1664 the table is zero; the actual length is stored in the compiled code. */
1665
1666 case OP_XCLASS:
1667 code += GET(code, 1) + 1;
1668 break;
1669 }
1670 #endif
1671 }
1672 }
1673 }
1674
1675
1676
1677 /*************************************************
1678 * Scan compiled branch for non-emptiness *
1679 *************************************************/
1680
1681 /* This function scans through a branch of a compiled pattern to see whether it
1682 can match the empty string or not. It is called only from could_be_empty()
1683 below. Note that first_significant_code() skips over assertions. If we hit an
1684 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1685 whose current branch will already have been scanned.
1686
1687 Arguments:
1688 code points to start of search
1689 endcode points to where to stop
1690 utf8 TRUE if in UTF8 mode
1691
1692 Returns: TRUE if what is matched could be empty
1693 */
1694
1695 static BOOL
1696 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1697 {
1698 register int c;
1699 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1700 code < endcode;
1701 code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1702 {
1703 const uschar *ccode;
1704
1705 c = *code;
1706
1707 if (c >= OP_BRA)
1708 {
1709 BOOL empty_branch;
1710 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1711
1712 /* Scan a closed bracket */
1713
1714 empty_branch = FALSE;
1715 do
1716 {
1717 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1718 empty_branch = TRUE;
1719 code += GET(code, 1);
1720 }
1721 while (*code == OP_ALT);
1722 if (!empty_branch) return FALSE; /* All branches are non-empty */
1723 code += 1 + LINK_SIZE;
1724 c = *code;
1725 }
1726
1727 else switch (c)
1728 {
1729 /* Check for quantifiers after a class */
1730
1731 #ifdef SUPPORT_UTF8
1732 case OP_XCLASS:
1733 ccode = code + GET(code, 1);
1734 goto CHECK_CLASS_REPEAT;
1735 #endif
1736
1737 case OP_CLASS:
1738 case OP_NCLASS:
1739 ccode = code + 33;
1740
1741 #ifdef SUPPORT_UTF8
1742 CHECK_CLASS_REPEAT:
1743 #endif
1744
1745 switch (*ccode)
1746 {
1747 case OP_CRSTAR: /* These could be empty; continue */
1748 case OP_CRMINSTAR:
1749 case OP_CRQUERY:
1750 case OP_CRMINQUERY:
1751 break;
1752
1753 default: /* Non-repeat => class must match */
1754 case OP_CRPLUS: /* These repeats aren't empty */
1755 case OP_CRMINPLUS:
1756 return FALSE;
1757
1758 case OP_CRRANGE:
1759 case OP_CRMINRANGE:
1760 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1761 break;
1762 }
1763 break;
1764
1765 /* Opcodes that must match a character */
1766
1767 case OP_PROP:
1768 case OP_NOTPROP:
1769 case OP_EXTUNI:
1770 case OP_NOT_DIGIT:
1771 case OP_DIGIT:
1772 case OP_NOT_WHITESPACE:
1773 case OP_WHITESPACE:
1774 case OP_NOT_WORDCHAR:
1775 case OP_WORDCHAR:
1776 case OP_ANY:
1777 case OP_ANYBYTE:
1778 case OP_CHAR:
1779 case OP_CHARNC:
1780 case OP_NOT:
1781 case OP_PLUS:
1782 case OP_MINPLUS:
1783 case OP_EXACT:
1784 case OP_NOTPLUS:
1785 case OP_NOTMINPLUS:
1786 case OP_NOTEXACT:
1787 case OP_TYPEPLUS:
1788 case OP_TYPEMINPLUS:
1789 case OP_TYPEEXACT:
1790 return FALSE;
1791
1792 /* End of branch */
1793
1794 case OP_KET:
1795 case OP_KETRMAX:
1796 case OP_KETRMIN:
1797 case OP_ALT:
1798 return TRUE;
1799
1800 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1801 followed by a multibyte character */
1802
1803 #ifdef SUPPORT_UTF8
1804 case OP_STAR:
1805 case OP_MINSTAR:
1806 case OP_QUERY:
1807 case OP_MINQUERY:
1808 case OP_UPTO:
1809 case OP_MINUPTO:
1810 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1811 break;
1812 #endif
1813 }
1814 }
1815
1816 return TRUE;
1817 }
1818
1819
1820
1821 /*************************************************
1822 * Scan compiled regex for non-emptiness *
1823 *************************************************/
1824
1825 /* This function is called to check for left recursive calls. We want to check
1826 the current branch of the current pattern to see if it could match the empty
1827 string. If it could, we must look outwards for branches at other levels,
1828 stopping when we pass beyond the bracket which is the subject of the recursion.
1829
1830 Arguments:
1831 code points to start of the recursion
1832 endcode points to where to stop (current RECURSE item)
1833 bcptr points to the chain of current (unclosed) branch starts
1834 utf8 TRUE if in UTF-8 mode
1835
1836 Returns: TRUE if what is matched could be empty
1837 */
1838
1839 static BOOL
1840 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1841 BOOL utf8)
1842 {
1843 while (bcptr != NULL && bcptr->current >= code)
1844 {
1845 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1846 bcptr = bcptr->outer;
1847 }
1848 return TRUE;
1849 }
1850
1851
1852
1853 /*************************************************
1854 * Check for POSIX class syntax *
1855 *************************************************/
1856
1857 /* This function is called when the sequence "[:" or "[." or "[=" is
1858 encountered in a character class. It checks whether this is followed by an
1859 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1860 ".]" or "=]".
1861
1862 Argument:
1863 ptr pointer to the initial [
1864 endptr where to return the end pointer
1865 cd pointer to compile data
1866
1867 Returns: TRUE or FALSE
1868 */
1869
1870 static BOOL
1871 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1872 {
1873 int terminator; /* Don't combine these lines; the Solaris cc */
1874 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1875 if (*(++ptr) == '^') ptr++;
1876 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1877 if (*ptr == terminator && ptr[1] == ']')
1878 {
1879 *endptr = ptr;
1880 return TRUE;
1881 }
1882 return FALSE;
1883 }
1884
1885
1886
1887
1888 /*************************************************
1889 * Check POSIX class name *
1890 *************************************************/
1891
1892 /* This function is called to check the name given in a POSIX-style class entry
1893 such as [:alnum:].
1894
1895 Arguments:
1896 ptr points to the first letter
1897 len the length of the name
1898
1899 Returns: a value representing the name, or -1 if unknown
1900 */
1901
1902 static int
1903 check_posix_name(const uschar *ptr, int len)
1904 {
1905 register int yield = 0;
1906 while (posix_name_lengths[yield] != 0)
1907 {
1908 if (len == posix_name_lengths[yield] &&
1909 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1910 yield++;
1911 }
1912 return -1;
1913 }
1914
1915
1916 /*************************************************
1917 * Adjust OP_RECURSE items in repeated group *
1918 *************************************************/
1919
1920 /* OP_RECURSE items contain an offset from the start of the regex to the group
1921 that is referenced. This means that groups can be replicated for fixed
1922 repetition simply by copying (because the recursion is allowed to refer to
1923 earlier groups that are outside the current group). However, when a group is
1924 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1925 it, after it has been compiled. This means that any OP_RECURSE items within it
1926 that refer to the group itself or any contained groups have to have their
1927 offsets adjusted. That is the job of this function. Before it is called, the
1928 partially compiled regex must be temporarily terminated with OP_END.
1929
1930 Arguments:
1931 group points to the start of the group
1932 adjust the amount by which the group is to be moved
1933 utf8 TRUE in UTF-8 mode
1934 cd contains pointers to tables etc.
1935
1936 Returns: nothing
1937 */
1938
1939 static void
1940 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1941 {
1942 uschar *ptr = group;
1943 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1944 {
1945 int offset = GET(ptr, 1);
1946 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1947 ptr += 1 + LINK_SIZE;
1948 }
1949 }
1950
1951
1952
1953 /*************************************************
1954 * Insert an automatic callout point *
1955 *************************************************/
1956
1957 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1958 callout points before each pattern item.
1959
1960 Arguments:
1961 code current code pointer
1962 ptr current pattern pointer
1963 cd pointers to tables etc
1964
1965 Returns: new code pointer
1966 */
1967
1968 static uschar *
1969 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1970 {
1971 *code++ = OP_CALLOUT;
1972 *code++ = 255;
1973 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1974 PUT(code, LINK_SIZE, 0); /* Default length */
1975 return code + 2*LINK_SIZE;
1976 }
1977
1978
1979
1980 /*************************************************
1981 * Complete a callout item *
1982 *************************************************/
1983
1984 /* A callout item contains the length of the next item in the pattern, which
1985 we can't fill in till after we have reached the relevant point. This is used
1986 for both automatic and manual callouts.
1987
1988 Arguments:
1989 previous_callout points to previous callout item
1990 ptr current pattern pointer
1991 cd pointers to tables etc
1992
1993 Returns: nothing
1994 */
1995
1996 static void
1997 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1998 {
1999 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2000 PUT(previous_callout, 2 + LINK_SIZE, length);
2001 }
2002
2003
2004
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support),
look upwards through the characters for a run whose other-case equivalents are
consecutive. One such run is handed back per call, and the starting value is
moved on so that the next call carries on from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int c = *cptr;
int chartype, othercase, expected;

/* Find the first character that is reported as ucp_L and has an other case. */

while (c <= d &&
       !(ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0))
  c++;

if (c > d) return FALSE;

*ocptr = othercase;
expected = othercase + 1;

/* Extend the run for as long as each successive character's other case is
exactly one more than the previous one's. */

c++;
while (c <= d &&
       ucp_findchar(c, &chartype, &othercase) == ucp_L &&
       othercase == expected)
  {
  expected++;
  c++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2052
2053
2054 /*************************************************
2055 * Compile one branch *
2056 *************************************************/
2057
2058 /* Scan the pattern, compiling it into the code vector. If the options are
2059 changed during the branch, the pointer is used to change the external options
2060 bits.
2061
2062 Arguments:
2063 optionsptr pointer to the option bits
2064 brackets points to number of extracting brackets used
2065 codeptr points to the pointer to the current code point
2066 ptrptr points to the current pattern pointer
2067 errorptr points to pointer to error message
2068 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2069 reqbyteptr set to the last literal character required, else < 0
2070 bcptr points to current branch chain
2071 cd contains pointers to tables etc.
2072
2073 Returns: TRUE on success
2074 FALSE, with *errorptr set on error
2075 */
2076
2077 static BOOL
2078 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2079 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2080 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2081 {
2082 int repeat_type, op_type;
2083 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2084 int bravalue = 0;
2085 int greedy_default, greedy_non_default;
2086 int firstbyte, reqbyte;
2087 int zeroreqbyte, zerofirstbyte;
2088 int req_caseopt, reqvary, tempreqvary;
2089 int condcount = 0;
2090 int options = *optionsptr;
2091 int after_manual_callout = 0;
2092 register int c;
2093 register uschar *code = *codeptr;
2094 uschar *tempcode;
2095 BOOL inescq = FALSE;
2096 BOOL groupsetfirstbyte = FALSE;
2097 const uschar *ptr = *ptrptr;
2098 const uschar *tempptr;
2099 uschar *previous = NULL;
2100 uschar *previous_callout = NULL;
2101 uschar classbits[32];
2102
2103 #ifdef SUPPORT_UTF8
2104 BOOL class_utf8;
2105 BOOL utf8 = (options & PCRE_UTF8) != 0;
2106 uschar *class_utf8data;
2107 uschar utf8_char[6];
2108 #else
2109 BOOL utf8 = FALSE;
2110 #endif
2111
2112 /* Set up the default and non-default settings for greediness */
2113
2114 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2115 greedy_non_default = greedy_default ^ 1;
2116
2117 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2118 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2119 matches a non-fixed char first char; reqbyte just remains unset if we never
2120 find one.
2121
2122 When we hit a repeat whose minimum is zero, we may have to adjust these values
2123 to take the zero repeat into account. This is implemented by setting them to
2124 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2125 item types that can be repeated set these backoff variables appropriately. */
2126
2127 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2128
2129 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2130 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2131 value > 255. It is added into the firstbyte or reqbyte variables to record the
2132 case status of the value. This is used only for ASCII characters. */
2133
2134 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2135
2136 /* Switch on next character until the end of the branch */
2137
2138 for (;; ptr++)
2139 {
2140 BOOL negate_class;
2141 BOOL possessive_quantifier;
2142 BOOL is_quantifier;
2143 int class_charcount;
2144 int class_lastchar;
2145 int newoptions;
2146 int recno;
2147 int skipbytes;
2148 int subreqbyte;
2149 int subfirstbyte;
2150 int mclength;
2151 uschar mcbuffer[8];
2152
2153 /* Next byte in the pattern */
2154
2155 c = *ptr;
2156
2157 /* If in \Q...\E, check for the end; if not, we have a literal */
2158
2159 if (inescq && c != 0)
2160 {
2161 if (c == '\\' && ptr[1] == 'E')
2162 {
2163 inescq = FALSE;
2164 ptr++;
2165 continue;
2166 }
2167 else
2168 {
2169 if (previous_callout != NULL)
2170 {
2171 complete_callout(previous_callout, ptr, cd);
2172 previous_callout = NULL;
2173 }
2174 if ((options & PCRE_AUTO_CALLOUT) != 0)
2175 {
2176 previous_callout = code;
2177 code = auto_callout(code, ptr, cd);
2178 }
2179 goto NORMAL_CHAR;
2180 }
2181 }
2182
2183 /* Fill in length of a previous callout, except when the next thing is
2184 a quantifier. */
2185
2186 is_quantifier = c == '*' || c == '+' || c == '?' ||
2187 (c == '{' && is_counted_repeat(ptr+1));
2188
2189 if (!is_quantifier && previous_callout != NULL &&
2190 after_manual_callout-- <= 0)
2191 {
2192 complete_callout(previous_callout, ptr, cd);
2193 previous_callout = NULL;
2194 }
2195
2196 /* In extended mode, skip white space and comments */
2197
2198 if ((options & PCRE_EXTENDED) != 0)
2199 {
2200 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2201 if (c == '#')
2202 {
2203 /* The space before the ; is to avoid a warning on a silly compiler
2204 on the Macintosh. */
2205 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2206 if (c != 0) continue; /* Else fall through to handle end of string */
2207 }
2208 }
2209
2210 /* No auto callout for quantifiers. */
2211
2212 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2213 {
2214 previous_callout = code;
2215 code = auto_callout(code, ptr, cd);
2216 }
2217
2218 switch(c)
2219 {
2220 /* The branch terminates at end of string, |, or ). */
2221
2222 case 0:
2223 case '|':
2224 case ')':
2225 *firstbyteptr = firstbyte;
2226 *reqbyteptr = reqbyte;
2227 *codeptr = code;
2228 *ptrptr = ptr;
2229 return TRUE;
2230
2231 /* Handle single-character metacharacters. In multiline mode, ^ disables
2232 the setting of any following char as a first character. */
2233
2234 case '^':
2235 if ((options & PCRE_MULTILINE) != 0)
2236 {
2237 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2238 }
2239 previous = NULL;
2240 *code++ = OP_CIRC;
2241 break;
2242
2243 case '$':
2244 previous = NULL;
2245 *code++ = OP_DOLL;
2246 break;
2247
2248 /* There can never be a first char if '.' is first, whatever happens about
2249 repeats. The value of reqbyte doesn't change either. */
2250
2251 case '.':
2252 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2253 zerofirstbyte = firstbyte;
2254 zeroreqbyte = reqbyte;
2255 previous = code;
2256 *code++ = OP_ANY;
2257 break;
2258
2259 /* Character classes. If the included characters are all < 255 in value, we
2260 build a 32-byte bitmap of the permitted characters, except in the special
2261 case where there is only one such character. For negated classes, we build
2262 the map as usual, then invert it at the end. However, we use a different
2263 opcode so that data characters > 255 can be handled correctly.
2264
2265 If the class contains characters outside the 0-255 range, a different
2266 opcode is compiled. It may optionally have a bit map for characters < 256,
2267 but those above are explicitly listed afterwards. A flag byte tells
2268 whether the bitmap is present, and whether this is a negated class or not.
2269 */
2270
2271 case '[':
2272 previous = code;
2273
2274 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2275 they are encountered at the top level, so we'll do that too. */
2276
2277 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2278 check_posix_syntax(ptr, &tempptr, cd))
2279 {
2280 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2281 goto FAILED;
2282 }
2283
2284 /* If the first character is '^', set the negation flag and skip it. */
2285
2286 if ((c = *(++ptr)) == '^')
2287 {
2288 negate_class = TRUE;
2289 c = *(++ptr);
2290 }
2291 else
2292 {
2293 negate_class = FALSE;
2294 }
2295
2296 /* Keep a count of chars with values < 256 so that we can optimize the case
2297 of just a single character (as long as it's < 256). For higher valued UTF-8
2298 characters, we don't yet do any optimization. */
2299
2300 class_charcount = 0;
2301 class_lastchar = -1;
2302
2303 #ifdef SUPPORT_UTF8
2304 class_utf8 = FALSE; /* No chars >= 256 */
2305 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
2306 #endif
2307
2308 /* Initialize the 32-char bit map to all zeros. We have to build the
2309 map in a temporary bit of store, in case the class contains only 1
2310 character (< 256), because in that case the compiled code doesn't use the
2311 bit map. */
2312
2313 memset(classbits, 0, 32 * sizeof(uschar));
2314
2315 /* Process characters until ] is reached. By writing this as a "do" it
2316 means that an initial ] is taken as a data character. The first pass
2317 through the regex checked the overall syntax, so we don't need to be very
2318 strict here. At the start of the loop, c contains the first byte of the
2319 character. */
2320
2321 do
2322 {
2323 #ifdef SUPPORT_UTF8
2324 if (utf8 && c > 127)
2325 { /* Braces are required because the */
2326 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2327 }
2328 #endif
2329
2330 /* Inside \Q...\E everything is literal except \E */
2331
2332 if (inescq)
2333 {
2334 if (c == '\\' && ptr[1] == 'E')
2335 {
2336 inescq = FALSE;
2337 ptr++;
2338 continue;
2339 }
2340 else goto LONE_SINGLE_CHARACTER;
2341 }
2342
2343 /* Handle POSIX class names. Perl allows a negation extension of the
2344 form [:^name:]. A square bracket that doesn't match the syntax is
2345 treated as a literal. We also recognize the POSIX constructions
2346 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2347 5.6 and 5.8 do. */
2348
2349 if (c == '[' &&
2350 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2351 check_posix_syntax(ptr, &tempptr, cd))
2352 {
2353 BOOL local_negate = FALSE;
2354 int posix_class, i;
2355 register const uschar *cbits = cd->cbits;
2356
2357 if (ptr[1] != ':')
2358 {
2359 *errorptr = ERR31;
2360 goto FAILED;
2361 }
2362
2363 ptr += 2;
2364 if (*ptr == '^')
2365 {
2366 local_negate = TRUE;
2367 ptr++;
2368 }
2369
2370 posix_class = check_posix_name(ptr, tempptr - ptr);
2371 if (posix_class < 0)
2372 {
2373 *errorptr = ERR30;
2374 goto FAILED;
2375 }
2376
2377 /* If matching is caseless, upper and lower are converted to
2378 alpha. This relies on the fact that the class table starts with
2379 alpha, lower, upper as the first 3 entries. */
2380
2381 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2382 posix_class = 0;
2383
2384 /* Or into the map we are building up to 3 of the static class
2385 tables, or their negations. The [:blank:] class sets up the same
2386 chars as the [:space:] class (all white space). We remove the vertical
2387 white space chars afterwards. */
2388
2389 posix_class *= 3;
2390 for (i = 0; i < 3; i++)
2391 {
2392 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2393 int taboffset = posix_class_maps[posix_class + i];
2394 if (taboffset < 0) break;
2395 if (local_negate)
2396 {
2397 if (i == 0)
2398 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2399 else
2400 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2401 if (blankclass) classbits[1] |= 0x3c;
2402 }
2403 else
2404 {
2405 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2406 if (blankclass) classbits[1] &= ~0x3c;
2407 }
2408 }
2409
2410 ptr = tempptr + 1;
2411 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2412 continue; /* End of POSIX syntax handling */
2413 }
2414
2415 /* Backslash may introduce a single character, or it may introduce one
2416 of the specials, which just set a flag. Escaped items are checked for
2417 validity in the pre-compiling pass. The sequence \b is a special case.
2418 Inside a class (and only there) it is treated as backspace. Elsewhere
2419 it marks a word boundary. Other escapes have preset maps ready to
2420 or into the one we are building. We assume they have more than one
2421 character in them, so set class_charcount bigger than one. */
2422
2423 if (c == '\\')
2424 {
2425 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2426
2427 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2428 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2429 else if (-c == ESC_Q) /* Handle start of quoted string */
2430 {
2431 if (ptr[1] == '\\' && ptr[2] == 'E')
2432 {
2433 ptr += 2; /* avoid empty string */
2434 }
2435 else inescq = TRUE;
2436 continue;
2437 }
2438
2439 if (c < 0)
2440 {
2441 register const uschar *cbits = cd->cbits;
2442 class_charcount += 2; /* Greater than 1 is what matters */
2443 switch (-c)
2444 {
2445 case ESC_d:
2446 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2447 continue;
2448
2449 case ESC_D:
2450 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2451 continue;
2452
2453 case ESC_w:
2454 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2455 continue;
2456
2457 case ESC_W:
2458 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2459 continue;
2460
2461 case ESC_s:
2462 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2463 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2464 continue;
2465
2466 case ESC_S:
2467 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2468 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2469 continue;
2470
2471 #ifdef SUPPORT_UCP
2472 case ESC_p:
2473 case ESC_P:
2474 {
2475 BOOL negated;
2476 int property = get_ucp(&ptr, &negated, errorptr);
2477 if (property < 0) goto FAILED;
2478 class_utf8 = TRUE;
2479 *class_utf8data++ = ((-c == ESC_p) != negated)?
2480 XCL_PROP : XCL_NOTPROP;
2481 *class_utf8data++ = property;
2482 class_charcount -= 2; /* Not a < 256 character */
2483 }
2484 continue;
2485 #endif
2486
2487 /* Unrecognized escapes are faulted if PCRE is running in its
2488 strict mode. By default, for compatibility with Perl, they are
2489 treated as literals. */
2490
2491 default:
2492 if ((options & PCRE_EXTRA) != 0)
2493 {
2494 *errorptr = ERR7;
2495 goto FAILED;
2496 }
2497 c = *ptr; /* The final character */
2498 class_charcount -= 2; /* Undo the default count from above */
2499 }
2500 }
2501
2502 /* Fall through if we have a single character (c >= 0). This may be
2503 > 256 in UTF-8 mode. */
2504
2505 } /* End of backslash handling */
2506
2507 /* A single character may be followed by '-' to form a range. However,
2508 Perl does not permit ']' to be the end of the range. A '-' character
2509 here is treated as a literal. */
2510
2511 if (ptr[1] == '-' && ptr[2] != ']')
2512 {
2513 int d;
2514 ptr += 2;
2515
2516 #ifdef SUPPORT_UTF8
2517 if (utf8)
2518 { /* Braces are required because the */
2519 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2520 }
2521 else
2522 #endif
2523 d = *ptr; /* Not UTF-8 mode */
2524
2525 /* The second part of a range can be a single-character escape, but
2526 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2527 in such circumstances. */
2528
2529 if (d == '\\')
2530 {
2531 const uschar *oldptr = ptr;
2532 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2533
2534 /* \b is backspace; \X is literal X; any other special means the '-'
2535 was literal */
2536
2537 if (d < 0)
2538 {
2539 if (d == -ESC_b) d = '\b';
2540 else if (d == -ESC_X) d = 'X'; else
2541 {
2542 ptr = oldptr - 2;
2543 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2544 }
2545 }
2546 }
2547
2548 /* The check that the two values are in the correct order happens in
2549 the pre-pass. Optimize one-character ranges */
2550
2551 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2552
2553 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2554 matching, we have to use an XCLASS with extra data items. Caseless
2555 matching for characters > 127 is available only if UCP support is
2556 available. */
2557
2558 #ifdef SUPPORT_UTF8
2559 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2560 {
2561 class_utf8 = TRUE;
2562
2563 /* With UCP support, we can find the other case equivalents of
2564 the relevant characters. There may be several ranges. Optimize how
2565 they fit with the basic range. */
2566
2567 #ifdef SUPPORT_UCP
2568 if ((options & PCRE_CASELESS) != 0)
2569 {
2570 int occ, ocd;
2571 int cc = c;
2572 int origd = d;
2573 while (get_othercase_range(&cc, origd, &occ, &ocd))
2574 {
2575 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2576
2577 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2578 { /* if there is overlap, */
2579 c = occ; /* noting that if occ < c */
2580 continue; /* we can't have ocd > d */
2581 } /* because a subrange is */
2582 if (ocd > d && occ <= d + 1) /* always shorter than */
2583 { /* the basic range. */
2584 d = ocd;
2585 continue;
2586 }
2587
2588 if (occ == ocd)
2589 {
2590 *class_utf8data++ = XCL_SINGLE;
2591 }
2592 else
2593 {
2594 *class_utf8data++ = XCL_RANGE;
2595 class_utf8data += ord2utf8(occ, class_utf8data);
2596 }
2597 class_utf8data += ord2utf8(ocd, class_utf8data);
2598 }
2599 }
2600 #endif /* SUPPORT_UCP */
2601
2602 /* Now record the original range, possibly modified for UCP caseless
2603 overlapping ranges. */
2604
2605 *class_utf8data++ = XCL_RANGE;
2606 class_utf8data += ord2utf8(c, class_utf8data);
2607 class_utf8data += ord2utf8(d, class_utf8data);
2608
2609 /* With UCP support, we are done. Without UCP support, there is no
2610 caseless matching for UTF-8 characters > 127; we can use the bit map
2611 for the smaller ones. */
2612
2613 #ifdef SUPPORT_UCP
2614 continue; /* With next character in the class */
2615 #else
2616 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2617
2618 /* Adjust upper limit and fall through to set up the map */
2619
2620 d = 127;
2621
2622 #endif /* SUPPORT_UCP */
2623 }
2624 #endif /* SUPPORT_UTF8 */
2625
2626 /* We use the bit map for all cases when not in UTF-8 mode; else
2627 ranges that lie entirely within 0-127 when there is UCP support; else
2628 for partial ranges without UCP support. */
2629
2630 for (; c <= d; c++)
2631 {
2632 classbits[c/8] |= (1 << (c&7));
2633 if ((options & PCRE_CASELESS) != 0)
2634 {
2635 int uc = cd->fcc[c]; /* flip case */
2636 classbits[uc/8] |= (1 << (uc&7));
2637 }
2638 class_charcount++; /* in case a one-char range */
2639 class_lastchar = c;
2640 }
2641
2642 continue; /* Go get the next char in the class */
2643 }
2644
2645 /* Handle a lone single character - we can get here for a normal
2646 non-escape char, or after \ that introduces a single character or for an
2647 apparent range that isn't. */
2648
2649 LONE_SINGLE_CHARACTER:
2650
2651 /* Handle a character that cannot go in the bit map */
2652
2653 #ifdef SUPPORT_UTF8
2654 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2655 {
2656 class_utf8 = TRUE;
2657 *class_utf8data++ = XCL_SINGLE;
2658 class_utf8data += ord2utf8(c, class_utf8data);
2659
2660 #ifdef SUPPORT_UCP
2661 if ((options & PCRE_CASELESS) != 0)
2662 {
2663 int chartype;
2664 int othercase;
2665 if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2666 {
2667 *class_utf8data++ = XCL_SINGLE;
2668 class_utf8data += ord2utf8(othercase, class_utf8data);
2669 }
2670 }
2671 #endif /* SUPPORT_UCP */
2672
2673 }
2674 else
2675 #endif /* SUPPORT_UTF8 */
2676
2677 /* Handle a single-byte character */
2678 {
2679 classbits[c/8] |= (1 << (c&7));
2680 if ((options & PCRE_CASELESS) != 0)
2681 {
2682 c = cd->fcc[c]; /* flip case */
2683 classbits[c/8] |= (1 << (c&7));
2684 }
2685 class_charcount++;
2686 class_lastchar = c;
2687 }
2688 }
2689
2690 /* Loop until ']' reached; the check for end of string happens inside the
2691 loop. This "while" is the end of the "do" above. */
2692
2693 while ((c = *(++ptr)) != ']' || inescq);
2694
2695 /* If class_charcount is 1, we saw precisely one character whose value is
2696 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2697 can optimize the negative case only if there were no characters >= 128
2698 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2699 single-bytes only. This is an historical hangover. Maybe one day we can
2700 tidy these opcodes to handle multi-byte characters.
2701
2702 The optimization throws away the bit map. We turn the item into a
2703 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2704 that OP_NOT does not support multibyte characters. In the positive case, it
2705 can cause firstbyte to be set. Otherwise, there can be no first char if
2706 this item is first, whatever repeat count may follow. In the case of
2707 reqbyte, save the previous value for reinstating. */
2708
2709 #ifdef SUPPORT_UTF8
2710 if (class_charcount == 1 &&
2711 (!utf8 ||
2712 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2713
2714 #else
2715 if (class_charcount == 1)
2716 #endif
2717 {
2718 zeroreqbyte = reqbyte;
2719
2720 /* The OP_NOT opcode works on one-byte characters only. */
2721
2722 if (negate_class)
2723 {
2724 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2725 zerofirstbyte = firstbyte;
2726 *code++ = OP_NOT;
2727 *code++ = class_lastchar;
2728 break;
2729 }
2730
2731 /* For a single, positive character, get the value into mcbuffer, and
2732 then we can handle this with the normal one-character code. */
2733
2734 #ifdef SUPPORT_UTF8
2735 if (utf8 && class_lastchar > 127)
2736 mclength = ord2utf8(class_lastchar, mcbuffer);
2737 else
2738 #endif
2739 {
2740 mcbuffer[0] = class_lastchar;
2741 mclength = 1;
2742 }
2743 goto ONE_CHAR;
2744 } /* End of 1-char optimization */
2745
2746 /* The general case - not the one-char optimization. If this is the first
2747 thing in the branch, there can be no first char setting, whatever the
2748 repeat count. Any reqbyte setting must remain unchanged after any kind of
2749 repeat. */
2750
2751 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2752 zerofirstbyte = firstbyte;
2753 zeroreqbyte = reqbyte;
2754
2755 /* If there are characters with values > 255, we have to compile an
2756 extended class, with its own opcode. If there are no characters < 256,
2757 we can omit the bitmap. */
2758
2759 #ifdef SUPPORT_UTF8
2760 if (class_utf8)
2761 {
2762 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2763 *code++ = OP_XCLASS;
2764 code += LINK_SIZE;
2765 *code = negate_class? XCL_NOT : 0;
2766
2767 /* If the map is required, install it, and move on to the end of
2768 the extra data */
2769
2770 if (class_charcount > 0)
2771 {
2772 *code++ |= XCL_MAP;
2773 memcpy(code, classbits, 32);
2774 code = class_utf8data;
2775 }
2776
2777 /* If the map is not required, slide down the extra data. */
2778
2779 else
2780 {
2781 int len = class_utf8data - (code + 33);
2782 memmove(code + 1, code + 33, len);
2783 code += len + 1;
2784 }
2785
2786 /* Now fill in the complete length of the item */
2787
2788 PUT(previous, 1, code - previous);
2789 break; /* End of class handling */
2790 }
2791 #endif
2792
2793 /* If there are no characters > 255, negate the 32-byte map if necessary,
2794 and copy it into the code vector. If this is the first thing in the branch,
2795 there can be no first char setting, whatever the repeat count. Any reqbyte
2796 setting must remain unchanged after any kind of repeat. */
2797
2798 if (negate_class)
2799 {
2800 *code++ = OP_NCLASS;
2801 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2802 }
2803 else
2804 {
2805 *code++ = OP_CLASS;
2806 memcpy(code, classbits, 32);
2807 }
2808 code += 32;
2809 break;
2810
2811 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2812 has been tested above. */
2813
2814 case '{':
2815 if (!is_quantifier) goto NORMAL_CHAR;
2816 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2817 if (*errorptr != NULL) goto FAILED;
2818 goto REPEAT;
2819
2820 case '*':
2821 repeat_min = 0;
2822 repeat_max = -1;
2823 goto REPEAT;
2824
2825 case '+':
2826 repeat_min = 1;
2827 repeat_max = -1;
2828 goto REPEAT;
2829
2830 case '?':
2831 repeat_min = 0;
2832 repeat_max = 1;
2833
2834 REPEAT:
2835 if (previous == NULL)
2836 {
2837 *errorptr = ERR9;
2838 goto FAILED;
2839 }
2840
2841 if (repeat_min == 0)
2842 {
2843 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2844 reqbyte = zeroreqbyte; /* Ditto */
2845 }
2846
2847 /* Remember whether this is a variable length repeat */
2848
2849 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2850
2851 op_type = 0; /* Default single-char op codes */
2852 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2853
2854 /* Save start of previous item, in case we have to move it up to make space
2855 for an inserted OP_ONCE for the additional '+' extension. */
2856
2857 tempcode = previous;
2858
2859 /* If the next character is '+', we have a possessive quantifier. This
2860 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2861 If the next character is '?' this is a minimizing repeat, by default,
2862 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2863 repeat type to the non-default. */
2864
2865 if (ptr[1] == '+')
2866 {
2867 repeat_type = 0; /* Force greedy */
2868 possessive_quantifier = TRUE;
2869 ptr++;
2870 }
2871 else if (ptr[1] == '?')
2872 {
2873 repeat_type = greedy_non_default;
2874 ptr++;
2875 }
2876 else repeat_type = greedy_default;
2877
2878 /* If previous was a recursion, we need to wrap it inside brackets so that
2879 it can be replicated if necessary. */
2880
2881 if (*previous == OP_RECURSE)
2882 {
2883 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2884 code += 1 + LINK_SIZE;
2885 *previous = OP_BRA;
2886 PUT(previous, 1, code - previous);
2887 *code = OP_KET;
2888 PUT(code, 1, code - previous);
2889 code += 1 + LINK_SIZE;
2890 }
2891
2892 /* If previous was a character match, abolish the item and generate a
2893 repeat item instead. If a char item has a minimum of more than one, ensure
2894 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2895 the first thing in a branch because the x will have gone into firstbyte
2896 instead. */
2897
2898 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2899 {
2900 /* Deal with UTF-8 characters that take up more than one byte. It's
2901 easier to write this out separately than try to macrify it. Use c to
2902 hold the length of the character in bytes, plus 0x80 to flag that it's a
2903 length rather than a small character. */
2904
2905 #ifdef SUPPORT_UTF8
2906 if (utf8 && (code[-1] & 0x80) != 0)
2907 {
2908 uschar *lastchar = code - 1;
2909 while((*lastchar & 0xc0) == 0x80) lastchar--;
2910 c = code - lastchar; /* Length of UTF-8 character */
2911 memcpy(utf8_char, lastchar, c); /* Save the char */
2912 c |= 0x80; /* Flag c as a length */
2913 }
2914 else
2915 #endif
2916
2917 /* Handle the case of a single byte - either with no UTF8 support, or
2918 with UTF-8 disabled, or for a UTF-8 character < 128. */
2919
2920 {
2921 c = code[-1];
2922 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2923 }
2924
2925 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2926 }
2927
2928 /* If previous was a single negated character ([^a] or similar), we use
2929 one of the special opcodes, replacing it. The code is shared with single-
2930 character repeats by setting op_type to add a suitable offset into
2931 repeat_type. OP_NOT is currently used only for single-byte chars. */
2932
2933 else if (*previous == OP_NOT)
2934 {
2935 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2936 c = previous[1];
2937 goto OUTPUT_SINGLE_REPEAT;
2938 }
2939
2940 /* If previous was a character type match (\d or similar), abolish it and
2941 create a suitable repeat item. The code is shared with single-character
2942 repeats by setting op_type to add a suitable offset into repeat_type. Note
2943 that the Unicode property types will be present only when SUPPORT_UCP is
2944 defined, but we don't wrap the little bits of code here because it just
2945 makes it horribly messy. */
2946
2947 else if (*previous < OP_EODN)
2948 {
2949 uschar *oldcode;
2950 int prop_type;
2951 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2952 c = *previous;
2953
2954 OUTPUT_SINGLE_REPEAT:
2955 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2956 previous[1] : -1;
2957
2958 oldcode = code;
2959 code = previous; /* Usually overwrite previous item */
2960
2961 /* If the maximum is zero then the minimum must also be zero; Perl allows
2962 this case, so we do too - by simply omitting the item altogether. */
2963
2964 if (repeat_max == 0) goto END_REPEAT;
2965
2966 /* All real repeats make it impossible to handle partial matching (maybe
2967 one day we will be able to remove this restriction). */
2968
2969 if (repeat_max != 1) cd->nopartial = TRUE;
2970
2971 /* Combine the op_type with the repeat_type */
2972
2973 repeat_type += op_type;
2974
2975 /* A minimum of zero is handled either as the special case * or ?, or as
2976 an UPTO, with the maximum given. */
2977
2978 if (repeat_min == 0)
2979 {
2980 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2981 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2982 else
2983 {
2984 *code++ = OP_UPTO + repeat_type;
2985 PUT2INC(code, 0, repeat_max);
2986 }
2987 }
2988
2989 /* A repeat minimum of 1 is optimized into some special cases. If the
2990 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2991 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2992 one less than the maximum. */
2993
2994 else if (repeat_min == 1)
2995 {
2996 if (repeat_max == -1)
2997 *code++ = OP_PLUS + repeat_type;
2998 else
2999 {
3000 code = oldcode; /* leave previous item in place */
3001 if (repeat_max == 1) goto END_REPEAT;
3002 *code++ = OP_UPTO + repeat_type;
3003 PUT2INC(code, 0, repeat_max - 1);
3004 }
3005 }
3006
3007 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3008 handled as an EXACT followed by an UPTO. */
3009
3010 else
3011 {
3012 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3013 PUT2INC(code, 0, repeat_min);
3014
3015 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3016 we have to insert the character for the previous code. For a repeated
3017 Unicode property match, there is an extra byte that defines the
3018 required property. In UTF-8 mode, long characters have their length in
3019 c, with the 0x80 bit as a flag. */
3020
3021 if (repeat_max < 0)
3022 {
3023 #ifdef SUPPORT_UTF8
3024 if (utf8 && c >= 128)
3025 {
3026 memcpy(code, utf8_char, c & 7);
3027 code += c & 7;
3028 }
3029 else
3030 #endif
3031 {
3032 *code++ = c;
3033 if (prop_type >= 0) *code++ = prop_type;
3034 }
3035 *code++ = OP_STAR + repeat_type;
3036 }
3037
3038 /* Else insert an UPTO if the max is greater than the min, again
3039 preceded by the character, for the previously inserted code. */
3040
3041 else if (repeat_max != repeat_min)
3042 {
3043 #ifdef SUPPORT_UTF8
3044 if (utf8 && c >= 128)
3045 {
3046 memcpy(code, utf8_char, c & 7);
3047 code += c & 7;
3048 }
3049 else
3050 #endif
3051 *code++ = c;
3052 if (prop_type >= 0) *code++ = prop_type;
3053 repeat_max -= repeat_min;
3054 *code++ = OP_UPTO + repeat_type;
3055 PUT2INC(code, 0, repeat_max);
3056 }
3057 }
3058
3059 /* The character or character type itself comes last in all cases. */
3060
3061 #ifdef SUPPORT_UTF8
3062 if (utf8 && c >= 128)
3063 {
3064 memcpy(code, utf8_char, c & 7);
3065 code += c & 7;
3066 }
3067 else
3068 #endif
3069 *code++ = c;
3070
3071 /* For a repeated Unicode property match, there is an extra byte that
3072 defines the required property. */
3073
3074 #ifdef SUPPORT_UCP
3075 if (prop_type >= 0) *code++ = prop_type;
3076 #endif
3077 }
3078
3079 /* If previous was a character class or a back reference, we put the repeat
3080 stuff after it, but just skip the item if the repeat was {0,0}. */
3081
3082 else if (*previous == OP_CLASS ||
3083 *previous == OP_NCLASS ||
3084 #ifdef SUPPORT_UTF8
3085 *previous == OP_XCLASS ||
3086 #endif
3087 *previous == OP_REF)
3088 {
3089 if (repeat_max == 0)
3090 {
3091 code = previous;
3092 goto END_REPEAT;
3093 }
3094
3095 /* All real repeats make it impossible to handle partial matching (maybe
3096 one day we will be able to remove this restriction). */
3097
3098 if (repeat_max != 1) cd->nopartial = TRUE;
3099
3100 if (repeat_min == 0 && repeat_max == -1)
3101 *code++ = OP_CRSTAR + repeat_type;
3102 else if (repeat_min == 1 && repeat_max == -1)
3103 *code++ = OP_CRPLUS + repeat_type;
3104 else if (repeat_min == 0 && repeat_max == 1)
3105 *code++ = OP_CRQUERY + repeat_type;
3106 else
3107 {
3108 *code++ = OP_CRRANGE + repeat_type;
3109 PUT2INC(code, 0, repeat_min);
3110 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3111 PUT2INC(code, 0, repeat_max);
3112 }
3113 }
3114
3115 /* If previous was a bracket group, we may have to replicate it in certain
3116 cases. */
3117
3118 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3119 *previous == OP_COND)
3120 {
3121 register int i;
3122 int ketoffset = 0;
3123 int len = code - previous;
3124 uschar *bralink = NULL;
3125
3126 /* If the maximum repeat count is unlimited, find the end of the bracket
3127 by scanning through from the start, and compute the offset back to it
3128 from the current code pointer. There may be an OP_OPT setting following
3129 the final KET, so we can't find the end just by going back from the code
3130 pointer. */
3131
3132 if (repeat_max == -1)
3133 {
3134 register uschar *ket = previous;
3135 do ket += GET(ket, 1); while (*ket != OP_KET);
3136 ketoffset = code - ket;
3137 }
3138
3139 /* The case of a zero minimum is special because of the need to stick
3140 OP_BRAZERO in front of it, and because the group appears once in the
3141 data, whereas in other cases it appears the minimum number of times. For
3142 this reason, it is simplest to treat this case separately, as otherwise
3143 the code gets far too messy. There are several special subcases when the
3144 minimum is zero. */
3145
3146 if (repeat_min == 0)
3147 {
3148 /* If the maximum is also zero, we just omit the group from the output
3149 altogether. */
3150
3151 if (repeat_max == 0)
3152 {
3153 code = previous;
3154 goto END_REPEAT;
3155 }
3156
3157 /* If the maximum is 1 or unlimited, we just have to stick in the
3158 BRAZERO and do no more at this point. However, we do need to adjust
3159 any OP_RECURSE calls inside the group that refer to the group itself or
3160 any internal group, because the offset is from the start of the whole
3161 regex. Temporarily terminate the pattern while doing this. */
3162
3163 if (repeat_max <= 1)
3164 {
3165 *code = OP_END;
3166 adjust_recurse(previous, 1, utf8, cd);
3167 memmove(previous+1, previous, len);
3168 code++;
3169 *previous++ = OP_BRAZERO + repeat_type;
3170 }
3171
3172 /* If the maximum is greater than 1 and limited, we have to replicate
3173 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3174 The first one has to be handled carefully because it's the original
3175 copy, which has to be moved up. The remainder can be handled by code
3176 that is common with the non-zero minimum case below. We have to
3177 adjust the value of repeat_max, since one less copy is required. Once
3178 again, we may have to adjust any OP_RECURSE calls inside the group. */
3179
3180 else
3181 {
3182 int offset;
3183 *code = OP_END;
3184 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3185 memmove(previous + 2 + LINK_SIZE, previous, len);
3186 code += 2 + LINK_SIZE;
3187 *previous++ = OP_BRAZERO + repeat_type;
3188 *previous++ = OP_BRA;
3189
3190 /* We chain together the bracket offset fields that have to be
3191 filled in later when the ends of the brackets are reached. */
3192
3193 offset = (bralink == NULL)? 0 : previous - bralink;
3194 bralink = previous;
3195 PUTINC(previous, 0, offset);
3196 }
3197
3198 repeat_max--;
3199 }
3200
3201 /* If the minimum is greater than zero, replicate the group as many
3202 times as necessary, and adjust the maximum to the number of subsequent
3203 copies that we need. If we set a first char from the group, and didn't
3204 set a required char, copy the latter from the former. */
3205
3206 else
3207 {
3208 if (repeat_min > 1)
3209 {
3210 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3211 for (i = 1; i < repeat_min; i++)
3212 {
3213 memcpy(code, previous, len);
3214 code += len;
3215 }
3216 }
3217 if (repeat_max > 0) repeat_max -= repeat_min;
3218 }
3219
3220 /* This code is common to both the zero and non-zero minimum cases. If
3221 the maximum is limited, it replicates the group in a nested fashion,
3222 remembering the bracket starts on a stack. In the case of a zero minimum,
3223 the first one was set up above. In all cases the repeat_max now specifies
3224 the number of additional copies needed. */
3225
3226 if (repeat_max >= 0)
3227 {
3228 for (i = repeat_max - 1; i >= 0; i--)
3229 {
3230 *code++ = OP_BRAZERO + repeat_type;
3231
3232 /* All but the final copy start a new nesting, maintaining the
3233 chain of brackets outstanding. */
3234
3235 if (i != 0)
3236 {
3237 int offset;
3238 *code++ = OP_BRA;
3239 offset = (bralink == NULL)? 0 : code - bralink;
3240 bralink = code;
3241 PUTINC(code, 0, offset);
3242 }
3243
3244 memcpy(code, previous, len);
3245 code += len;
3246 }
3247
3248 /* Now chain through the pending brackets, and fill in their length
3249 fields (which are holding the chain links pro tem). */
3250
3251 while (bralink != NULL)
3252 {
3253 int oldlinkoffset;
3254 int offset = code - bralink + 1;
3255 uschar *bra = code - offset;
3256 oldlinkoffset = GET(bra, 1);
3257 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3258 *code++ = OP_KET;
3259 PUTINC(code, 0, offset);
3260 PUT(bra, 1, offset);
3261 }
3262 }
3263
3264 /* If the maximum is unlimited, set a repeater in the final copy. We
3265 can't just offset backwards from the current code point, because we
3266 don't know if there's been an options resetting after the ket. The
3267 correct offset was computed above. */
3268
3269 else code[-ketoffset] = OP_KETRMAX + repeat_type;
3270 }
3271
3272 /* Else there's some kind of shambles */
3273
3274 else
3275 {
3276 *errorptr = ERR11;
3277 goto FAILED;
3278 }
3279
3280 /* If the character following a repeat is '+', we wrap the entire repeated
3281 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3282 Sun's Java package. The repeated item starts at tempcode, not at previous,
3283 which might be the first part of a string whose (former) last char we
3284 repeated. However, we don't support '+' after a greediness '?'. */
3285
3286 if (possessive_quantifier)
3287 {
3288 int len = code - tempcode;
3289 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3290 code += 1 + LINK_SIZE;
3291 len += 1 + LINK_SIZE;
3292 tempcode[0] = OP_ONCE;
3293 *code++ = OP_KET;
3294 PUTINC(code, 0, len);
3295 PUT(tempcode, 1, len);
3296 }
3297
3298 /* In all cases we no longer have a previous item. We also set the
3299 "follows varying string" flag for subsequently encountered reqbytes if
3300 it isn't already set and we have just passed a varying length item. */
3301
3302 END_REPEAT:
3303 previous = NULL;
3304 cd->req_varyopt |= reqvary;
3305 break;
3306
3307
3308 /* Start of nested bracket sub-expression, or comment or lookahead or
3309 lookbehind or option setting or condition. First deal with special things
3310 that can come after a bracket; all are introduced by ?, and the appearance
3311 of any of them means that this is not a referencing group. They were
3312 checked for validity in the first pass over the string, so we don't have to
3313 check for syntax errors here. */
3314
3315 case '(':
3316 newoptions = options;
3317 skipbytes = 0;
3318
3319 if (*(++ptr) == '?')
3320 {
3321 int set, unset;
3322 int *optset;
3323
3324 switch (*(++ptr))
3325 {
3326 case '#': /* Comment; skip to ket */
3327 ptr++;
3328 while (*ptr != ')') ptr++;
3329 continue;
3330
3331 case ':': /* Non-extracting bracket */
3332 bravalue = OP_BRA;
3333 ptr++;
3334 break;
3335
3336 case '(':
3337 bravalue = OP_COND; /* Conditional group */
3338
3339 /* Condition to test for recursion */
3340
3341 if (ptr[1] == 'R')
3342 {
3343 code[1+LINK_SIZE] = OP_CREF;
3344 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3345 skipbytes = 3;
3346 ptr += 3;
3347 }
3348
3349 /* Condition to test for a numbered subpattern match. We know that
3350 if a digit follows ( then there will just be digits until ) because
3351 the syntax was checked in the first pass. */
3352
3353 else if ((digitab[ptr[1]] && ctype_digit) != 0)
3354 {
3355 int condref; /* Don't amalgamate; some compilers */
3356 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
3357 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3358 if (condref == 0)
3359 {
3360 *errorptr = ERR35;
3361 goto FAILED;
3362 }
3363 ptr++;
3364 code[1+LINK_SIZE] = OP_CREF;
3365 PUT2(code, 2+LINK_SIZE, condref);
3366 skipbytes = 3;
3367 }
3368 /* For conditions that are assertions, we just fall through, having
3369 set bravalue above. */
3370 break;
3371
3372 case '=': /* Positive lookahead */
3373 bravalue = OP_ASSERT;
3374 ptr++;
3375 break;
3376
3377 case '!': /* Negative lookahead */
3378 bravalue = OP_ASSERT_NOT;
3379 ptr++;
3380 break;
3381
3382 case '<': /* Lookbehinds */
3383 switch (*(++ptr))
3384 {
3385 case '=': /* Positive lookbehind */
3386 bravalue = OP_ASSERTBACK;
3387 ptr++;
3388 break;
3389
3390 case '!': /* Negative lookbehind */
3391 bravalue = OP_ASSERTBACK_NOT;
3392 ptr++;
3393 break;
3394 }
3395 break;
3396
3397 case '>': /* One-time brackets */
3398 bravalue = OP_ONCE;
3399 ptr++;
3400 break;
3401
3402 case 'C': /* Callout - may be followed by digits; */
3403 previous_callout = code; /* Save for later completion */
3404 after_manual_callout = 1; /* Skip one item before completing */
3405 *code++ = OP_CALLOUT; /* Already checked that the terminating */
3406 { /* closing parenthesis is present. */
3407 int n = 0;
3408 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3409 n = n * 10 + *ptr - '0';
3410 if (n > 255)
3411 {
3412 *errorptr = ERR38;
3413 goto FAILED;
3414 }
3415 *code++ = n;
3416 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3417 PUT(code, LINK_SIZE, 0); /* Default length */
3418 code += 2 * LINK_SIZE;
3419 }
3420 previous = NULL;
3421 continue;
3422
3423 case 'P': /* Named subpattern handling */
3424 if (*(++ptr) == '<') /* Definition */
3425 {
3426 int i, namelen;
3427 uschar *slot = cd->name_table;
3428 const uschar *name; /* Don't amalgamate; some compilers */
3429 name = ++ptr; /* grumble at autoincrement in declaration */
3430
3431 while (*ptr++ != '>');
3432 namelen = ptr - name - 1;
3433
3434 for (i = 0; i < cd->names_found; i++)
3435 {
3436 int crc = memcmp(name, slot+2, namelen);
3437 if (crc == 0)
3438 {
3439 if (slot[2+namelen] == 0)
3440 {
3441 *errorptr = ERR43;
3442 goto FAILED;
3443 }
3444 crc = -1; /* Current name is substring */
3445 }
3446 if (crc < 0)
3447 {
3448 memmove(slot + cd->name_entry_size, slot,
3449 (cd->names_found - i) * cd->name_entry_size);
3450 break;
3451 }
3452 slot += cd->name_entry_size;
3453 }
3454
3455 PUT2(slot, 0, *brackets + 1);
3456 memcpy(slot + 2, name, namelen);
3457 slot[2+namelen] = 0;
3458 cd->names_found++;
3459 goto NUMBERED_GROUP;
3460 }
3461
3462 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3463 {
3464 int i, namelen;
3465 int type = *ptr++;
3466 const uschar *name = ptr;
3467 uschar *slot = cd->name_table;
3468
3469 while (*ptr != ')') ptr++;
3470 namelen = ptr - name;
3471
3472 for (i = 0; i < cd->names_found; i++)
3473 {
3474 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3475 slot += cd->name_entry_size;
3476 }
3477 if (i >= cd->names_found)
3478 {
3479 *errorptr = ERR15;
3480 goto FAILED;
3481 }
3482
3483 recno = GET2(slot, 0);
3484
3485 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3486
3487 /* Back reference */
3488
3489 previous = code;
3490 *code++ = OP_REF;
3491 PUT2INC(code, 0, recno);
3492 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3493 if (recno > cd->top_backref) cd->top_backref = recno;
3494 continue;
3495 }
3496
3497 /* Should never happen */
3498 break;
3499
3500 case 'R': /* Pattern recursion */
3501 ptr++; /* Same as (?0) */
3502 /* Fall through */
3503
3504 /* Recursion or "subroutine" call */
3505
3506 case '0': case '1': case '2': case '3': case '4':
3507 case '5': case '6': case '7': case '8': case '9':
3508 {
3509 const uschar *called;
3510 recno = 0;
3511 while((digitab[*ptr] & ctype_digit) != 0)
3512 recno = recno * 10 + *ptr++ - '0';
3513
3514 /* Come here from code above that handles a named recursion */
3515
3516 HANDLE_RECURSION:
3517
3518 previous = code;
3519
3520 /* Find the bracket that is being referenced. Temporarily end the
3521 regex in case it doesn't exist. */
3522
3523 *code = OP_END;
3524 called = (recno == 0)?
3525 cd->start_code : find_bracket(cd->start_code, utf8, recno);
3526
3527 if (called == NULL)
3528 {
3529 *errorptr = ERR15;
3530 goto FAILED;
3531 }
3532
3533 /* If the subpattern is still open, this is a recursive call. We
3534 check to see if this is a left recursion that could loop for ever,
3535 and diagnose that case. */
3536
3537 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3538 {
3539 *errorptr = ERR40;
3540 goto FAILED;
3541 }
3542
3543 /* Insert the recursion/subroutine item */
3544
3545 *code = OP_RECURSE;
3546 PUT(code, 1, called - cd->start_code);
3547 code += 1 + LINK_SIZE;
3548 }
3549 continue;
3550
3551 /* Character after (? not specially recognized */
3552
3553 default: /* Option setting */
3554 set = unset = 0;
3555 optset = &set;
3556
3557 while (*ptr != ')' && *ptr != ':')
3558 {
3559 switch (*ptr++)
3560 {
3561 case '-': optset = &unset; break;
3562
3563 case 'i': *optset |= PCRE_CASELESS; break;
3564 case 'm': *optset |= PCRE_MULTILINE; break;
3565 case 's': *optset |= PCRE_DOTALL; break;
3566 case 'x': *optset |= PCRE_EXTENDED; break;
3567 case 'U': *optset |= PCRE_UNGREEDY; break;
3568 case 'X': *optset |= PCRE_EXTRA; break;
3569 }
3570 }
3571
3572 /* Set up the changed option bits, but don't change anything yet. */
3573
3574 newoptions = (options | set) & (~unset);
3575
3576 /* If the options ended with ')' this is not the start of a nested
3577 group with option changes, so the options change at this level. Compile
3578 code to change the ims options if this setting actually changes any of
3579 them. We also pass the new setting back so that it can be put at the
3580 start of any following branches, and when this group ends (if we are in
3581 a group), a resetting item can be compiled.
3582
3583 Note that if this item is right at the start of the pattern, the
3584 options will have been abstracted and made global, so there will be no
3585 change to compile. */
3586
3587 if (*ptr == ')')
3588 {
3589 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3590 {
3591 *code++ = OP_OPT;
3592 *code++ = newoptions & PCRE_IMS;
3593 }
3594
3595 /* Change options at this level, and pass them back for use
3596 in subsequent branches. Reset the greedy defaults and the case
3597 value for firstbyte and reqbyte. */
3598
3599 *optionsptr = options = newoptions;
3600 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3601 greedy_non_default = greedy_default ^ 1;
3602 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3603
3604 previous = NULL; /* This item can't be repeated */
3605 continue; /* It is complete */
3606 }
3607
3608 /* If the options ended with ':' we are heading into a nested group
3609 with possible change of options. Such groups are non-capturing and are
3610 not assertions of any kind. All we need to do is skip over the ':';
3611 the newoptions value is handled below. */
3612
3613 bravalue = OP_BRA;
3614 ptr++;
3615 }
3616 }
3617
3618 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3619 non-capturing and behave like (?:...) brackets */
3620
3621 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3622 {
3623 bravalue = OP_BRA;
3624 }
3625
3626 /* Else we have a referencing group; adjust the opcode. If the bracket
3627 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3628 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3629
3630 else
3631 {
3632 NUMBERED_GROUP:
3633 if (++(*brackets) > EXTRACT_BASIC_MAX)
3634 {
3635 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3636 code[1+LINK_SIZE] = OP_BRANUMBER;
3637 PUT2(code, 2+LINK_SIZE, *brackets);
3638 skipbytes = 3;
3639 }
3640 else bravalue = OP_BRA + *brackets;
3641 }
3642
3643 /* Process nested bracketed re. Assertions may not be repeated, but other
3644 kinds can be. We copy code into a non-register variable in order to be able
3645 to pass its address because some compilers complain otherwise. Pass in a
3646 new setting for the ims options if they have changed. */
3647
3648 previous = (bravalue >= OP_ONCE)? code : NULL;
3649 *code = bravalue;
3650 tempcode = code;
3651 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3652
3653 if (!compile_regex(
3654 newoptions, /* The complete new option state */
3655 options & PCRE_IMS, /* The previous ims option state */
3656 brackets, /* Extracting bracket count */
3657 &tempcode, /* Where to put code (updated) */
3658 &ptr, /* Input pointer (updated) */
3659 errorptr, /* Where to put an error message */
3660 (bravalue == OP_ASSERTBACK ||
3661 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3662 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3663 &subfirstbyte, /* For possible first char */
3664 &subreqbyte, /* For possible last char */
3665 bcptr, /* Current branch chain */
3666 cd)) /* Tables block */
3667 goto FAILED;
3668
3669 /* At the end of compiling, code is still pointing to the start of the
3670 group, while tempcode has been updated to point past the end of the group
3671 and any option resetting that may follow it. The pattern pointer (ptr)
3672 is on the bracket. */
3673
3674 /* If this is a conditional bracket, check that there are no more than
3675 two branches in the group. */
3676
3677 else if (bravalue == OP_COND)
3678 {
3679 uschar *tc = code;
3680 condcount = 0;
3681
3682 do {
3683 condcount++;
3684 tc += GET(tc,1);
3685 }
3686 while (*tc != OP_KET);
3687
3688 if (condcount > 2)
3689 {
3690 *errorptr = ERR27;
3691 goto FAILED;
3692 }
3693
3694 /* If there is just one branch, we must not make use of its firstbyte or
3695 reqbyte, because this is equivalent to an empty second branch. */
3696
3697 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3698 }
3699
3700 /* Handle updating of the required and first characters. Update for normal
3701 brackets of all kinds, and conditions with two branches (see code above).
3702 If the bracket is followed by a quantifier with zero repeat, we have to
3703 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3704 main loop so that they can be accessed for the back off. */
3705
3706 zeroreqbyte = reqbyte;
3707 zerofirstbyte = firstbyte;
3708 groupsetfirstbyte = FALSE;
3709
3710 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3711 {
3712 /* If we have not yet set a firstbyte in this branch, take it from the
3713 subpattern, remembering that it was set here so that a repeat of more
3714 than one can replicate it as reqbyte if necessary. If the subpattern has
3715 no firstbyte, set "none" for the whole branch. In both cases, a zero
3716 repeat forces firstbyte to "none". */
3717
3718 if (firstbyte == REQ_UNSET)
3719 {
3720 if (subfirstbyte >= 0)
3721 {
3722 firstbyte = subfirstbyte;
3723 groupsetfirstbyte = TRUE;
3724 }
3725 else firstbyte = REQ_NONE;
3726 zerofirstbyte = REQ_NONE;
3727 }
3728
3729 /* If firstbyte was previously set, convert the subpattern's firstbyte
3730 into reqbyte if there wasn't one, using the vary flag that was in
3731 existence beforehand. */
3732
3733 else if (subfirstbyte >= 0 && subreqbyte < 0)
3734 subreqbyte = subfirstbyte | tempreqvary;
3735
3736 /* If the subpattern set a required byte (or set a first byte that isn't
3737 really the first byte - see above), set it. */
3738
3739 if (subreqbyte >= 0) reqbyte = subreqbyte;
3740 }
3741
3742 /* For a forward assertion, we take the reqbyte, if set. This can be
3743 helpful if the pattern that follows the assertion doesn't set a different
3744 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3745 for an assertion, however, because it leads to incorrect effects for patterns
3746 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3747 of a firstbyte. This is overcome by a scan at the end if there's no
3748 firstbyte, looking for an asserted first char. */
3749
3750 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3751
3752 /* Now update the main code pointer to the end of the group. */
3753
3754 code = tempcode;
3755
3756 /* Error if hit end of pattern */
3757
3758 if (*ptr != ')')
3759 {
3760 *errorptr = ERR14;
3761 goto FAILED;
3762 }
3763 break;
3764
3765 /* Check \ for being a real metacharacter; if not, fall through and handle
3766 it as a data character at the start of a string. Escape items are checked
3767 for validity in the pre-compiling pass. */
3768
3769 case '\\':
3770 tempptr = ptr;
3771 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3772
3773 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3774 are arranged to be the negation of the corresponding OP_values. For the
3775 back references, the values are ESC_REF plus the reference number. Only
3776 back references and those types that consume a character may be repeated.
3777 We can test for values between ESC_b and ESC_Z for the latter; this may
3778 have to change if any new ones are ever created. */
3779
3780 if (c < 0)
3781 {
3782 if (-c == ESC_Q) /* Handle start of quoted string */
3783 {
3784 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3785 else inescq = TRUE;
3786 continue;
3787 }
3788
3789 /* For metasequences that actually match a character, we disable the
3790 setting of a first character if it hasn't already been set. */
3791
3792 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3793 firstbyte = REQ_NONE;
3794
3795 /* Set values to reset to if this is followed by a zero repeat. */
3796
3797 zerofirstbyte = firstbyte;
3798 zeroreqbyte = reqbyte;
3799
3800 /* Back references are handled specially */
3801
3802 if (-c >= ESC_REF)
3803 {
3804 int number = -c - ESC_REF;
3805 previous = code;
3806 *code++ = OP_REF;
3807 PUT2INC(code, 0, number);
3808 }
3809
3810 /* So are Unicode property matches, if supported. We know that get_ucp
3811 won't fail because it was tested in the pre-pass. */
3812
3813 #ifdef SUPPORT_UCP
3814 else if (-c == ESC_P || -c == ESC_p)
3815 {
3816 BOOL negated;
3817 int value = get_ucp(&ptr, &negated, errorptr);
3818 previous = code;
3819 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3820 *code++ = value;
3821 }
3822 #endif
3823
3824 /* For the rest, we can obtain the OP value by negating the escape
3825 value */
3826
3827 else
3828 {
3829 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3830 *code++ = -c;
3831 }
3832 continue;
3833 }
3834
3835 /* We have a data character whose value is in c. In UTF-8 mode it may have
3836 a value > 127. We set its representation in the length/buffer, and then
3837 handle it as a data character. */
3838
3839 #ifdef SUPPORT_UTF8
3840 if (utf8 && c > 127)
3841 mclength = ord2utf8(c, mcbuffer);
3842 else
3843 #endif
3844
3845 {
3846 mcbuffer[0] = c;
3847 mclength = 1;
3848 }
3849
3850 goto ONE_CHAR;
3851
3852 /* Handle a literal character. It is guaranteed not to be whitespace or #
3853 when the extended flag is set. If we are in UTF-8 mode, it may be a
3854 multi-byte literal character. */
3855
3856 default:
3857 NORMAL_CHAR:
3858 mclength = 1;
3859 mcbuffer[0] = c;
3860
3861 #ifdef SUPPORT_UTF8
3862 if (utf8 && (c & 0xc0) == 0xc0)
3863 {
3864 while ((ptr[1] & 0xc0) == 0x80)
3865 mcbuffer[mclength++] = *(++ptr);
3866 }
3867 #endif
3868
3869 /* At this point we have the character's bytes in mcbuffer, and the length
3870 in mclength. When not in UTF-8 mode, the length is always 1. */
3871
3872 ONE_CHAR:
3873 previous = code;
3874 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3875 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3876
3877 /* Set the first and required bytes appropriately. If no previous first
3878 byte, set it from this character, but revert to none on a zero repeat.
3879 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3880 repeat. */
3881
3882 if (firstbyte == REQ_UNSET)
3883 {
3884 zerofirstbyte = REQ_NONE;
3885 zeroreqbyte = reqbyte;
3886
3887 /* If the character is more than one byte long, we can set firstbyte
3888 only if it is not to be matched caselessly. */
3889
3890 if (mclength == 1 || req_caseopt == 0)
3891 {
3892 firstbyte = mcbuffer[0] | req_caseopt;
3893 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3894 }
3895 else firstbyte = reqbyte = REQ_NONE;
3896 }
3897
3898 /* firstbyte was previously set; we can set reqbyte only if the length is
3899 1 or the matching is caseful. */
3900
3901 else
3902 {
3903 zerofirstbyte = firstbyte;
3904 zeroreqbyte = reqbyte;
3905 if (mclength == 1 || req_caseopt == 0)
3906 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3907 }
3908
3909 break; /* End of literal character handling */
3910 }
3911 } /* end of big loop */
3912
3913 /* Control never reaches here by falling through, only by a goto for all the
3914 error states. Pass back the position in the pattern so that it can be displayed
3915 to the user for diagnosing the error. */
3916
3917 FAILED:
3918 *ptrptr = ptr;
3919 return FALSE;
3920 }
3921
3922
3923
3924
3925 /*************************************************
3926 * Compile sequence of alternatives *
3927 *************************************************/
3928
3929 /* On entry, ptr is pointing past the bracket character. On a successful return
3930 it points to the character that ended the group (the closing bracket or the
end of the pattern; a vertical bar never ends the group, it only starts the
next branch inside the loop below).
3931 The code variable is pointing at the byte into which the BRA operator has been
3932 stored. If the ims options are changed at the start (for a (?ims: group) or
3933 during any branch, we need to insert an OP_OPT item at the start of every
3934 following branch to ensure they get set correctly at run time, and also pass
3935 the new options into every subsequent branch compile.
3936
While the group is open, the link field of the bracket item holds zero and the
link field of each OP_ALT holds the distance back to the start of the previous
branch. When the group ends, that backward chain is rewritten into forward
offsets and the terminating OP_KET is written.

3937 Arguments:
3938 options option bits, including any changes for this subpattern
3939 oldims previous settings of ims option bits
3940 brackets -> int containing the number of extracting brackets used
3941 codeptr -> the address of the current code pointer
3942 ptrptr -> the address of the current pattern pointer
3943 errorptr -> pointer to error message
3944 lookbehind TRUE if this is a lookbehind assertion
3945 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3946 firstbyteptr place to put the first required character, or a negative number
3947 reqbyteptr place to put the last required character, or a negative number
3948 bcptr pointer to the chain of currently open branches
3949 cd points to the data block with tables pointers etc.
3950
3951 Returns: TRUE on success
On success *codeptr, *ptrptr, *firstbyteptr and *reqbyteptr are all updated.
FALSE is returned if compiling a branch fails, or if a branch of a lookbehind
does not have a fixed length; in both cases only *ptrptr is updated (to the
current pattern position) and the code pointer and byte outputs are left
untouched. For the lookbehind failure *errorptr is set here; for a branch
failure it is presumably set by compile_branch(), which is given errorptr.
3952 */
3953
3954 static BOOL
3955 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3956 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3957 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3958 {
3959 const uschar *ptr = *ptrptr;
3960 uschar *code = *codeptr;
/* last_branch is the start of the branch being compiled: the bracket item at
first, then each OP_ALT in turn. start_bracket never moves, so the final ket can
record the distance back to the start of the group. reverse_count is only set
when compiling a lookbehind. */
3961 uschar *last_branch = code;
3962 uschar *start_bracket = code;
3963 uschar *reverse_count = NULL;
3964 int firstbyte, reqbyte;
3965 int branchfirstbyte, branchreqbyte;
3966 branch_chain bc;
3967
/* Link this group onto the chain of open branches; the new entry is what gets
passed down to compile_branch(). */

3968 bc.outer = bcptr;
3969 bc.current = code;
3970
3971 firstbyte = reqbyte = REQ_UNSET;
3972
3973 /* Offset is set zero to mark that this bracket is still open */
3974
3975 PUT(code, 1, 0);
/* Step over the opcode byte, its link field, and any extra bytes that the
caller has already stored after them (see skipbytes above). */
3976 code += 1 + LINK_SIZE + skipbytes;
3977
3978 /* Loop for each alternative branch */
3979
3980 for (;;)
3981 {
3982 /* Handle a change of ims options at the start of the branch */
3983
3984 if ((options & PCRE_IMS) != oldims)
3985 {
3986 *code++ = OP_OPT;
3987 *code++ = options & PCRE_IMS;
3988 }
3989
3990 /* Set up dummy OP_REVERSE if lookbehind assertion. The position of its count
field is remembered so that the real length can be stored once the branch has
been compiled. */
3991
3992 if (lookbehind)
3993 {
3994 *code++ = OP_REVERSE;
3995 reverse_count = code;
3996 PUTINC(code, 0, 0);
3997 }
3998
3999 /* Now compile the branch. The options are passed by address, so a change made
inside one branch is seen by the tests on options in later iterations. */
4000
4001 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4002 &branchfirstbyte, &branchreqbyte, &bc, cd))
4003 {
4004 *ptrptr = ptr;
4005 return FALSE;
4006 }
4007
4008 /* If this is the first branch, the firstbyte and reqbyte values for the
4009 branch become the values for the regex. For the first branch last_branch still
points at the bracket item; it is moved on to an OP_ALT at the bottom of the
loop, which is what this test detects. */
4010
4011 if (*last_branch != OP_ALT)
4012 {
4013 firstbyte = branchfirstbyte;
4014 reqbyte = branchreqbyte;
4015 }
4016
4017 /* If this is not the first branch, the first char and reqbyte have to
4018 match the values from all the previous branches, except that if the previous
4019 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4020 REQ_VARY for the regex. */
4021
4022 else
4023 {
4024 /* If we previously had a firstbyte, but it doesn't match the new branch,
4025 we have to abandon the firstbyte for the regex, but if there was previously
4026 no reqbyte, it takes on the value of the old firstbyte. */
4027
4028 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4029 {
4030 if (reqbyte < 0) reqbyte = firstbyte;
4031 firstbyte = REQ_NONE;
4032 }
4033
4034 /* If we (now or from before) have no firstbyte, a firstbyte from the
4035 branch becomes a reqbyte if there isn't a branch reqbyte. */
4036
4037 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4038 branchreqbyte = branchfirstbyte;
4039
4040 /* Now ensure that the reqbytes match. The comparison ignores the REQ_VARY
bit; when the values agree, the bit is kept if either side had it. */
4041
4042 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4043 reqbyte = REQ_NONE;
4044 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4045 }
4046
4047 /* If lookbehind, check that this branch matches a fixed-length string,
4048 and put the length into the OP_REVERSE item. Temporarily mark the end of
4049 the branch with OP_END. A negative result from find_fixedlength() is an error:
the value -2 is reported as ERR36 and any other negative value as ERR25. */
4050
4051 if (lookbehind)
4052 {
4053 int length;
4054 *code = OP_END;
4055 length = find_fixedlength(last_branch, options);
4056 DPRINTF(("fixed length = %d\n", length));
4057 if (length < 0)
4058 {
4059 *errorptr = (length == -2)? ERR36 : ERR25;
4060 *ptrptr = ptr;
4061 return FALSE;
4062 }
4063 PUT(reverse_count, 0, length);
4064 }
4065
4066 /* Reached end of expression, either ')' or end of pattern. Go back through
4067 the alternative branches and reverse the chain of offsets, with the field in
4068 the BRA item now becoming an offset to the first alternative. If there are
4069 no alternatives, it points to the end of the group. The length in the
4070 terminating ket is always the length of the whole bracketed item. If any of
4071 the ims options were changed inside the group, compile a resetting op-code
4072 following, except at the very end of the pattern. Return leaving the pointer
4073 at the terminating char. */
4074
4075 if (*ptr != '|')
4076 {
/* length starts as the distance from the last branch to the ket position. Each
pass reads the old backward link, overwrites it with the forward distance, and
steps back by the old link. The bracket item holds zero, which ends the loop. */
4077 int length = code - last_branch;
4078 do
4079 {
4080 int prev_length = GET(last_branch, 1);
4081 PUT(last_branch, 1, length);
4082 length = prev_length;
4083 last_branch -= length;
4084 }
4085 while (length > 0);
4086
4087 /* Fill in the ket */
4088
4089 *code = OP_KET;
4090 PUT(code, 1, code - start_bracket);
4091 code += 1 + LINK_SIZE;
4092
4093 /* Resetting option if needed. This is done only when the group was ended by a
closing bracket, not when the end of the pattern was reached. */
4094
4095 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4096 {
4097 *code++ = OP_OPT;
4098 *code++ = oldims;
4099 }
4100
4101 /* Set values to pass back */
4102
4103 *codeptr = code;
4104 *ptrptr = ptr;
4105 *firstbyteptr = firstbyte;
4106 *reqbyteptr = reqbyte;
4107 return TRUE;
4108 }
4109
4110 /* Another branch follows; insert an "or" node. Its length field points back
4111 to the previous branch while the bracket remains open. At the end the chain
4112 is reversed. It's done like this so that the start of the bracket has a
4113 zero offset until it is closed, making it possible to detect recursion. */
4114
4115 *code = OP_ALT;
4116 PUT(code, 1, code - last_branch);
4117 bc.current = last_branch = code;
4118 code += 1 + LINK_SIZE;
/* Move the pattern pointer past the vertical bar. */
4119 ptr++;
4120 }
4121 /* Control never reaches here */
4122 }
4123
4124
4125
4126
4127 /*************************************************
4128 * Check for anchored expression *
4129 *************************************************/
4130
4131 /* Try to find out if this is an anchored regular expression. Consider each
4132 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4133 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4134 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4135 counts, since OP_CIRC can match in the middle.
4136
4137 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4138 This is the code for \G, which means "match at start of match position, taking
4139 into account the match offset".
4140
4141 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4142 because that will try the rest of the pattern at all possible matching points,
4143 so there is no point trying again.... er ....
4144
4145 .... except when the .* appears inside capturing parentheses, and there is a
4146 subsequent back reference to those parentheses. We haven't enough information
4147 to catch that case precisely.
4148
4149 At first, the best we could do was to detect when .* was in capturing brackets
4150 and the highest back reference was greater than or equal to that level.
4151 However, by keeping a bitmap of the first 31 back references, we can catch some
4152 of the more common cases more precisely.
4153
4154 Arguments:
4155 code points to start of expression (the bracket)
4156 options points to the options setting
4157 bracket_map a bitmap of which brackets we are inside while testing; this
4158 handles up to substring 31; after that we just have to take
4159 the less precise approach
4160 backref_map the back reference bitmap
4161
4162 Returns: TRUE or FALSE
4163 */
4164
4165 static BOOL
4166 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4167 unsigned int backref_map)
4168 {
4169 do {
4170 const uschar *scode =
4171 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4172 register int op = *scode;
4173
4174 /* Capturing brackets */
4175
4176 if (op > OP_BRA)
4177 {
4178 int new_map;
4179 op -= OP_BRA;
4180 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4181 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4182 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4183 }
4184
4185 /* Other brackets */
4186
4187 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4188 {
4189 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4190 }
4191
4192 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4193 are or may be referenced. */
4194
4195 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4196 (*options & PCRE_DOTALL) != 0)
4197 {
4198 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4199 }
4200
4201 /* Check for explicit anchoring */
4202
4203 else if (op != OP_SOD && op != OP_SOM &&
4204 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4205 return FALSE;
4206 code += GET(code, 1);
4207 }
4208 while (*code == OP_ALT); /* Loop for each alternative */
4209 return TRUE;
4210 }
4211
4212
4213
4214 /*************************************************
4215 * Check for starting with ^ or .* *
4216 *************************************************/
4217
4218 /* This is called to find out if every branch starts with ^ or .* so that
4219 "first char" processing can be done to speed things up in multiline
4220 matching and for non-DOTALL patterns that start with .* (which must start at
4221 the beginning or after \n). As in the case of is_anchored() (see above), we
4222 have to take account of back references to capturing brackets that contain .*
4223 because in that case we can't make the assumption.
4224
4225 Arguments:
4226 code points to start of expression (the bracket)
4227 bracket_map a bitmap of which brackets we are inside while testing; this
4228 handles up to substring 31; after that we just have to take
4229 the less precise approach
4230 backref_map the back reference bitmap
4231
4232 Returns: TRUE or FALSE
4233 */
4234
4235 static BOOL
4236 is_startline(const uschar *code, unsigned int bracket_map,
4237 unsigned int backref_map)
4238 {
4239 do {
4240 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4241 FALSE);
4242 register int op = *scode;
4243
4244 /* Capturing brackets */
4245
4246 if (op > OP_BRA)
4247 {
4248 int new_map;
4249 op -= OP_BRA;
4250 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4251 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4252 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4253 }
4254
4255 /* Other brackets */
4256
4257 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4258 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4259
4260 /* .* means "start at start or after \n" if it isn't in brackets that
4261 may be referenced. */
4262
4263 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4264 {
4265 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4266 }
4267
4268 /* Check for explicit circumflex */
4269
4270 else if (op != OP_CIRC) return FALSE;
4271
4272 /* Move on to the next alternative */
4273
4274 code += GET(code, 1);
4275 }
4276 while (*code == OP_ALT); /* Loop for each alternative */
4277 return TRUE;
4278 }
4279
4280
4281
4282 /*************************************************
4283 * Check for asserted fixed first char *
4284 *************************************************/
4285
4286 /* During compilation, the "first char" settings from forward assertions are
4287 discarded, because they can cause conflicts with actual literals that follow.
4288 However, if we end up without a first char setting for an unanchored pattern,
4289 it is worth scanning the regex to see if there is an initial asserted first
4290 char. If all branches start with the same asserted char, or with a bracket all
4291 of whose alternatives start with the same asserted char (recurse ad lib), then
4292 we return that char, otherwise -1.
4293
4294 Arguments:
4295 code points to start of expression (the bracket)
4296 options pointer to the options (used to check casing changes)
4297 inassert TRUE if in an assertion
4298
4299 Returns: -1 or the fixed first char
4300 */
4301
4302 static int
4303 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4304 {
4305 register int c = -1;
4306 do {
4307 int d;
4308 const uschar *scode =
4309 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4310 register int op = *scode;
4311
4312 if (op >= OP_BRA) op = OP_BRA;
4313
4314 switch(op)
4315 {
4316 default:
4317 return -1;
4318
4319 case OP_BRA:
4320 case OP_ASSERT:
4321 case OP_ONCE:
4322 case OP_COND:
4323 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4324 return -1;
4325 if (c < 0) c = d; else if (c != d) return -1;
4326 break;
4327
4328 case OP_EXACT: /* Fall through */
4329 scode += 2;
4330
4331 case OP_CHAR:
4332 case OP_CHARNC:
4333 case OP_PLUS:
4334 case OP_MINPLUS:
4335 if (!inassert) return -1;
4336 if (c < 0)
4337 {
4338 c = scode[1];
4339 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4340 }
4341 else if (c != scode[1]) return -1;
4342 break;
4343 }
4344
4345 code += GET(code, 1);
4346 }
4347 while (*code == OP_ALT);
4348 return c;
4349 }
4350
4351
4352
4353
4354 #ifdef SUPPORT_UTF8
4355 /*************************************************
4356 * Validate a UTF-8 string *
4357 *************************************************/
4358
4359 /* This function is called (optionally) at the start of compile or match, to
4360 validate that a supposed UTF-8 string is actually valid. The early check means
4361 that subsequent code can assume it is dealing with a valid string. The check
4362 can be turned off for maximum performance, but then consequences of supplying
4363 an invalid string are then undefined.
4364
4365 Arguments:
4366 string points to the string
4367 length length of string, or -1 if the string is zero-terminated
4368
4369 Returns: < 0 if the string is a valid UTF-8 string
4370 >= 0 otherwise; the value is the offset of the bad byte
4371 */
4372
4373 static int
4374 valid_utf8(const uschar *string, int length)
4375 {
4376 register const uschar *p;
4377
4378 if (length < 0)
4379 {
4380 for (p = string; *p != 0; p++);
4381 length = p - string;
4382 }
4383
4384 for (p = string; length-- > 0; p++)
4385 {
4386 register int ab;
4387 register int c = *p;
4388 if (c < 128) continue;
4389 if ((c & 0xc0) != 0xc0) return p - string;
4390 ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
4391 if (length < ab) return p - string;
4392 length -= ab;
4393
4394 /* Check top bits in the second byte */
4395 if ((*(++p) & 0xc0) != 0x80) return p - string;
4396
4397 /* Check for overlong sequences for each different length */
4398 switch (ab)
4399 {
4400 /* Check for xx00 000x */
4401 case 1:
4402 if ((c & 0x3e) == 0) return p - string;
4403 continue; /* We know there aren't any more bytes to check */
4404
4405 /* Check for 1110 0000, xx0x xxxx */
4406 case 2:
4407 if (c == 0xe0 && (*p & 0x20) == 0) return p - string;
4408 break;
4409
4410 /* Check for 1111 0000, xx00 xxxx */
4411 case 3:
4412 if (c == 0xf0 && (*p & 0x30) == 0) return p - string;
4413 break;
4414
4415 /* Check for 1111 1000, xx00 0xxx */
4416 case 4:
4417 if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
4418 break;
4419
4420 /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
4421 case 5:
4422 if (c == 0xfe || c == 0xff ||
4423 (c == 0xfc && (*p & 0x3c) == 0)) return p - string;
4424 break;
4425 }
4426
4427 /* Check for valid bytes after the 2nd, if any; all must start 10 */
4428 while (--ab > 0)
4429 {
4430 if ((*(++p) & 0xc0) != 0x80) return p - string;
4431 }
4432 }
4433
4434 return -1;
4435 }
4436 #endif
4437
4438
4439
4440 /*************************************************
4441 * Compile a Regular Expression *
4442 *************************************************/
4443
4444 /* This function takes a string and returns a pointer to a block of store
4445 holding a compiled version of the expression.
4446
4447 Arguments:
4448 pattern the regular expression
4449 options various option bits
4450 errorptr pointer to pointer to error text
4451 erroroffset ptr offset in pattern where error was detected
4452 tables pointer to character tables or NULL
4453
4454 Returns: pointer to compiled data block, or NULL on error,
4455 with errorptr and erroroffset set
4456 */
4457
4458 EXPORT pcre *
4459 pcre_compile(const char *pattern, int options, const char **errorptr,
4460 int *erroroffset, const unsigned char *tables)
4461 {
4462 real_pcre *re;
4463 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4464 int runlength;
4465 int c, firstbyte, reqbyte;
4466 int bracount = 0;
4467 int branch_extra = 0;
4468 int branch_newextra;
4469 int item_count = -1;
4470 int name_count = 0;
4471 int max_name_size = 0;
4472 int lastitemlength = 0;
4473 #ifdef SUPPORT_UTF8
4474 BOOL utf8;
4475 BOOL class_utf8;
4476 #endif
4477 BOOL inescq = FALSE;
4478 unsigned int brastackptr = 0;
4479 size_t size;
4480 uschar *code;
4481 const uschar *codestart;
4482 const uschar *ptr;
4483 compile_data compile_block;
4484 int brastack[BRASTACK_SIZE];
4485 uschar bralenstack[BRASTACK_SIZE];
4486
4487 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4488 can do is just return NULL. */
4489
4490 if (errorptr == NULL) return NULL;
4491 *errorptr = NULL;
4492
4493 /* However, we can give a message for this error */
4494
4495 if (erroroffset == NULL)
4496 {
4497 *errorptr = ERR16;
4498 return NULL;
4499 }
4500 *erroroffset = 0;
4501
4502 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4503
4504 #ifdef SUPPORT_UTF8
4505 utf8 = (options & PCRE_UTF8) != 0;
4506 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4507 (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4508 {
4509 *errorptr = ERR44;
4510 return NULL;
4511 }
4512 #else
4513 if ((options & PCRE_UTF8) != 0)
4514 {
4515 *errorptr = ERR32;
4516 return NULL;
4517 }
4518 #endif
4519
4520 if ((options & ~PUBLIC_OPTIONS) != 0)
4521 {
4522 *errorptr = ERR17;
4523 return NULL;
4524 }
4525
4526 /* Set up pointers to the individual character tables */
4527
4528 if (tables == NULL) tables = pcre_default_tables;
4529 compile_block.lcc = tables + lcc_offset;
4530 compile_block.fcc = tables + fcc_offset;
4531 compile_block.cbits = tables + cbits_offset;
4532 compile_block.ctypes = tables + ctypes_offset;
4533
4534 /* Maximum back reference and backref bitmap. This is updated for numeric
4535 references during the first pass, but for named references during the actual
4536 compile pass. The bitmap records up to 31 back references to help in deciding
4537 whether (.*) can be treated as anchored or not. */
4538
4539 compile_block.top_backref = 0;
4540 compile_block.backref_map = 0;
4541
4542 /* Reflect pattern for debugging output */
4543
4544 DPRINTF(("------------------------------------------------------------------\n"));
4545 DPRINTF(("%s\n", pattern));
4546
4547 /* The first thing to do is to make a pass over the pattern to compute the
4548 amount of store required to hold the compiled code. This does not have to be
4549 perfect as long as errors are overestimates. At the same time we can detect any
4550 flag settings right at the start, and extract them. Make an attempt to correct
4551 for any counted white space if an "extended" flag setting appears late in the
4552 pattern. We can't be so clever for #-comments. */
4553
4554 ptr = (const uschar *)(pattern - 1);
4555 while ((c = *(++ptr)) != 0)
4556 {
4557 int min, max;
4558 int class_optcount;
4559 int bracket_length;
4560 int duplength;
4561
4562 /* If we are inside a \Q...\E sequence, all chars are literal */
4563
4564 if (inescq)
4565 {
4566 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4567 goto NORMAL_CHAR;
4568 }
4569
4570 /* Otherwise, first check for ignored whitespace and comments */
4571
4572 if ((options & PCRE_EXTENDED) != 0)
4573 {
4574 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4575 if (c == '#')
4576 {
4577 /* The space before the ; is to avoid a warning on a silly compiler
4578 on the Macintosh. */
4579 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4580 if (c == 0) break;
4581 continue;
4582 }
4583 }
4584
4585 item_count++; /* Is zero for the first non-comment item */
4586
4587 /* Allow space for auto callout before every item except quantifiers. */
4588
4589 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4590 c != '*' && c != '+' && c != '?' &&
4591 (c != '{' || !is_counted_repeat(ptr + 1)))
4592 length += 2 + 2*LINK_SIZE;
4593
4594 switch(c)
4595 {
4596 /* A backslashed item may be an escaped data character or it may be a
4597 character type. */
4598
4599 case '\\':
4600 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4601 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4602
4603 lastitemlength = 1; /* Default length of last item for repeats */
4604
4605 if (c >= 0) /* Data character */
4606 {
4607 length += 2; /* For a one-byte character */
4608
4609 #ifdef SUPPORT_UTF8
4610 if (utf8 && c > 127)
4611 {
4612 int i;
4613 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4614 if (c <= utf8_table1[i]) break;
4615 length += i;
4616 lastitemlength += i;
4617 }
4618 #endif
4619
4620 continue;
4621 }
4622
4623 /* If \Q, enter "literal" mode */
4624
4625 if (-c == ESC_Q)
4626 {
4627 inescq = TRUE;
4628 continue;
4629 }
4630
4631 /* \X is supported only if Unicode property support is compiled */
4632
4633 #ifndef SUPPORT_UCP
4634 if (-c == ESC_X)
4635 {
4636 *errorptr = ERR45;
4637 goto PCRE_ERROR_RETURN;
4638 }
4639 #endif
4640
4641 /* \P and \p are for Unicode properties, but only when the support has
4642 been compiled. Each item needs 2 bytes. */
4643
4644 else if (-c == ESC_P || -c == ESC_p)
4645 {
4646 #ifdef SUPPORT_UCP
4647 BOOL negated;
4648 length += 2;
4649 lastitemlength = 2;
4650 if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4651 continue;
4652 #else
4653 *errorptr = ERR45;
4654 goto PCRE_ERROR_RETURN;
4655 #endif
4656 }
4657
4658 /* Other escapes need one byte */
4659
4660 length++;
4661
4662 /* A back reference needs an additional 2 bytes, plus either one or 5
4663 bytes for a repeat. We also need to keep the value of the highest
4664 back reference. */
4665
4666 if (c <= -ESC_REF)
4667 {
4668 int refnum = -c - ESC_REF;
4669 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4670 if (refnum > compile_block.top_backref)
4671 compile_block.top_backref = refnum;
4672 length += 2; /* For single back reference */
4673 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4674 {
4675 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4676 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4677 if ((min == 0 && (max == 1 || max == -1)) ||
4678 (min == 1 && max == -1))
4679 length++;
4680 else length += 5;
4681 if (ptr[1] == '?') ptr++;
4682 }
4683 }
4684 continue;
4685
4686 case '^': /* Single-byte metacharacters */
4687 case '.':
4688 case '$':
4689 length++;
4690 lastitemlength = 1;
4691 continue;
4692
4693 case '*': /* These repeats won't be after brackets; */
4694 case '+': /* those are handled separately */
4695 case '?':
4696 length++;
4697 goto POSESSIVE; /* A few lines below */
4698
4699 /* This covers the cases of braced repeats after a single char, metachar,
4700 class, or back reference. */
4701
4702 case '{':
4703 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4704 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4705 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4706
4707 /* These special cases just insert one extra opcode */
4708
4709 if ((min == 0 && (max == 1 || max == -1)) ||
4710 (min == 1 && max == -1))
4711 length++;
4712
4713 /* These cases might insert additional copies of a preceding character. */
4714
4715 else
4716 {
4717 if (min != 1)
4718 {
4719 length -= lastitemlength; /* Uncount the original char or metachar */
4720 if (min > 0) length += 3 + lastitemlength;
4721 }
4722 length += lastitemlength + ((max > 0)? 3 : 1);
4723 }
4724
4725 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4726
4727 POSESSIVE: /* Test for possessive quantifier */
4728 if (ptr[1] == '+')
4729 {
4730 ptr++;
4731 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4732 }
4733 continue;
4734
4735 /* An alternation contains an offset to the next branch or ket. If any ims
4736 options changed in the previous branch(es), and/or if we are in a
4737 lookbehind assertion, extra space will be needed at the start of the
4738 branch. This is handled by branch_extra. */
4739
4740 case '|':
4741 length += 1 + LINK_SIZE + branch_extra;
4742 continue;
4743
4744 /* A character class uses 33 characters provided that all the character
4745 values are less than 256. Otherwise, it uses a bit map for low valued
4746 characters, and individual items for others. Don't worry about character
4747 types that aren't allowed in classes - they'll get picked up during the
4748 compile. A character class that contains only one single-byte character
4749 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4750 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4751
4752 case '[':
4753 if (*(++ptr) == '^')
4754 {
4755 class_optcount = 10; /* Greater than one */
4756 ptr++;
4757 }
4758 else class_optcount = 0;
4759
4760 #ifdef SUPPORT_UTF8
4761 class_utf8 = FALSE;
4762 #endif
4763
4764 /* Written as a "do" so that an initial ']' is taken as data */
4765
4766 if (*ptr != 0) do
4767 {
4768 /* Inside \Q...\E everything is literal except \E */
4769
4770 if (inescq)
4771 {
4772 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4773 inescq = FALSE;
4774 ptr += 1;
4775 continue;
4776 }
4777
4778 /* Outside \Q...\E, check for escapes */
4779
4780 if (*ptr == '\\')
4781 {
4782 c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4783 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4784
4785 /* \b is backspace inside a class; \X is literal */
4786
4787 if (-c == ESC_b) c = '\b';
4788 else if (-c == ESC_X) c = 'X';
4789
4790 /* \Q enters quoting mode */
4791
4792 else if (-c == ESC_Q)
4793 {
4794 inescq = TRUE;
4795 continue;
4796 }
4797
4798 /* Handle escapes that turn into characters */
4799
4800 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4801
4802 /* Escapes that are meta-things. The normal ones just affect the
4803 bit map, but Unicode properties require an XCLASS extended item. */
4804
4805 else
4806 {
4807 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4808 #ifdef SUPPORT_UTF8
4809 if (-c == ESC_p || -c == ESC_P)
4810 {
4811 if (!class_utf8)
4812 {
4813 class_utf8 = TRUE;
4814 length += LINK_SIZE + 2;
4815 }
4816 length += 2;
4817 }
4818 #endif
4819 }
4820 }
4821
4822 /* Check the syntax for POSIX stuff. The bits we actually handle are
4823 checked during the real compile phase. */
4824
4825 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4826 {
4827 ptr++;
4828 class_optcount = 10; /* Make sure > 1 */
4829 }
4830
4831 /* Anything else increments the possible optimization count. We have to
4832 detect ranges here so that we can compute the number of extra ranges for
4833 caseless wide characters when UCP support is available. If there are wide
4834 characters, we are going to have to use an XCLASS, even for single
4835 characters. */
4836
4837 else
4838 {
4839 int d;
4840
4841 GET_ONE_CHARACTER:
4842
4843 #ifdef SUPPORT_UTF8
4844 if (utf8)
4845 {
4846 int extra = 0;
4847 GETCHARLEN(c, ptr, extra);
4848 ptr += extra;
4849 }
4850 else c = *ptr;
4851 #else
4852 c = *ptr;
4853 #endif
4854
4855 /* Come here from handling \ above when it escapes to a char value */
4856
4857 NON_SPECIAL_CHARACTER:
4858 class_optcount++;
4859
4860 d = -1;
4861 if (ptr[1] == '-')
4862 {
4863 uschar const *hyptr = ptr++;
4864 if (ptr[1] == '\\')
4865 {
4866 ptr++;
4867 d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4868 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4869 if (-d == ESC_b) d = '\b'; /* backspace */
4870 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4871 }
4872 else if (ptr[1] != 0 && ptr[1] != ']')
4873 {
4874 ptr++;
4875 #ifdef SUPPORT_UTF8
4876 if (utf8)
4877 {
4878 int extra = 0;
4879 GETCHARLEN(d, ptr, extra);
4880 ptr += extra;
4881 }
4882 else
4883 #endif
4884 d = *ptr;
4885 }
4886 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4887 }
4888
4889 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4890 127 for caseless matching, we will need to use an XCLASS. */
4891
4892 if (d >= 0)
4893 {
4894 class_optcount = 10; /* Ensure > 1 */
4895 if (d < c)
4896 {
4897 *errorptr = ERR8;
4898 goto PCRE_ERROR_RETURN;
4899 }
4900
4901 #ifdef SUPPORT_UTF8
4902 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4903 {
4904 uschar buffer[6];
4905 if (!class_utf8) /* Allow for XCLASS overhead */
4906 {
4907 class_utf8 = TRUE;
4908 length += LINK_SIZE + 2;
4909 }
4910
4911 #ifdef SUPPORT_UCP
4912 /* If we have UCP support, find out how many extra ranges are
4913 needed to map the other case of characters within this range. We
4914 have to mimic the range optimization here, because extending the
4915 range upwards might push d over a boundary that makes is use
4916 another byte in the UTF-8 representation. */
4917
4918 if ((options & PCRE_CASELESS) != 0)
4919 {
4920 int occ, ocd;
4921 int cc = c;
4922 int origd = d;
4923 while (get_othercase_range(&cc, origd, &occ, &ocd))
4924 {
4925 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4926
4927 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4928 { /* if there is overlap, */
4929 c = occ; /* noting that if occ < c */
4930 continue; /* we can't have ocd > d */
4931 } /* because a subrange is */
4932 if (ocd > d && occ <= d + 1) /* always shorter than */
4933 { /* the basic range. */
4934 d = ocd;
4935 continue;
4936 }
4937
4938 /* An extra item is needed */
4939
4940 length += 1 + ord2utf8(occ, buffer) +
4941 ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4942 }
4943 }
4944 #endif /* SUPPORT_UCP */
4945
4946 /* The length of the (possibly extended) range */
4947
4948 length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4949 }
4950 #endif /* SUPPORT_UTF8 */
4951
4952 }
4953
4954 /* We have a single character. There is nothing to be done unless we
4955 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4956 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4957 support. */
4958
4959 else
4960 {
4961 #ifdef SUPPORT_UTF8
4962 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4963 {
4964 uschar buffer[6];
4965 class_optcount = 10; /* Ensure > 1 */
4966 if (!class_utf8) /* Allow for XCLASS overhead */
4967 {
4968 class_utf8 = TRUE;
4969 length += LINK_SIZE + 2;
4970 }
4971 #ifdef SUPPORT_UCP
4972 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4973 (1 + ord2utf8(c, buffer));
4974 #else /* SUPPORT_UCP */
4975 length += 1 + ord2utf8(c, buffer);
4976 #endif /* SUPPORT_UCP */
4977 }
4978 #endif /* SUPPORT_UTF8 */
4979 }
4980 }
4981 }
4982 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4983
4984 if (*ptr == 0) /* Missing terminating ']' */
4985 {
4986 *errorptr = ERR6;
4987 goto PCRE_ERROR_RETURN;
4988 }
4989
4990 /* We can optimize when there was only one optimizable character. Repeats
4991 for positive and negated single one-byte chars are handled by the general
4992 code. Here, we handle repeats for the class opcodes. */
4993
4994 if (class_optcount == 1) length += 3; else
4995 {
4996 length += 33;
4997
4998 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4999 we also need extra for wrapping the whole thing in a sub-pattern. */
5000
5001 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5002 {
5003 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5004 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5005 if ((min == 0 && (max == 1 || max == -1)) ||
5006 (min == 1 && max == -1))
5007 length++;
5008 else length += 5;
5009 if (ptr[1] == '+')
5010 {
5011 ptr++;
5012 length += 2 + 2*LINK_SIZE;
5013 }
5014 else if (ptr[1] == '?') ptr++;
5015 }
5016 }
5017 continue;
5018
5019 /* Brackets may be genuine groups or special things */
5020
5021 case '(':
5022 branch_newextra = 0;
5023 bracket_length = 1 + LINK_SIZE;
5024
5025 /* Handle special forms of bracket, which all start (? */
5026
5027 if (ptr[1] == '?')
5028 {
5029 int set, unset;
5030 int *optset;
5031
5032 switch (c = ptr[2])
5033 {
5034 /* Skip over comments entirely */
5035 case '#':
5036 ptr += 3;
5037 while (*ptr != 0 && *ptr != ')') ptr++;
5038 if (*ptr == 0)
5039 {
5040 *errorptr = ERR18;
5041 goto PCRE_ERROR_RETURN;
5042 }
5043 continue;
5044
5045 /* Non-referencing groups and lookaheads just move the pointer on, and
5046 then behave like a non-special bracket, except that they don't increment
5047 the count of extracting brackets. Ditto for the "once only" bracket,
5048 which is in Perl from version 5.005. */
5049
5050 case ':':
5051 case '=':
5052 case '!':
5053 case '>':
5054 ptr += 2;
5055 break;
5056
5057 /* (?R) specifies a recursive call to the regex, which is an extension
5058 to provide the facility which can be obtained by (?p{perl-code}) in
5059 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5060
5061 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5062 the appropriate numbered brackets. This includes both recursive and
5063 non-recursive calls. (?R) is now synonymous with (?0). */
5064
5065 case 'R':
5066 ptr++;
5067
5068 case '0': case '1': case '2': case '3': case '4':
5069 case '5': case '6': case '7': case '8': case '9':
5070 ptr += 2;
5071 if (c != 'R')
5072 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5073 if (*ptr != ')')
5074 {
5075 *errorptr = ERR29;
5076 goto PCRE_ERROR_RETURN;
5077 }
5078 length += 1 + LINK_SIZE;
5079
5080 /* If this item is quantified, it will get wrapped inside brackets so
5081 as to use the code for quantified brackets. We jump down and use the
5082 code that handles this for real brackets. */
5083
5084 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5085 {
5086 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
5087 duplength = 5 + 3 * LINK_SIZE;
5088 goto HANDLE_QUANTIFIED_BRACKETS;
5089 }
5090 continue;
5091
5092 /* (?C) is an extension which provides "callout" - to provide a bit of
5093 the functionality of the Perl (?{...}) feature. An optional number may
5094 follow (default is zero). */
5095
5096 case 'C':
5097 ptr += 2;
5098 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5099 if (*ptr != ')')
5100 {
5101 *errorptr = ERR39;
5102 goto PCRE_ERROR_RETURN;
5103 }
5104 length += 2 + 2*LINK_SIZE;
5105 continue;
5106
5107 /* Named subpatterns are an extension copied from Python */
5108
5109 case 'P':
5110 ptr += 3;
5111 if (*ptr == '<')
5112 {
5113 const uschar *p; /* Don't amalgamate; some compilers */
5114 p = ++ptr; /* grumble at autoincrement in declaration */
5115 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5116 if (*ptr != '>')
5117 {
5118 *errorptr = ERR42;
5119 goto PCRE_ERROR_RETURN;
5120 }
5121 name_count++;
5122 if (ptr - p > max_name_size) max_name_size = (ptr - p);
5123 break;
5124 }
5125
5126 if (*ptr == '=' || *ptr == '>')
5127 {
5128 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5129 if (*ptr != ')')
5130 {
5131 *errorptr = ERR42;
5132 goto PCRE_ERROR_RETURN;
5133 }
5134 break;
5135 }
5136
5137 /* Unknown character after (?P */
5138
5139 *errorptr = ERR41;
5140 goto PCRE_ERROR_RETURN;
5141
5142 /* Lookbehinds are in Perl from version 5.005 */
5143
5144 case '<':
5145 ptr += 3;
5146 if (*ptr == '=' || *ptr == '!')
5147 {
5148 branch_newextra = 1 + LINK_SIZE;
5149 length += 1 + LINK_SIZE; /* For the first branch */
5150 break;
5151 }
5152 *errorptr = ERR24;
5153 goto PCRE_ERROR_RETURN;
5154
5155 /* Conditionals are in Perl from version 5.005. The bracket must either
5156 be followed by a number (for bracket reference) or by an assertion
5157 group, or (a PCRE extension) by 'R' for a recursion test. */
5158
5159 case '(':
5160 if (ptr[3] == 'R' && ptr[4] == ')')
5161 {
5162 ptr += 4;
5163 length += 3;
5164 }
5165 else if ((digitab[ptr[3]] & ctype_digit) != 0)
5166 {
5167 ptr += 4;
5168 length += 3;
5169 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5170 if (*ptr != ')')
5171 {
5172 *errorptr = ERR26;
5173 goto PCRE_ERROR_RETURN;
5174 }
5175 }
5176 else /* An assertion must follow */
5177 {
5178 ptr++; /* Can treat like ':' as far as spacing is concerned */
5179 if (ptr[2] != '?' ||
5180 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5181 {
5182 ptr += 2; /* To get right offset in message */
5183 *errorptr = ERR28;
5184 goto PCRE_ERROR_RETURN;
5185 }
5186 }
5187 break;
5188
5189 /* Else loop checking valid options until ) is met. Anything else is an
5190 error. If we are without any brackets, i.e. at top level, the settings
5191 act as if specified in the options, so massage the options immediately.
5192 This is for backward compatibility with Perl 5.004. */
5193
5194 default:
5195 set = unset = 0;
5196 optset = &set;
5197 ptr += 2;
5198
5199 for (;; ptr++)
5200 {
5201 c = *ptr;
5202 switch (c)
5203 {
5204 case 'i':
5205 *optset |= PCRE_CASELESS;
5206 continue;
5207
5208 case 'm':
5209 *optset |= PCRE_MULTILINE;
5210 continue;
5211
5212 case 's':
5213 *optset |= PCRE_DOTALL;
5214 continue;
5215
5216 case 'x':
5217 *optset |= PCRE_EXTENDED;
5218 continue;
5219
5220 case 'X':
5221 *optset |= PCRE_EXTRA;
5222 continue;
5223
5224 case 'U':
5225 *optset |= PCRE_UNGREEDY;
5226 continue;
5227
5228 case '-':
5229 optset = &unset;
5230 continue;
5231
5232 /* A termination by ')' indicates an options-setting-only item; if
5233 this is at the very start of the pattern (indicated by item_count
5234 being zero), we use it to set the global options. This is helpful
5235 when analyzing the pattern for first characters, etc. Otherwise
5236 nothing is done here and it is handled during the compiling
5237 process.
5238
5239 [Historical note: Up to Perl 5.8, options settings at top level
5240 were always global settings, wherever they appeared in the pattern.
5241 That is, they were equivalent to an external setting. From 5.8
5242 onwards, they apply only to what follows (which is what you might
5243 expect).] */
5244
5245 case ')':
5246 if (item_count == 0)
5247 {
5248 options = (options | set) & (~unset);
5249 set = unset = 0; /* To save length */
5250 item_count--; /* To allow for several */
5251 }
5252
5253 /* Fall through */
5254
5255 /* A termination by ':' indicates the start of a nested group with
5256 the given options set. This is again handled at compile time, but
5257 we must allow for compiled space if any of the ims options are
5258 set. We also have to allow for resetting space at the end of
5259 the group, which is why 4 is added to the length and not just 2.
5260 If there are several changes of options within the same group, this
5261 will lead to an over-estimate on the length, but this shouldn't
5262 matter very much. We also have to allow for resetting options at
5263 the start of any alternations, which we do by setting
5264 branch_newextra to 2. Finally, we record whether the case-dependent
5265 flag ever changes within the regex. This is used by the "required
5266 character" code. */
5267
5268 case ':':
5269 if (((set|unset) & PCRE_IMS) != 0)
5270 {
5271 length += 4;
5272 branch_newextra = 2;
5273 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5274 }
5275 goto END_OPTIONS;
5276
5277 /* Unrecognized option character */
5278
5279 default:
5280 *errorptr = ERR12;
5281 goto PCRE_ERROR_RETURN;
5282 }
5283 }
5284
5285 /* If we hit a closing bracket, that's it - this is a freestanding
5286 option-setting. We need to ensure that branch_extra is updated if
5287 necessary. The only values branch_newextra can have here are 0 or 2.
5288 If the value is 2, then branch_extra must either be 2 or 5, depending
5289 on whether this is a lookbehind group or not. */
5290
5291 END_OPTIONS:
5292 if (c == ')')
5293 {
5294 if (branch_newextra == 2 &&
5295 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5296 branch_extra += branch_newextra;
5297 continue;
5298 }
5299
5300 /* If options were terminated by ':' control comes here. Fall through
5301 to handle the group below. */
5302 }
5303 }
5304
5305 /* Extracting brackets must be counted so we can process escapes in a
5306 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5307 need an additional 3 bytes of store per extracting bracket. However, if
5308 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5309 must leave the count alone (it will always be zero). */
5310
5311 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5312 {
5313 bracount++;
5314 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5315 }
5316
5317 /* Save length for computing whole length at end if there's a repeat that
5318 requires duplication of the group. Also save the current value of
5319 branch_extra, and start the new group with the new value. If non-zero, this
5320 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
5321
5322 if (brastackptr >= sizeof(brastack)/sizeof(int))
5323 {
5324 *errorptr = ERR19;
5325 goto PCRE_ERROR_RETURN;
5326 }
5327
5328 bralenstack[brastackptr] = branch_extra;
5329 branch_extra = branch_newextra;
5330
5331 brastack[brastackptr++] = length;
5332 length += bracket_length;
5333 continue;
5334
5335 /* Handle ket. Look for subsequent max/min; for certain sets of values we
5336 have to replicate this bracket up to that many times. If brastackptr is
5337 0 this is an unmatched bracket which will generate an error, but take care
5338 not to try to access brastack[-1] when computing the length and restoring
5339 the branch_extra value. */
5340
5341 case ')':
5342 length += 1 + LINK_SIZE;
5343 if (brastackptr > 0)
5344 {
5345 duplength = length - brastack[--brastackptr];
5346 branch_extra = bralenstack[brastackptr];
5347 }
5348 else duplength = 0;
5349
5350 /* The following code is also used when a recursion such as (?3) is
5351 followed by a quantifier, because in that case, it has to be wrapped inside
5352 brackets so that the quantifier works. The value of duplength must be
5353 set before arrival. */
5354
5355 HANDLE_QUANTIFIED_BRACKETS:
5356
5357 /* Leave ptr at the final char; for read_repeat_counts this happens
5358 automatically; for the others we need an increment. */
5359
5360 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5361 {
5362 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5363 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5364 }
5365 else if (c == '*') { min = 0; max = -1; ptr++; }
5366 else if (c == '+') { min = 1; max = -1; ptr++; }
5367 else if (c == '?') { min = 0; max = 1; ptr++; }
5368 else { min = 1; max = 1; }
5369
5370 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5371 group, and if the maximum is greater than zero, we have to replicate
5372 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5373 bracket set. */
5374
5375 if (min == 0)
5376 {
5377 length++;
5378 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5379 }
5380
5381 /* When the minimum is greater than zero, we have to replicate up to
5382 minval-1 times, with no additions required in the copies. Then, if there
5383 is a limited maximum we have to replicate up to maxval-1 times allowing
5384 for a BRAZERO item before each optional copy and nesting brackets for all
5385 but one of the optional copies. */
5386
5387 else
5388 {
5389 length += (min - 1) * duplength;
5390 if (max > min) /* Need this test as max=-1 means no limit */
5391 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5392 - (2 + 2*LINK_SIZE);
5393 }
5394
5395 /* Allow space for once brackets for "possessive quantifier" */
5396
5397 if (ptr[1] == '+')
5398 {
5399 ptr++;
5400 length += 2 + 2*LINK_SIZE;
5401 }
5402 continue;
5403
5404 /* Non-special character. It won't be space or # in extended mode, so it is
5405 always a genuine character. If we are in a \Q...\E sequence, check for the
5406 end; if not, we have a literal. */
5407
5408 default:
5409 NORMAL_CHAR:
5410
5411 if (inescq && c == '\\' && ptr[1] == 'E')
5412 {
5413 inescq = FALSE;
5414 ptr++;
5415 continue;
5416 }
5417
5418 length += 2; /* For a one-byte character */
5419 lastitemlength = 1; /* Default length of last item for repeats */
5420
5421 /* In UTF-8 mode, check for additional bytes. */
5422
5423 #ifdef SUPPORT_UTF8
5424 if (utf8 && (c & 0xc0) == 0xc0)
5425 {
5426 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5427 { /* because the end is marked */
5428 lastitemlength++; /* by a zero byte. */
5429 length++;
5430 ptr++;
5431 }
5432 }
5433 #endif
5434
5435 continue;
5436 }
5437 }
5438
5439 length += 2 + LINK_SIZE; /* For final KET and END */
5440
5441 if ((options & PCRE_AUTO_CALLOUT) != 0)
5442 length += 2 + 2*LINK_SIZE; /* For final callout */
5443
5444 if (length > MAX_PATTERN_SIZE)
5445 {
5446 *errorptr = ERR20;
5447 return NULL;
5448 }
5449
5450 /* Compute the size of data block needed and get it, either from malloc or
5451 externally provided function. */
5452
5453 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5454 re = (real_pcre *)(pcre_malloc)(size);
5455
5456 if (re == NULL)
5457 {
5458 *errorptr = ERR21;
5459 return NULL;
5460 }
5461
5462 /* Put in the magic number, and save the sizes, options, and character table
5463 pointer. NULL is used for the default character tables. The nullpad field is at
5464 the end; it's there to help in the case when a regex compiled on a system with
5465 4-byte pointers is run on another with 8-byte pointers. */
5466
5467 re->magic_number = MAGIC_NUMBER;
5468 re->size = size;
5469 re->options = options;
5470 re->dummy1 = re->dummy2 = 0;
5471 re->name_table_offset = sizeof(real_pcre);
5472 re->name_entry_size = max_name_size + 3;
5473 re->name_count = name_count;
5474 re->tables = (tables == pcre_default_tables)? NULL : tables;
5475 re->nullpad = NULL;
5476
5477 /* The starting points of the name/number translation table and of the code are
5478 passed around in the compile data block. */
5479
5480 compile_block.names_found = 0;
5481 compile_block.name_entry_size = max_name_size + 3;
5482 compile_block.name_table = (uschar *)re + re->name_table_offset;
5483 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5484 compile_block.start_code = codestart;
5485 compile_block.start_pattern = (const uschar *)pattern;
5486 compile_block.req_varyopt = 0;
5487 compile_block.nopartial = FALSE;
5488
5489 /* Set up a starting, non-extracting bracket, then compile the expression. On
5490 error, *errorptr will be set non-NULL, so we don't need to look at the result
5491 of the function here. */
5492
5493 ptr = (const uschar *)pattern;
5494 code = (uschar *)codestart;
5495 *code = OP_BRA;
5496 bracount = 0;
5497 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5498 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5499 re->top_bracket = bracount;
5500 re->top_backref = compile_block.top_backref;
5501
5502 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5503
5504 /* If not reached end of pattern on success, there's an excess bracket. */
5505
5506 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5507
5508 /* Fill in the terminating state and check for disastrous overflow, but
5509 if debugging, leave the test till after things are printed out. */
5510
5511 *code++ = OP_END;
5512
5513 #ifndef DEBUG
5514 if (code - codestart > length) *errorptr = ERR23;
5515 #endif
5516
5517 /* Give an error if there's back reference to a non-existent capturing
5518 subpattern. */
5519
5520 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5521
5522 /* Failed to compile, or error while post-processing */
5523
5524 if (*errorptr != NULL)
5525 {
5526 (pcre_free)(re);
5527 PCRE_ERROR_RETURN:
5528 *erroroffset = ptr - (const uschar *)pattern;
5529 return NULL;
5530 }
5531
5532 /* If the anchored option was not passed, set the flag if we can determine that
5533 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5534 as starting with .* when DOTALL is set).
5535
5536 Otherwise, if we know what the first character has to be, save it, because that
5537 speeds up unanchored matches no end. If not, see if we can set the
5538 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5539 start with ^. and also when all branches start with .* for non-DOTALL matches.
5540 */
5541
5542 if ((options & PCRE_ANCHORED) == 0)
5543 {
5544 int temp_options = options;
5545 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5546 re->options |= PCRE_ANCHORED;
5547 else
5548 {
5549 if (firstbyte < 0)
5550 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5551 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5552 {
5553 int ch = firstbyte & 255;
5554 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5555 compile_block.fcc[ch] == ch)? ch : firstbyte;
5556 re->options |= PCRE_FIRSTSET;
5557 }
5558 else if (is_startline(codestart, 0, compile_block.backref_map))
5559 re->options |= PCRE_STARTLINE;
5560 }
5561 }
5562
5563 /* For an anchored pattern, we use the "required byte" only if it follows a
5564 variable length item in the regex. Remove the caseless flag for non-caseable
5565 bytes. */
5566
5567 if (reqbyte >= 0 &&
5568 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5569 {
5570 int ch = reqbyte & 255;
5571 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5572 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5573 re->options |= PCRE_REQCHSET;
5574 }
5575
5576 /* Print out the compiled data for debugging */
5577
5578 #ifdef DEBUG
5579
5580 printf("Length = %d top_bracket = %d top_backref = %d\n",
5581 length, re->top_bracket, re->top_backref);
5582
5583 if (re->options != 0)
5584 {
5585 printf("%s%s%s%s%s%s%s%s%s%s\n",
5586 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5587 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5588 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5589 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5590 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5591 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5592 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5593 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5594 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5595 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5596 }
5597
5598 if ((re->options & PCRE_FIRSTSET) != 0)
5599 {
5600 int ch = re->first_byte & 255;
5601 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5602 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5603 else printf("First char = \\x%02x%s\n", ch, caseless);
5604 }
5605
5606 if ((re->options & PCRE_REQCHSET) != 0)
5607 {
5608 int ch = re->req_byte & 255;
5609 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5610 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5611 else printf("Req char = \\x%02x%s\n", ch, caseless);
5612 }
5613
5614 print_internals(re, stdout);
5615
5616 /* This check is done here in the debugging case so that the code that
5617 was compiled can be seen. */
5618
5619 if (code - codestart > length)
5620 {
5621 *errorptr = ERR23;
5622 (pcre_free)(re);
5623 *erroroffset = ptr - (uschar *)pattern;
5624 return NULL;
5625 }
5626 #endif
5627
5628 return (pcre *)re;
5629 }
5630
5631
5632
5633 /*************************************************
5634 * Match a back-reference *
5635 *************************************************/
5636
5637 /* If a back reference hasn't been set, the length that is passed is greater
5638 than the number of characters left in the string, so the match fails.
5639 The referenced text starts at md->start_subject + md->offset_vector[offset].
5640 Arguments:
5641 offset index into the offset vector
5642 eptr points into the subject
5643 length length to be matched
5644 md points to match data block
5645 ims the ims flags (only PCRE_CASELESS is tested here)
5646
5647 Returns: TRUE if matched, FALSE on a mismatch or if too little subject is left
5648 */
5649
5650 static BOOL
5651 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5652 unsigned long int ims)
5653 {
5654 const uschar *p = md->start_subject + md->offset_vector[offset]; /* start of the referenced text */
5655 /* Debug builds only: print both of the strings that are about to be compared */
5656 #ifdef DEBUG
5657 if (eptr >= md->end_subject)
5658 printf("matching subject <null>");
5659 else
5660 {
5661 printf("matching subject ");
5662 pchars(eptr, length, TRUE, md);
5663 }
5664 printf(" against backref ");
5665 pchars(p, length, FALSE, md);
5666 printf("\n");
5667 #endif
5668
5669 /* Always fail if not enough characters left. This is also the test that
5669 rejects an unset back reference, for which an over-large length is passed. */
5670
5671 if (length > md->end_subject - eptr) return FALSE;
5672
5673 /* Separate the caseless case for speed */
5674
5675 if ((ims & PCRE_CASELESS) != 0)
5676 {
5677 while (length-- > 0)
5678 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; /* both sides are mapped through md->lcc */
5679 }
5680 else
5681 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5682
5683 return TRUE;
5684 }
5685
5686
5687 #ifdef SUPPORT_UTF8
5688 /*************************************************
5689 * Match character against an XCLASS *
5690 *************************************************/
5691
5692 /* This function is called from within the XCLASS code below, to match a
5693 character against an extended class which might match values > 255.
5694 The data is a flag byte, then a 32-byte bit map if XCL_MAP is set, then items up to XCL_END.
5695 Arguments:
5696 c the character
5697 data points to the flag byte of the XCLASS data
5698
5699 Returns: TRUE if character matches, else FALSE (XCL_NOT in the flag byte inverts the sense)
5700 */
5701
5702 static BOOL
5703 match_xclass(int c, const uschar *data)
5704 {
5705 int t;
5706 BOOL negated = (*data & XCL_NOT) != 0; /* class is negated: a hit yields FALSE, a miss TRUE */
5707
5708 /* Character values < 256 are matched against a bitmap, if one is present. If
5709 not, we still carry on, because there may be ranges that start below 256 in the
5710 additional data. */
5711
5712 if (c < 256)
5713 {
5714 if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) /* map starts at data[1]; bit c&7 of byte c/8 */
5715 return !negated; /* char found */
5716 }
5717
5718 /* First skip the bit map if present. Then match against the list of Unicode
5719 properties or large chars or ranges that end with a large char. We won't ever
5720 encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
5721
5722 if ((*data++ & XCL_MAP) != 0) data += 32; /* step past the flag byte, and past the map if there is one */
5723
5724 while ((t = *data++) != XCL_END) /* t is the type of the next item in the list */
5725 {
5726 int x, y;
5727 if (t == XCL_SINGLE)
5728 {
5729 GETCHARINC(x, data); /* presumably reads one character into x and advances data - macro is defined elsewhere, TODO confirm */
5730 if (c == x) return !negated;
5731 }
5732 else if (t == XCL_RANGE)
5733 {
5734 GETCHARINC(x, data);
5735 GETCHARINC(y, data);
5736 if (c >= x && c <= y) return !negated; /* the range x..y is inclusive at both ends */
5737 }
5738
5739 #ifdef SUPPORT_UCP
5740 else /* XCL_PROP & XCL_NOTPROP */
5741 {
5742 int chartype, othercase;
5743 int rqdtype = *data++; /* the required property value follows the item type */
5744 int category = ucp_findchar(c, &chartype, &othercase);
5745 if (rqdtype >= 128) /* values >= 128 are compared, less 128, with the category */
5746 {
5747 if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated; /* XCL_PROP needs equality, XCL_NOTPROP inequality */
5748 }
5749 else /* smaller values are compared with the character type */
5750 {
5751 if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated;
5752 }
5753 }
5754 #endif /* SUPPORT_UCP */
5755 }
5756
5757 return negated; /* char did not match */
5758 }
5759 #endif
5760
5761
5762 /***************************************************************************
5763 ****************************************************************************
5764 RECURSION IN THE match() FUNCTION
5765
5766 The match() function is highly recursive. Some regular expressions can cause
5767 it to recurse thousands of times. I was writing for Unix, so I just let it
5768 call itself recursively. This uses the stack for saving everything that has
5769 to be saved for a recursive call. On Unix, the stack can be large, and this
5770 works fine.
5771
5772 It turns out that on non-Unix systems there are problems with programs that
5773 use a lot of stack. (This despite the fact that every last chip has oodles
5774 of memory these days, and techniques for extending the stack have been known
5775 for decades.) So....
5776
5777 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5778 calls by keeping local variables that need to be preserved in blocks of memory
5779 obtained from malloc instead of on the stack. Macros are used to
5780 achieve this so that the actual code doesn't look very different to what it
5781 always used to.
5782 ****************************************************************************
5783 ***************************************************************************/
5784
5785
5786 /* These versions of the macros use the stack, as normal */
5787 /* Here RMATCH stores the result of a real recursive call of match() in rx, and RRETURN is a plain return. */
5788 #ifndef NO_RECURSE
5789 #define REGISTER register
5790 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
5791 #define RRETURN(ra) return ra
5792 #else
5793 /* RMATCH gets a new frame, saves a return point with setjmp() in the current frame's Xwhere, copies the */
5794 /* arguments into the new frame, links it via Xprevframe, makes it current, and jumps to HEAP_RECURSE. */
5795 /* These versions of the macros manage a private stack on the heap. Note
5796 that the rd argument of RMATCH isn't actually used. It's the md argument of
5797 match(), which never changes. */
5798 /* NOTE(review): the pcre_stack_malloc() result is written through without a NULL check - confirm whether failure must be handled. */
5799 #define REGISTER
5800 /* On the longjmp() path (setjmp() != 0) the frame is reloaded from md->thisframe and rx is taken from its Xresult. */
5801 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
5802 {\
5803 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
5804 if (setjmp(frame->Xwhere) == 0)\
5805 {\
5806 newframe->Xeptr = ra;\
5807 newframe->Xecode = rb;\
5808 newframe->Xoffset_top = rc;\
5809 newframe->Xims = re;\
5810 newframe->Xeptrb = rf;\
5811 newframe->Xflags = rg;\
5812 newframe->Xprevframe = frame;\
5813 frame = newframe;\
5814 DPRINTF(("restarting from line %d\n", __LINE__));\
5815 goto HEAP_RECURSE;\
5816 }\
5817 else\
5818 {\
5819 DPRINTF(("longjumped back to line %d\n", __LINE__));\
5820 frame = md->thisframe;\
5821 rx = frame->Xresult;\
5822 }\
5823 }
5824 /* RRETURN frees the current frame and steps back to Xprevframe. If there is one, ra is passed back in its Xresult and control longjmp()s to its Xwhere; otherwise this was the top-level frame and ra is really returned. Note that ra is evaluated only after frame has been switched back. */
5825 #define RRETURN(ra)\
5826 {\
5827 heapframe *newframe = frame;\
5828 frame = newframe->Xprevframe;\
5829 (pcre_stack_free)(newframe);\
5830 if (frame != NULL)\
5831 {\
5832 frame->Xresult = ra;\
5833 md->thisframe = frame;\
5834 longjmp(frame->Xwhere, 1);\
5835 }\
5836 return ra;\
5837 }
5838
5839
5840 /* Structure for remembering the local variables in a private frame */
5841 /* RMATCH allocates one of these per simulated recursion level of match(); RRETURN frees it again. */
5842 typedef struct heapframe {
5843 struct heapframe *Xprevframe; /* the frame that "called" this one; NULL marks the top level */
5844
5845 /* Function arguments that may change */
5846
5847 const uschar *Xeptr;
5848 const uschar *Xecode;
5849 int Xoffset_top;
5850 long int Xims; /* NOTE(review): match() declares ims as unsigned long int - confirm the signed type here is intended */
5851 eptrblock *Xeptrb;
5852 int Xflags;
5853
5854 /* Function local variables */
5855 /* Inside match() each field is reached through a macro named without the X, e.g. eptr is frame->Xeptr. */
5856 const uschar *Xcallpat;
5857 const uschar *Xcharptr;
5858 const uschar *Xdata;
5859 const uschar *Xnext;
5860 const uschar *Xpp;
5861 const uschar *Xprev;
5862 const uschar *Xsaved_eptr;
5863
5864 recursion_info Xnew_recursive;
5865
5866 BOOL Xcur_is_word;
5867 BOOL Xcondition;
5868 BOOL Xminimize;
5869 BOOL Xprev_is_word;
5870
5871 unsigned long int Xoriginal_ims;
5872
5873 #ifdef SUPPORT_UCP
5874 int Xprop_type;
5875 int Xprop_fail_result;
5876 int Xprop_category;
5877 int Xprop_chartype;
5878 int Xprop_othercase;
5879 int Xprop_test_against;
5880 int *Xprop_test_variable;
5881 #endif
5882
5883 int Xctype;
5884 int Xfc;
5885 int Xfi;
5886 int Xlength;
5887 int Xmax;
5888 int Xmin;
5889 int Xnumber;
5890 int Xoffset;
5891 int Xop;
5892 int Xsave_capture_last;
5893 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
5894 int Xstacksave[REC_STACK_SAVE_MAX];
5895
5896 eptrblock Xnewptrb;
5897
5898 /* Place to pass back result, and where to jump back to */
5899
5900 int Xresult; /* stored by RRETURN just before it longjmp()s to this frame */
5901 jmp_buf Xwhere; /* context saved by setjmp() in RMATCH */
5902
5903 } heapframe;
5904
5905 #endif
5906
5907
5908 /***************************************************************************
5909 ***************************************************************************/
5910
5911
5912
5913 /*************************************************
5914 * Match from current position *
5915 *************************************************/
5916
5917 /* On entry ecode points to the first opcode, and eptr to the first character
5918 in the subject string, while eptrb holds the value of eptr at the start of the
5919 last bracketed group - used for breaking infinite loops matching zero-length
5920 strings. This function is called recursively in many circumstances. Whenever it
5921 returns a negative (error) response, the outer incarnation must also return the
5922 same response.
5923
5924 Performance note: It might be tempting to extract commonly used fields from the
5925 md structure (e.g. utf8, end_subject) into individual variables to improve
5926 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5927 made performance worse.
5928
5929 Arguments:
5930 eptr pointer in subject
5931 ecode position in code
5932 offset_top current top pointer
5933 md pointer to "static" info for the match
5934 ims current /i, /m, and /s options
5935 eptrb pointer to chain of blocks containing eptr at start of
5936 brackets - for testing for empty matches
5937 flags can contain
5938 match_condassert - this is an assertion condition
5939 match_isgroup - this is the start of a bracketed group
5940
5941 Returns: MATCH_MATCH if matched ) these values are >= 0
5942 MATCH_NOMATCH if failed to match )
5943 a negative PCRE_ERROR_xxx value if aborted by an error condition
5944 (e.g. stopped by recursion limit)
5945 */
5946
5947 static int
5948 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5949 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5950 int flags)
5951 {
5952 /* These variables do not need to be preserved over recursion in this function,
5953 so they can be ordinary variables in all cases. Mark them with "register"
5954 because they are used a lot in loops. */
5955
5956 register int rrc; /* Returns from recursive calls */
5957 register int i; /* Used for loops not involving calls to RMATCH() */
5958 register int c; /* Character values not kept over RMATCH() calls */
5959
5960 /* When recursion is not being used, all "local" variables that have to be
5961 preserved over calls to RMATCH() are part of a "frame" which is obtained from
5962 heap storage. Set up the top-level frame here; others are obtained from the
5963 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5964
5965 #ifdef NO_RECURSE
5966 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5967 frame->Xprevframe = NULL; /* Marks the top level */
5968
5969 /* Copy in the original argument variables */
5970
5971 frame->Xeptr = eptr;
5972 frame->Xecode = ecode;
5973 frame->Xoffset_top = offset_top;
5974 frame->Xims = ims;
5975 frame->Xeptrb = eptrb;
5976 frame->Xflags = flags;
5977
5978 /* This is where control jumps back to to effect "recursion" */
5979
5980 HEAP_RECURSE:
5981
5982 /* Macros make the argument variables come from the current frame */
5983
5984 #define eptr frame->Xeptr
5985 #define ecode frame->Xecode
5986 #define offset_top frame->Xoffset_top
5987 #define ims frame->Xims
5988 #define eptrb frame->Xeptrb
5989 #define flags frame->Xflags
5990
5991 /* Ditto for the local variables */
5992
5993 #ifdef SUPPORT_UTF8
5994 #define charptr frame->Xcharptr
5995 #endif
5996 #define callpat frame->Xcallpat
5997 #define data frame->Xdata
5998 #define next frame->Xnext
5999 #define pp frame->Xpp
6000 #define prev frame->Xprev
6001 #define saved_eptr frame->Xsaved_eptr
6002
6003 #define new_recursive frame->Xnew_recursive
6004
6005 #define cur_is_word frame->Xcur_is_word
6006 #define condition frame->Xcondition
6007 #define minimize frame->Xminimize
6008 #define prev_is_word frame->Xprev_is_word
6009
6010 #define original_ims frame->Xoriginal_ims
6011
6012 #ifdef SUPPORT_UCP
6013 #define prop_type frame->Xprop_type
6014 #define prop_fail_result frame->Xprop_fail_result
6015 #define prop_category frame->Xprop_category
6016 #define prop_chartype frame->Xprop_chartype
6017 #define prop_othercase frame->Xprop_othercase
6018 #define prop_test_against frame->Xprop_test_against
6019 #define prop_test_variable frame->Xprop_test_variable
6020 #endif
6021
6022 #define ctype frame->Xctype
6023 #define fc frame->Xfc
6024 #define fi frame->Xfi
6025 #define length frame->Xlength
6026 #define max frame->Xmax
6027 #define min frame->Xmin
6028 #define number frame->Xnumber
6029 #define offset frame->Xoffset
6030 #define op frame->Xop
6031 #define save_capture_last frame->Xsave_capture_last
6032 #define save_offset1 frame->Xsave_offset1
6033 #define save_offset2 frame->Xsave_offset2
6034 #define save_offset3 frame->Xsave_offset3
6035 #define stacksave frame->Xstacksave
6036
6037 #define newptrb frame->Xnewptrb
6038
6039 /* When recursion is being used, local variables are allocated on the stack and
6040 get preserved during recursion in the normal way. In this environment, fi and
6041 i, and fc and c, can be the same variables. */
6042
6043 #else
6044 #define fi i
6045 #define fc c
6046
6047
6048 #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
6049 const uschar *charptr; /* small blocks of the code. My normal */
6050 #endif /* style of coding would have declared */
6051 const uschar *callpat; /* them within each of those blocks. */
6052 const uschar *data; /* However, in order to accommodate the */
6053 const uschar *next; /* version of this code that uses an */
6054 const uschar *pp; /* external "stack" implemented on the */
6055 const uschar *prev; /* heap, it is easier to declare them */
6056 const uschar *saved_eptr; /* all here, so the declarations can */
6057 /* be cut out in a block. The only */
6058 recursion_info new_recursive; /* declarations within blocks below are */
6059 /* for variables that do not have to */
6060 BOOL cur_is_word; /* be preserved over a recursive call */
6061 BOOL condition; /* to RMATCH(). */
6062 BOOL minimize;
6063 BOOL prev_is_word;
6064
6065 unsigned long int original_ims;
6066
6067 #ifdef SUPPORT_UCP
6068 int prop_type;
6069 int prop_fail_result;
6070 int prop_category;
6071 int prop_chartype;
6072 int prop_othercase;
6073 int prop_test_against;
6074 int *prop_test_variable;
6075 #endif
6076
6077 int ctype;
6078 int length;
6079 int max;
6080 int min;
6081 int number;
6082 int offset;
6083 int op;
6084 int save_capture_last;
6085 int save_offset1, save_offset2, save_offset3;
6086 int stacksave[REC_STACK_SAVE_MAX];
6087
6088 eptrblock newptrb;
6089 #endif
6090
5091 /* These statements are here to stop the compiler complaining about uninitialized
6092 variables. */
6093
6094 #ifdef SUPPORT_UCP
6095 prop_fail_result = 0;
6096 prop_test_against = 0;
6097 prop_test_variable = NULL;
6098 #endif
6099
6100 /* OK, now we can get on with the real code of the function. Recursion is
6101 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6102 these just turn into a recursive call to match() and a "return", respectively.
6103 However, RMATCH isn't like a function call because it's quite a complicated
6104 macro. It has to be used in one particular way. This shouldn't, however, impact
6105 performance when true recursion is being used. */
6106
6107 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6108
6109 original_ims = ims; /* Save for resetting on ')' */
6110
6111 /* At the start of a bracketed group, add the current subject pointer to the
6112 stack of such pointers, to be re-instated at the end of the group when we hit
6113 the closing ket. When match() is called in other circumstances, we don't add to
6114 this stack. */
6115
6116 if ((flags & match_isgroup) != 0)
6117 {
6118 newptrb.epb_prev = eptrb;
6119 newptrb.epb_saved_eptr = eptr;
6120 eptrb = &newptrb;
6121 }
6122
6123 /* Now start processing the operations. */
6124
6125 for (;;)
6126 {
6127 op = *ecode;
6128 minimize = FALSE;
6129
6130 /* For partial matching, remember if we ever hit the end of the subject after
6131 matching at least one subject character. */
6132
6133 if (md->partial &&
6134 eptr >= md->end_subject &&
6135 eptr > md->start_match)
6136 md->hitend = TRUE;
6137
6138 /* Opening capturing bracket. If there is space in the offset vector, save
6139 the current subject position in the working slot at the top of the vector. We
6140 mustn't change the current values of the data slot, because they may be set
6141 from a previous iteration of this group, and be referred to by a reference
6142 inside the group.
6143
6144 If the bracket fails to match, we need to restore this value and also the
6145 values of the final offsets, in case they were set by a previous iteration of
6146 the same bracket.
6147
6148 If there isn't enough space in the offset vector, treat this as if it were a
6149 non-capturing bracket. Don't worry about setting the flag for the error case
6150 here; that is handled in the code for KET. */
6151
6152 if (op > OP_BRA)
6153 {
6154 number = op - OP_BRA;
6155
6156 /* For extended extraction brackets (large number), we have to fish out the
6157 number from a dummy opcode at the start. */
6158
6159 if (number > EXTRACT_BASIC_MAX)
6160 number = GET2(ecode, 2+LINK_SIZE);
6161 offset = number << 1;
6162
6163 #ifdef DEBUG
6164 printf("start bracket %d subject=", number);
6165 pchars(eptr, 16, TRUE, md);
6166 printf("\n");
6167 #endif
6168
6169 if (offset < md->offset_max)
6170 {
6171 save_offset1 = md->offset_vector[offset];
6172 save_offset2 = md->offset_vector[offset+1];
6173 save_offset3 = md->offset_vector[md->offset_end - number];
6174 save_capture_last = md->capture_last;
6175
6176 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6177 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6178
6179 do
6180 {
6181 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6182 match_isgroup);
6183 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6184 md->capture_last = save_capture_last;
6185 ecode += GET(ecode, 1);
6186 }
6187 while (*ecode == OP_ALT);
6188
6189 DPRINTF(("bracket %d failed\n", number));
6190
6191 md->offset_vector[offset] = save_offset1;
6192 md->offset_vector[offset+1] = save_offset2;
6193 md->offset_vector[md->offset_end - number] = save_offset3;
6194
6195 RRETURN(MATCH_NOMATCH);
6196 }
6197
6198 /* Insufficient room for saving captured contents */
6199
6200 else op = OP_BRA;
6201 }
6202
6203 /* Other types of node can be handled by a switch */
6204
6205 switch(op)
6206 {
6207 case OP_BRA: /* Non-capturing bracket: optimized */
6208 DPRINTF(("start bracket 0\n"));
6209 do
6210 {
6211 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6212 match_isgroup);
6213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6214 ecode += GET(ecode, 1);
6215 }
6216 while (*ecode == OP_ALT);
6217 DPRINTF(("bracket 0 failed\n"));
6218 RRETURN(MATCH_NOMATCH);
6219
6220 /* Conditional group: compilation checked that there are no more than
6221 two branches. If the condition is false, skipping the first branch takes us
6222 past the end if there is only one branch, but that's OK because that is
6223 exactly what going to the ket would do. */
6224
6225 case OP_COND:
6226 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6227 {
6228 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
6229 condition = (offset == CREF_RECURSE * 2)?
6230 (md->recursive != NULL) :
6231 (offset < offset_top && md->offset_vector[offset] >= 0);
6232 RMATCH(rrc, eptr, ecode + (condition?
6233 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6234 offset_top, md, ims, eptrb, match_isgroup);
6235 RRETURN(rrc);
6236 }
6237
6238 /* The condition is an assertion. Call match() to evaluate it - setting the
6239 match_condassert flag causes it to stop at the end of an assertion. */
6240
6241 else
6242 {
6243 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6244 match_condassert | match_isgroup);
6245 if (rrc == MATCH_MATCH)
6246 {
6247 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6248 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6249 }
6250 else if (rrc != MATCH_NOMATCH)
6251 {
6252 RRETURN(rrc); /* Need braces because of following else */
6253 }
6254 else ecode += GET(ecode, 1);
6255 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6256 match_isgroup);
6257 RRETURN(rrc);
6258 }
6259 /* Control never reaches here */
6260
6261 /* Skip over conditional reference or large extraction number data if
6262 encountered. */
6263
6264 case OP_CREF:
6265 case OP_BRANUMBER:
6266 ecode += 3;
6267 break;
6268
6269 /* End of the pattern. If we are in a recursion, we should restore the
6270 offsets appropriately and continue from after the call. */
6271
6272 case OP_END:
6273 if (md->recursive != NULL && md->recursive->group_num == 0)
6274 {
6275 recursion_info *rec = md->recursive;
6276 DPRINTF(("Hit the end in a (?0) recursion\n"));
6277 md->recursive = rec->prevrec;
6278 memmove(md->offset_vector, rec->offset_save,
6279 rec->saved_max * sizeof(int));
6280 md->start_match = rec->save_start;
6281 ims = original_ims;
6282 ecode = rec->after_call;
6283 break;
6284 }
6285
6286 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6287 string - backtracking will then try other alternatives, if any. */
6288
6289 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6290 md->end_match_ptr = eptr; /* Record where we ended */
6291 md->end_offset_top = offset_top; /* and how many extracts were taken */
6292 RRETURN(MATCH_MATCH);
6293
6294 /* Change option settings */
6295
6296 case OP_OPT:
6297 ims = ecode[1];
6298 ecode += 2;
6299 DPRINTF(("ims set to %02lx\n", ims));
6300 break;
6301
6302 /* Assertion brackets. Check the alternative branches in turn - the
6303 matching won't pass the KET for an assertion. If any one branch matches,
6304 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6305 start of each branch to move the current point backwards, so the code at
6306 this level is identical to the lookahead case. */
6307
6308 case OP_ASSERT:
6309 case OP_ASSERTBACK:
6310 do
6311 {
6312 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6313 match_isgroup);
6314 if (rrc == MATCH_MATCH) break;
6315 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6316 ecode += GET(ecode, 1);
6317 }
6318 while (*ecode == OP_ALT);
6319 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6320
6321 /* If checking an assertion for a condition, return MATCH_MATCH. */
6322
6323 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6324
6325 /* Continue from after the assertion, updating the offsets high water
6326 mark, since extracts may have been taken during the assertion. */
6327
6328 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6329 ecode += 1 + LINK_SIZE;
6330 offset_top = md->end_offset_top;
6331 continue;
6332
6333 /* Negative assertion: all branches must fail to match */
6334
6335 case OP_ASSERT_NOT:
6336 case OP_ASSERTBACK_NOT:
6337 do
6338 {
6339 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6340 match_isgroup);
6341 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6342 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6343 ecode += GET(ecode,1);
6344 }
6345 while (*ecode == OP_ALT);
6346
6347 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6348
6349 ecode += 1 + LINK_SIZE;
6350 continue;
6351
6352 /* Move the subject pointer back. This occurs only at the start of
6353 each branch of a lookbehind assertion. If we are too close to the start to
6354 move back, this match function fails. When working with UTF-8 we move
6355 back a number of characters, not bytes. */
6356
6357 case OP_REVERSE:
6358 #ifdef SUPPORT_UTF8
6359 if (md->utf8)
6360 {
6361 c = GET(ecode,1);
6362 for (i = 0; i < c; i++)
6363 {
6364 eptr--;
6365 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6366 BACKCHAR(eptr)
6367 }
6368 }
6369 else
6370 #endif
6371
6372 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6373
6374 {
6375 eptr -= GET(ecode,1);
6376 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6377 }
6378
6379 /* Skip to next op code */
6380
6381 ecode += 1 + LINK_SIZE;
6382 break;
6383
6384 /* The callout item calls an external function, if one is provided, passing
6385 details of the match so far. This is mainly for debugging, though the
6386 function is able to force a failure. */
6387
6388 case OP_CALLOUT:
6389 if (pcre_callout != NULL)
6390 {
6391 pcre_callout_block cb;
6392 cb.version = 1; /* Version 1 of the callout block */
6393 cb.callout_number = ecode[1];
6394 cb.offset_vector = md->offset_vector;
6395 cb.subject = (const char *)md->start_subject;
6396 cb.subject_length = md->end_subject - md->start_subject;
6397 cb.start_match = md->start_match - md->start_subject;
6398 cb.current_position = eptr - md->start_subject;
6399 cb.pattern_position = GET(ecode, 2);
6400 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6401 cb.capture_top = offset_top/2;
6402 cb.capture_last = md->capture_last;
6403 cb.callout_data = md->callout_data;
6404 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6405 if (rrc < 0) RRETURN(rrc);
6406 }
6407 ecode += 2 + 2*LINK_SIZE;
6408 break;
6409
6410 /* Recursion either matches the current regex, or some subexpression. The
6411 offset data is the offset to the starting bracket from the start of the
6412 whole pattern. (This is so that it works from duplicated subpatterns.)
6413
6414 If there are any capturing brackets started but not finished, we have to
6415 save their starting points and reinstate them after the recursion. However,
6416 we don't know how many such there are (offset_top records the completed
6417 total) so we just have to save all the potential data. There may be up to
6418 65535 such values, which is too large to put on the stack, but using malloc
6419 for small numbers seems expensive. As a compromise, the stack is used when
6420 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6421 is used. If the malloc fails, the offsets cannot be saved, so the match is
6422 abandoned at this point by returning PCRE_ERROR_NOMEMORY via RRETURN; there
6423 is no fallback that saves only some of the values on the stack.
6424
6425 There are also other values that have to be saved. We use a chained
6426 sequence of blocks that actually live on the stack. Thanks to Robin Houston
6427 for the original version of this logic. */
6428
6429 case OP_RECURSE:
6430 {
6431 callpat = md->start_code + GET(ecode, 1);
6432 new_recursive.group_num = *callpat - OP_BRA;
6433
6434 /* For extended extraction brackets (large number), we have to fish out
6435 the number from a dummy opcode at the start. */
6436
6437 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6438 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6439
6440 /* Add to "recursing stack" */
6441
6442 new_recursive.prevrec = md->recursive;
6443 md->recursive = &new_recursive;
6444
6445 /* Find where to continue from afterwards */
6446
6447 ecode += 1 + LINK_SIZE;
6448 new_recursive.after_call = ecode;
6449
6450 /* Now save the offset data. */
6451
6452 new_recursive.saved_max = md->offset_end;
6453 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6454 new_recursive.offset_save = stacksave;
6455 else
6456 {
6457 new_recursive.offset_save =
6458 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6459 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6460 }
6461
6462 memcpy(new_recursive.offset_save, md->offset_vector,
6463 new_recursive.saved_max * sizeof(int));
6464 new_recursive.save_start = md->start_match;
6465 md->start_match = eptr;
6466
6467 /* OK, now we can do the recursion. For each top-level alternative we
6468 restore the offset and recursion data. */
6469
6470 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6471 do
6472 {
6473 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6474 eptrb, match_isgroup);
6475 if (rrc == MATCH_MATCH)
6476 {
6477 md->recursive = new_recursive.prevrec;
6478 if (new_recursive.offset_save != stacksave)
6479 (pcre_free)(new_recursive.offset_save);
6480 RRETURN(MATCH_MATCH);
6481 }
6482 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6483
6484 md->recursive = &new_recursive;
6485 memcpy(md->offset_vector, new_recursive.offset_save,
6486 new_recursive.saved_max * sizeof(int));
6487 callpat += GET(callpat, 1);
6488 }
6489 while (*callpat == OP_ALT);
6490
6491 DPRINTF(("Recursion didn't match\n"));
6492 md->recursive = new_recursive.prevrec;
6493 if (new_recursive.offset_save != stacksave)
6494 (pcre_free)(new_recursive.offset_save);
6495 RRETURN(MATCH_NOMATCH);
6496 }
6497 /* Control never reaches here */
6498
6499 /* "Once" brackets are like assertion brackets except that after a match,
6500 the point in the subject string is not moved back. Thus there can never be
6501 a move back into the brackets. Friedl calls these "atomic" subpatterns.
6502 Check the alternative branches in turn - the matching won't pass the KET
6503 for this kind of subpattern. If any one branch matches, we carry on as at
6504 the end of a normal bracket, leaving the subject pointer. */
6505
6506 case OP_ONCE:
6507 {
6508 prev = ecode;
6509 saved_eptr = eptr;
6510
6511 do
6512 {
6513 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6514 eptrb, match_isgroup);
6515 if (rrc == MATCH_MATCH) break;
6516 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6517 ecode += GET(ecode,1);
6518 }
6519 while (*ecode == OP_ALT);
6520
6521 /* If hit the end of the group (which could be repeated), fail */
6522
6523 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6524
6525 /* Continue as from after the assertion, updating the offsets high water
6526 mark, since extracts may have been taken. */
6527
6528 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6529
6530 offset_top = md->end_offset_top;
6531 eptr = md->end_match_ptr;
6532
6533 /* For a non-repeating ket, just continue at this level. This also
6534 happens for a repeating ket if no characters were matched in the group.
6535 This is the forcible breaking of infinite loops as implemented in Perl
6536 5.005. If there is an options reset, it will get obeyed in the normal
6537 course of events. */
6538
6539 if (*ecode == OP_KET || eptr == saved_eptr)
6540 {
6541 ecode += 1+LINK_SIZE;
6542 break;
6543 }
6544
6545 /* The repeating kets try the rest of the pattern or restart from the
6546 preceding bracket, in the appropriate order. We need to reset any options
6547 that changed within the bracket before re-running it, so check the next
6548 opcode. */
6549
6550 if (ecode[1+LINK_SIZE] == OP_OPT)
6551 {
6552 ims = (ims & ~PCRE_IMS) | ecode[4];
6553 DPRINTF(("ims set to %02lx at group repeat\n", ims));
6554 }
6555
6556 if (*ecode == OP_KETRMIN)
6557 {
6558 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6559 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6560 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6561 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6562 }
6563 else /* OP_KETRMAX */
6564 {
6565 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6566 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6567 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6568 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6569 }
6570 }
6571 RRETURN(MATCH_NOMATCH);
6572
6573 /* An alternation is the end of a branch; scan along to find the end of the
6574 bracketed group and go to there. */
6575
6576 case OP_ALT:
6577 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6578 break;
6579
6580 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6581 that it may occur zero times. It may repeat infinitely, or not at all -
6582 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6583 repeat limits are compiled as a number of copies, with the optional ones
6584 preceded by BRAZERO or BRAMINZERO. */
6585
6586 case OP_BRAZERO:
6587 {
6588 next = ecode+1;
6589 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6590 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6591 do next += GET(next,1); while (*next == OP_ALT);
6592 ecode = next + 1+LINK_SIZE;
6593 }
6594 break;
6595
6596 case OP_BRAMINZERO:
6597 {
6598 next = ecode+1;
6599 do next += GET(next,1); while (*next == OP_ALT);
6600 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6601 match_isgroup);
6602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6603 ecode++;
6604 }
6605 break;
6606
6607 /* End of a group, repeated or non-repeating. If we are at the end of
6608 an assertion "group", stop matching and return MATCH_MATCH, but record the
6609 current high water mark for use by positive assertions. Do this also
6610 for the "once" (non-backing-up) groups. */
6611
6612 case OP_KET:
6613 case OP_KETRMIN:
6614 case OP_KETRMAX:
6615 {
6616 prev = ecode - GET(ecode, 1);
6617 saved_eptr = eptrb->epb_saved_eptr;
6618
6619 /* Back up the stack of bracket start pointers. */
6620
6621 eptrb = eptrb->epb_prev;
6622
6623 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6624 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6625 *prev == OP_ONCE)
6626 {
6627 md->end_match_ptr = eptr; /* For ONCE */
6628 md->end_offset_top = offset_top;
6629 RRETURN(MATCH_MATCH);
6630 }
6631
6632 /* In all other cases except a conditional group we have to check the
6633 group number back at the start and if necessary complete handling an
6634 extraction by setting the offsets and bumping the high water mark. */
6635
6636 if (*prev != OP_COND)
6637 {
6638 number = *prev - OP_BRA;
6639
6640 /* For extended extraction brackets (large number), we have to fish out
6641 the number from a dummy opcode at the start. */
6642
6643 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6644 offset = number << 1;
6645
6646 #ifdef DEBUG
6647 printf("end bracket %d", number);
6648 printf("\n");
6649 #endif
6650
6651 /* Test for a numbered group. This includes groups called as a result
6652 of recursion. Note that whole-pattern recursion is coded as a recurse
6653 into group 0, so it won't be picked up here. Instead, we catch it when
6654 the OP_END is reached. */
6655
6656 if (number > 0)
6657 {
6658 md->capture_last = number;
6659 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6660 {
6661 md->offset_vector[offset] =
6662 md->offset_vector[md->offset_end - number];
6663 md->offset_vector[offset+1] = eptr - md->start_subject;
6664 if (offset_top <= offset) offset_top = offset + 2;
6665 }
6666
6667 /* Handle a recursively called group. Restore the offsets
6668 appropriately and continue from after the call. */
6669
6670 if (md->recursive != NULL && md->recursive->group_num == number)
6671 {
6672 recursion_info *rec = md->recursive;
6673 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6674 md->recursive = rec->prevrec;
6675 md->start_match = rec->save_start;
6676 memcpy(md->offset_vector, rec->offset_save,
6677 rec->saved_max * sizeof(int));
6678 ecode = rec->after_call;
6679 ims = original_ims;
6680 break;
6681 }
6682 }
6683 }
6684
6685 /* Reset the value of the ims flags, in case they got changed during
6686 the group. */
6687
6688 ims = original_ims;
6689 DPRINTF(("ims reset to %02lx\n", ims));
6690
6691 /* For a non-repeating ket, just continue at this level. This also
6692 happens for a repeating ket if no characters were