/[pcre]/code/branches/pcre16/pcretest.c
ViewVC logotype

Contents of /code/branches/pcre16/pcretest.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 774 - (show annotations)
Thu Dec 1 06:08:45 2011 UTC (8 years, 7 months ago) by zherczeg
File MIME type: text/plain
File size: 94005 byte(s)
better digit parsing, first_byte, req_byte are renamed to first_char req_char respectively
1 /*************************************************
2 * PCRE testing program *
3 *************************************************/
4
5 /* This program was hacked up as a tester for PCRE. I really should have
6 written it more tidily in the first place. Will I ever learn? It has grown and
7 been extended and consequently is now rather, er, *very* untidy in places.
8
9 -----------------------------------------------------------------------------
10 Redistribution and use in source and binary forms, with or without
11 modification, are permitted provided that the following conditions are met:
12
13 * Redistributions of source code must retain the above copyright notice,
14 this list of conditions and the following disclaimer.
15
16 * Redistributions in binary form must reproduce the above copyright
17 notice, this list of conditions and the following disclaimer in the
18 documentation and/or other materials provided with the distribution.
19
20 * Neither the name of the University of Cambridge nor the names of its
21 contributors may be used to endorse or promote products derived from
22 this software without specific prior written permission.
23
24 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
28 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35 -----------------------------------------------------------------------------
36 */
37
38
39 #ifdef HAVE_CONFIG_H
40 #include "config.h"
41 #endif
42
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
50
51 #ifdef SUPPORT_LIBREADLINE
52 #ifdef HAVE_UNISTD_H
53 #include <unistd.h>
54 #endif
55 #include <readline/readline.h>
56 #include <readline/history.h>
57 #endif
58
59
60 /* A number of things vary for Windows builds. Originally, pcretest opened its
61 input and output without "b"; then I was told that "b" was needed in some
62 environments, so it was added for release 5.0 to both the input and output. (It
63 makes no difference on Unix-like systems.) Later I was told that it is wrong
64 for the input on Windows. I've now abstracted the modes into two macros that
65 are set here, to make it easier to fiddle with them, and removed "b" from the
66 input mode under Windows. */
67
68 #if defined(_WIN32) || defined(WIN32)
69 #include <io.h> /* For _setmode() */
70 #include <fcntl.h> /* For _O_BINARY */
71 #define INPUT_MODE "r"
72 #define OUTPUT_MODE "wb"
73
74 #ifndef isatty
75 #define isatty _isatty /* This is what Windows calls them, I'm told, */
76 #endif /* though in some environments they seem to */
77 /* be already defined, hence the #ifndefs. */
78 #ifndef fileno
79 #define fileno _fileno
80 #endif
81
82 /* A user sent this fix for Borland Builder 5 under Windows. */
83
84 #ifdef __BORLANDC__
85 #define _setmode(handle, mode) setmode(handle, mode)
86 #endif
87
88 /* Not Windows */
89
90 #else
91 #include <sys/time.h> /* These two includes are needed */
92 #include <sys/resource.h> /* for setrlimit(). */
93 #define INPUT_MODE "rb"
94 #define OUTPUT_MODE "wb"
95 #endif
96
97
98 /* We have to include pcre_internal.h because we need the internal info for
99 displaying the results of pcre_study() and we also need to know about the
100 internal macros, structures, and other internal data values; pcretest has
101 "inside information" compared to a program that strictly follows the PCRE API.
102
103 Although pcre_internal.h does itself include pcre.h, we explicitly include it
104 here before pcre_internal.h so that the PCRE_EXP_xxx macros get set
105 appropriately for an application, not for building PCRE. */
106
107 #include "pcre.h"
108 #include "pcre_internal.h"
109
110 /* We need access to some of the data tables that PCRE uses. So as not to have
111 to keep two copies, we include the source file here, changing the names of the
112 external symbols to prevent clashes. */
113
114 #define _pcre_ucp_gentype ucp_gentype
115 #define _pcre_ucp_typerange ucp_typerange
116 #define _pcre_utf8_table1 utf8_table1
117 #define _pcre_utf8_table1_size utf8_table1_size
118 #define _pcre_utf8_table2 utf8_table2
119 #define _pcre_utf8_table3 utf8_table3
120 #define _pcre_utf8_table4 utf8_table4
121 #define _pcre_utt utt
122 #define _pcre_utt_size utt_size
123 #define _pcre_utt_names utt_names
124 #define _pcre_OP_lengths OP_lengths
125
126 #include "pcre_tables.c"
127
128 /* We also need the pcre_printint() function for printing out compiled
129 patterns. This function is in a separate file so that it can be included in
130 pcre_compile.c when that module is compiled with debugging enabled. It needs to
131 know which case is being compiled. */
132
133 #define COMPILING_PCRETEST
134 #include "pcre_printint.src"
135
136 /* The definition of the macro PRINTABLE, which determines whether to print an
137 output character as-is or as a hex value when showing compiled patterns, is
138 contained in the printint.src file. We use it here also, in cases when the
139 locale has not been explicitly changed, so as to get consistent output from
140 systems that differ in their output from isprint() even in the "C" locale. */
141
142 #define PRINTHEX(c) (locale_set? isprint(c) : PRINTABLE(c))
143
144 /* It is possible to compile this test program without including support for
145 testing the POSIX interface, though this is not available via the standard
146 Makefile. */
147
148 #if !defined NOPOSIX
149 #include "pcreposix.h"
150 #endif
151
152 /* It is also possible, for the benefit of the version currently imported into
153 Exim, to build pcretest without support for UTF8 (define NOUTF8), without the
154 interface to the DFA matcher (NODFA), and without the doublecheck of the old
155 "info" function (define NOINFOCHECK). In fact, we automatically cut out the
156 UTF8 support if PCRE is built without it. */
157
158 #ifndef SUPPORT_UTF8
159 #ifndef NOUTF8
160 #define NOUTF8
161 #endif
162 #endif
163
164
165 /* Other parameters */
166
167 #ifndef CLOCKS_PER_SEC
168 #ifdef CLK_TCK
169 #define CLOCKS_PER_SEC CLK_TCK
170 #else
171 #define CLOCKS_PER_SEC 100
172 #endif
173 #endif
174
175 /* This is the default loop count for timing. */
176
177 #define LOOPREPEAT 500000
178
179 /* Static variables */
180
181 static FILE *outfile;
182 static int log_store = 0;
183 static int callout_count;
184 static int callout_extra;
185 static int callout_fail_count;
186 static int callout_fail_id;
187 static int debug_lengths;
188 static int first_callout;
189 static int locale_set = 0;
190 static int show_malloc;
191 static int use_utf8;
192 static size_t gotten_store;
193 static const unsigned char *last_callout_mark = NULL;
194
195 /* The buffers grow automatically if very long input lines are encountered. */
196
197 static int buffer_size = 50000;
198 static pcre_uint8 *buffer = NULL;
199 static pcre_uint8 *dbuffer = NULL;
200 static pcre_uint8 *pbuffer = NULL;
201
202 /* Textual explanations for runtime error codes */
203
204 static const char *errtexts[] = {
205 NULL, /* 0 is no error */
206 NULL, /* NOMATCH is handled specially */
207 "NULL argument passed",
208 "bad option value",
209 "magic number missing",
210 "unknown opcode - pattern overwritten?",
211 "no more memory",
212 NULL, /* never returned by pcre_exec() or pcre_dfa_exec() */
213 "match limit exceeded",
214 "callout error code",
215 NULL, /* BADUTF8 is handled specially */
216 "bad UTF-8 offset",
217 NULL, /* PARTIAL is handled specially */
218 "not used - internal error",
219 "internal error - pattern overwritten?",
220 "bad count value",
221 "item unsupported for DFA matching",
222 "backreference condition or recursion test not supported for DFA matching",
223 "match limit not supported for DFA matching",
224 "workspace size exceeded in DFA matching",
225 "too much recursion for DFA matching",
226 "recursion limit exceeded",
227 "not used - internal error",
228 "invalid combination of newline options",
229 "bad offset value",
230 NULL, /* SHORTUTF8 is handled specially */
231 "nested recursion at the same subject position",
232 "JIT stack limit reached"
233 };
234
235
236 /*************************************************
237 * Alternate character tables *
238 *************************************************/
239
240 /* By default, the "tables" pointer when calling PCRE is set to NULL, thereby
241 using the default tables of the library. However, the T option can be used to
242 select alternate sets of tables, for different kinds of testing. Note also that
243 the L (locale) option also adjusts the tables. */
244
245 /* This is the set of tables distributed as default with PCRE. It recognizes
246 only ASCII characters. */
247
248 static const unsigned char tables0[] = {
249
250 /* This table is a lower casing table. */
251
252 0, 1, 2, 3, 4, 5, 6, 7,
253 8, 9, 10, 11, 12, 13, 14, 15,
254 16, 17, 18, 19, 20, 21, 22, 23,
255 24, 25, 26, 27, 28, 29, 30, 31,
256 32, 33, 34, 35, 36, 37, 38, 39,
257 40, 41, 42, 43, 44, 45, 46, 47,
258 48, 49, 50, 51, 52, 53, 54, 55,
259 56, 57, 58, 59, 60, 61, 62, 63,
260 64, 97, 98, 99,100,101,102,103,
261 104,105,106,107,108,109,110,111,
262 112,113,114,115,116,117,118,119,
263 120,121,122, 91, 92, 93, 94, 95,
264 96, 97, 98, 99,100,101,102,103,
265 104,105,106,107,108,109,110,111,
266 112,113,114,115,116,117,118,119,
267 120,121,122,123,124,125,126,127,
268 128,129,130,131,132,133,134,135,
269 136,137,138,139,140,141,142,143,
270 144,145,146,147,148,149,150,151,
271 152,153,154,155,156,157,158,159,
272 160,161,162,163,164,165,166,167,
273 168,169,170,171,172,173,174,175,
274 176,177,178,179,180,181,182,183,
275 184,185,186,187,188,189,190,191,
276 192,193,194,195,196,197,198,199,
277 200,201,202,203,204,205,206,207,
278 208,209,210,211,212,213,214,215,
279 216,217,218,219,220,221,222,223,
280 224,225,226,227,228,229,230,231,
281 232,233,234,235,236,237,238,239,
282 240,241,242,243,244,245,246,247,
283 248,249,250,251,252,253,254,255,
284
285 /* This table is a case flipping table. */
286
287 0, 1, 2, 3, 4, 5, 6, 7,
288 8, 9, 10, 11, 12, 13, 14, 15,
289 16, 17, 18, 19, 20, 21, 22, 23,
290 24, 25, 26, 27, 28, 29, 30, 31,
291 32, 33, 34, 35, 36, 37, 38, 39,
292 40, 41, 42, 43, 44, 45, 46, 47,
293 48, 49, 50, 51, 52, 53, 54, 55,
294 56, 57, 58, 59, 60, 61, 62, 63,
295 64, 97, 98, 99,100,101,102,103,
296 104,105,106,107,108,109,110,111,
297 112,113,114,115,116,117,118,119,
298 120,121,122, 91, 92, 93, 94, 95,
299 96, 65, 66, 67, 68, 69, 70, 71,
300 72, 73, 74, 75, 76, 77, 78, 79,
301 80, 81, 82, 83, 84, 85, 86, 87,
302 88, 89, 90,123,124,125,126,127,
303 128,129,130,131,132,133,134,135,
304 136,137,138,139,140,141,142,143,
305 144,145,146,147,148,149,150,151,
306 152,153,154,155,156,157,158,159,
307 160,161,162,163,164,165,166,167,
308 168,169,170,171,172,173,174,175,
309 176,177,178,179,180,181,182,183,
310 184,185,186,187,188,189,190,191,
311 192,193,194,195,196,197,198,199,
312 200,201,202,203,204,205,206,207,
313 208,209,210,211,212,213,214,215,
314 216,217,218,219,220,221,222,223,
315 224,225,226,227,228,229,230,231,
316 232,233,234,235,236,237,238,239,
317 240,241,242,243,244,245,246,247,
318 248,249,250,251,252,253,254,255,
319
320 /* This table contains bit maps for various character classes. Each map is 32
321 bytes long and the bits run from the least significant end of each byte. The
322 classes that have their own maps are: space, xdigit, digit, upper, lower, word,
323 graph, print, punct, and cntrl. Other classes are built from combinations. */
324
325 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
329
330 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
331 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
334
335 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
339
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
341 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
344
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
346 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
349
350 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
351 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
354
355 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff,
356 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
359
360 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,
361 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
364
365 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc,
366 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
369
370 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
374
375 /* This table identifies various classes of character by individual bits:
376 0x01 white space character
377 0x02 letter
378 0x04 decimal digit
379 0x08 hexadecimal digit
380 0x10 alphanumeric or '_'
381 0x80 regular expression metacharacter or binary zero
382 */
383
384 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
385 0x00,0x01,0x01,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
388 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
389 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
390 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
391 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
392 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
393 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
394 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
395 0x12,0x12,0x12,0x80,0x80,0x00,0x80,0x10, /* X - _ */
396 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
397 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
398 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
399 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
406 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
408 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
414 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
415 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
416
417 /* This is a set of tables that came originally from a Windows user. It seems to
418 be at least an approximation of ISO 8859. In particular, there are characters
419 greater than 128 that are marked as spaces, letters, etc. */
420
421 static const unsigned char tables1[] = {
422 0,1,2,3,4,5,6,7,
423 8,9,10,11,12,13,14,15,
424 16,17,18,19,20,21,22,23,
425 24,25,26,27,28,29,30,31,
426 32,33,34,35,36,37,38,39,
427 40,41,42,43,44,45,46,47,
428 48,49,50,51,52,53,54,55,
429 56,57,58,59,60,61,62,63,
430 64,97,98,99,100,101,102,103,
431 104,105,106,107,108,109,110,111,
432 112,113,114,115,116,117,118,119,
433 120,121,122,91,92,93,94,95,
434 96,97,98,99,100,101,102,103,
435 104,105,106,107,108,109,110,111,
436 112,113,114,115,116,117,118,119,
437 120,121,122,123,124,125,126,127,
438 128,129,130,131,132,133,134,135,
439 136,137,138,139,140,141,142,143,
440 144,145,146,147,148,149,150,151,
441 152,153,154,155,156,157,158,159,
442 160,161,162,163,164,165,166,167,
443 168,169,170,171,172,173,174,175,
444 176,177,178,179,180,181,182,183,
445 184,185,186,187,188,189,190,191,
446 224,225,226,227,228,229,230,231,
447 232,233,234,235,236,237,238,239,
448 240,241,242,243,244,245,246,215,
449 248,249,250,251,252,253,254,223,
450 224,225,226,227,228,229,230,231,
451 232,233,234,235,236,237,238,239,
452 240,241,242,243,244,245,246,247,
453 248,249,250,251,252,253,254,255,
454 0,1,2,3,4,5,6,7,
455 8,9,10,11,12,13,14,15,
456 16,17,18,19,20,21,22,23,
457 24,25,26,27,28,29,30,31,
458 32,33,34,35,36,37,38,39,
459 40,41,42,43,44,45,46,47,
460 48,49,50,51,52,53,54,55,
461 56,57,58,59,60,61,62,63,
462 64,97,98,99,100,101,102,103,
463 104,105,106,107,108,109,110,111,
464 112,113,114,115,116,117,118,119,
465 120,121,122,91,92,93,94,95,
466 96,65,66,67,68,69,70,71,
467 72,73,74,75,76,77,78,79,
468 80,81,82,83,84,85,86,87,
469 88,89,90,123,124,125,126,127,
470 128,129,130,131,132,133,134,135,
471 136,137,138,139,140,141,142,143,
472 144,145,146,147,148,149,150,151,
473 152,153,154,155,156,157,158,159,
474 160,161,162,163,164,165,166,167,
475 168,169,170,171,172,173,174,175,
476 176,177,178,179,180,181,182,183,
477 184,185,186,187,188,189,190,191,
478 224,225,226,227,228,229,230,231,
479 232,233,234,235,236,237,238,239,
480 240,241,242,243,244,245,246,215,
481 248,249,250,251,252,253,254,223,
482 192,193,194,195,196,197,198,199,
483 200,201,202,203,204,205,206,207,
484 208,209,210,211,212,213,214,247,
485 216,217,218,219,220,221,222,255,
486 0,62,0,0,1,0,0,0,
487 0,0,0,0,0,0,0,0,
488 32,0,0,0,1,0,0,0,
489 0,0,0,0,0,0,0,0,
490 0,0,0,0,0,0,255,3,
491 126,0,0,0,126,0,0,0,
492 0,0,0,0,0,0,0,0,
493 0,0,0,0,0,0,0,0,
494 0,0,0,0,0,0,255,3,
495 0,0,0,0,0,0,0,0,
496 0,0,0,0,0,0,12,2,
497 0,0,0,0,0,0,0,0,
498 0,0,0,0,0,0,0,0,
499 254,255,255,7,0,0,0,0,
500 0,0,0,0,0,0,0,0,
501 255,255,127,127,0,0,0,0,
502 0,0,0,0,0,0,0,0,
503 0,0,0,0,254,255,255,7,
504 0,0,0,0,0,4,32,4,
505 0,0,0,128,255,255,127,255,
506 0,0,0,0,0,0,255,3,
507 254,255,255,135,254,255,255,7,
508 0,0,0,0,0,4,44,6,
509 255,255,127,255,255,255,127,255,
510 0,0,0,0,254,255,255,255,
511 255,255,255,255,255,255,255,127,
512 0,0,0,0,254,255,255,255,
513 255,255,255,255,255,255,255,255,
514 0,2,0,0,255,255,255,255,
515 255,255,255,255,255,255,255,127,
516 0,0,0,0,255,255,255,255,
517 255,255,255,255,255,255,255,255,
518 0,0,0,0,254,255,0,252,
519 1,0,0,248,1,0,0,120,
520 0,0,0,0,254,255,255,255,
521 0,0,128,0,0,0,128,0,
522 255,255,255,255,0,0,0,0,
523 0,0,0,0,0,0,0,128,
524 255,255,255,255,0,0,0,0,
525 0,0,0,0,0,0,0,0,
526 128,0,0,0,0,0,0,0,
527 0,1,1,0,1,1,0,0,
528 0,0,0,0,0,0,0,0,
529 0,0,0,0,0,0,0,0,
530 1,0,0,0,128,0,0,0,
531 128,128,128,128,0,0,128,0,
532 28,28,28,28,28,28,28,28,
533 28,28,0,0,0,0,0,128,
534 0,26,26,26,26,26,26,18,
535 18,18,18,18,18,18,18,18,
536 18,18,18,18,18,18,18,18,
537 18,18,18,128,128,0,128,16,
538 0,26,26,26,26,26,26,18,
539 18,18,18,18,18,18,18,18,
540 18,18,18,18,18,18,18,18,
541 18,18,18,128,128,0,0,0,
542 0,0,0,0,0,1,0,0,
543 0,0,0,0,0,0,0,0,
544 0,0,0,0,0,0,0,0,
545 0,0,0,0,0,0,0,0,
546 1,0,0,0,0,0,0,0,
547 0,0,18,0,0,0,0,0,
548 0,0,20,20,0,18,0,0,
549 0,20,18,0,0,0,0,0,
550 18,18,18,18,18,18,18,18,
551 18,18,18,18,18,18,18,18,
552 18,18,18,18,18,18,18,0,
553 18,18,18,18,18,18,18,18,
554 18,18,18,18,18,18,18,18,
555 18,18,18,18,18,18,18,18,
556 18,18,18,18,18,18,18,0,
557 18,18,18,18,18,18,18,18
558 };
559
560
561
562
563 #ifndef HAVE_STRERROR
564 /*************************************************
565 * Provide strerror() for non-ANSI libraries *
566 *************************************************/
567
568 /* Some old-fashioned systems still around (e.g. SunOS4) don't have strerror()
569 in their libraries, but can provide the same facility by this simple
570 alternative function. */
571
572 extern int sys_nerr;
573 extern char *sys_errlist[];
574
575 char *
576 strerror(int n)
577 {
578 if (n < 0 || n >= sys_nerr) return "unknown error number";
579 return sys_errlist[n];
580 }
581 #endif /* HAVE_STRERROR */
582
583
584 /*************************************************
585 * JIT memory callback *
586 *************************************************/
587
588 static pcre_jit_stack* jit_callback(void *arg)
589 {
590 return (pcre_jit_stack *)arg;
591 }
592
593
594 /*************************************************
595 * Read or extend an input line *
596 *************************************************/
597
598 /* Input lines are read into buffer, but both patterns and data lines can be
599 continued over multiple input lines. In addition, if the buffer fills up, we
600 want to automatically expand it so as to be able to handle extremely large
601 lines that are needed for certain stress tests. When the input buffer is
602 expanded, the other two buffers must also be expanded likewise, and the
603 contents of pbuffer, which are a copy of the input for callouts, must be
604 preserved (for when expansion happens for a data line). This is not the most
605 optimal way of handling this, but hey, this is just a test program!
606
607 Arguments:
608 f the file to read
609 start where in buffer to start (this *must* be within buffer)
610 prompt for stdin or readline()
611
612 Returns: pointer to the start of new data
613 could be a copy of start, or could be moved
614 NULL if no data read and EOF reached
615 */
616
617 static pcre_uint8 *
618 extend_inputline(FILE *f, pcre_uint8 *start, const char *prompt)
619 {
620 pcre_uint8 *here = start;
621
622 for (;;)
623 {
624 int rlen = (int)(buffer_size - (here - buffer));
625
626 if (rlen > 1000)
627 {
628 int dlen;
629
630 /* If libreadline support is required, use readline() to read a line if the
631 input is a terminal. Note that readline() removes the trailing newline, so
632 we must put it back again, to be compatible with fgets(). */
633
634 #ifdef SUPPORT_LIBREADLINE
635 if (isatty(fileno(f)))
636 {
637 size_t len;
638 char *s = readline(prompt);
639 if (s == NULL) return (here == start)? NULL : start;
640 len = strlen(s);
641 if (len > 0) add_history(s);
642 if (len > rlen - 1) len = rlen - 1;
643 memcpy(here, s, len);
644 here[len] = '\n';
645 here[len+1] = 0;
646 free(s);
647 }
648 else
649 #endif
650
651 /* Read the next line by normal means, prompting if the file is stdin. */
652
653 {
654 if (f == stdin) printf("%s", prompt);
655 if (fgets((char *)here, rlen, f) == NULL)
656 return (here == start)? NULL : start;
657 }
658
659 dlen = (int)strlen((char *)here);
660 if (dlen > 0 && here[dlen - 1] == '\n') return start;
661 here += dlen;
662 }
663
664 else
665 {
666 int new_buffer_size = 2*buffer_size;
667 pcre_uint8 *new_buffer = (unsigned char *)malloc(new_buffer_size);
668 pcre_uint8 *new_dbuffer = (unsigned char *)malloc(new_buffer_size);
669 pcre_uint8 *new_pbuffer = (unsigned char *)malloc(new_buffer_size);
670
671 if (new_buffer == NULL || new_dbuffer == NULL || new_pbuffer == NULL)
672 {
673 fprintf(stderr, "pcretest: malloc(%d) failed\n", new_buffer_size);
674 exit(1);
675 }
676
677 memcpy(new_buffer, buffer, buffer_size);
678 memcpy(new_pbuffer, pbuffer, buffer_size);
679
680 buffer_size = new_buffer_size;
681
682 start = new_buffer + (start - buffer);
683 here = new_buffer + (here - buffer);
684
685 free(buffer);
686 free(dbuffer);
687 free(pbuffer);
688
689 buffer = new_buffer;
690 dbuffer = new_dbuffer;
691 pbuffer = new_pbuffer;
692 }
693 }
694
695 return NULL; /* Control never gets here */
696 }
697
698
699
700
701
702
703
704 /*************************************************
705 * Read number from string *
706 *************************************************/
707
/* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
around with conditional compilation, just do the job by hand. It is only used
for unpicking arguments, so just keep it simple.

Leading white space is skipped, then a run of decimal digits is converted. If
there are no digits the result is zero and the end pointer is left at the first
non-space character. A digit string too large for an int yields INT_MAX rather
than overflowing; all of its digits are still consumed.

Arguments:
  str        string to be converted
  endptr     where to put the end pointer

Returns:     the value as a non-negative int, saturated at INT_MAX
*/

static int
get_value(unsigned char *str, unsigned char **endptr)
{
  int result = 0;

  while (*str != 0 && isspace(*str)) str++;

  while (isdigit(*str))
  {
    int digit = (int)(*str++ - '0');

    /* result*10 + digit fits in an int exactly when this test fails. Once
    saturated, the test keeps succeeding, so the value stays at INT_MAX. */

    if (result > (INT_MAX - digit) / 10)
      result = INT_MAX;
    else
      result = result * 10 + digit;
  }

  *endptr = str;
  return result;
}
728
729
730
731
732 /*************************************************
733 * Convert UTF-8 string to value *
734 *************************************************/
735
736 /* This function takes one or more bytes that represents a UTF-8 character,
737 and returns the value of the character.
738
739 Argument:
740 utf8bytes a pointer to the byte vector
741 vptr a pointer to an int to receive the value
742
743 Returns: > 0 => the number of bytes consumed
744 -6 to 0 => malformed UTF-8 character at offset = (-return)
745 */
746
747 #if !defined NOUTF8
748
749 static int
750 utf82ord(unsigned char *utf8bytes, int *vptr)
751 {
752 int c = *utf8bytes++;
753 int d = c;
754 int i, j, s;
755
756 for (i = -1; i < 6; i++) /* i is number of additional bytes */
757 {
758 if ((d & 0x80) == 0) break;
759 d <<= 1;
760 }
761
762 if (i == -1) { *vptr = c; return 1; } /* ascii character */
763 if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
764
765 /* i now has a value in the range 1-5 */
766
767 s = 6*i;
768 d = (c & utf8_table3[i]) << s;
769
770 for (j = 0; j < i; j++)
771 {
772 c = *utf8bytes++;
773 if ((c & 0xc0) != 0x80) return -(j+1);
774 s -= 6;
775 d |= (c & 0x3f) << s;
776 }
777
778 /* Check that encoding was the correct unique one */
779
780 for (j = 0; j < utf8_table1_size; j++)
781 if (d <= utf8_table1[j]) break;
782 if (j != i) return -(i+1);
783
784 /* Valid value */
785
786 *vptr = d;
787 return i+1;
788 }
789
790 #endif
791
792
793
794 /*************************************************
795 * Convert character value to UTF-8 *
796 *************************************************/
797
798 /* This function takes an integer value in the range 0 - 0x7fffffff
799 and encodes it as a UTF-8 character in 1 to 6 bytes.
800
801 Arguments:
802 cvalue the character value
803 utf8bytes pointer to buffer for result - at least 6 bytes long
804
805 Returns: number of bytes placed in the buffer
806 */
807
808 #if !defined NOUTF8
809
810 static int
811 ord2utf8(int cvalue, pcre_uint8 *utf8bytes)
812 {
813 register int i, j;
814 for (i = 0; i < utf8_table1_size; i++)
815 if (cvalue <= utf8_table1[i]) break;
816 utf8bytes += i;
817 for (j = i; j > 0; j--)
818 {
819 *utf8bytes-- = 0x80 | (cvalue & 0x3f);
820 cvalue >>= 6;
821 }
822 *utf8bytes = utf8_table2[i] | cvalue;
823 return i + 1;
824 }
825
826 #endif
827
828
829
830 /*************************************************
831 * Print character string *
832 *************************************************/
833
834 /* Character string printing function. Must handle UTF-8 strings in utf8
835 mode. Yields number of characters printed. If handed a NULL file, just counts
836 chars without printing. */
837
838 static int pchars(unsigned char *p, int length, FILE *f)
839 {
840 int c = 0;
841 int yield = 0;
842
843 while (length-- > 0)
844 {
845 #if !defined NOUTF8
846 if (use_utf8)
847 {
848 int rc = utf82ord(p, &c);
849
850 if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
851 {
852 length -= rc - 1;
853 p += rc;
854 if (PRINTHEX(c))
855 {
856 if (f != NULL) fprintf(f, "%c", c);
857 yield++;
858 }
859 else
860 {
861 int n = 4;
862 if (f != NULL) fprintf(f, "\\x{%02x}", c);
863 yield += (n <= 0x000000ff)? 2 :
864 (n <= 0x00000fff)? 3 :
865 (n <= 0x0000ffff)? 4 :
866 (n <= 0x000fffff)? 5 : 6;
867 }
868 continue;
869 }
870 }
871 #endif
872
873 /* Not UTF-8, or malformed UTF-8 */
874
875 c = *p++;
876 if (PRINTHEX(c))
877 {
878 if (f != NULL) fprintf(f, "%c", c);
879 yield++;
880 }
881 else
882 {
883 if (f != NULL) fprintf(f, "\\x%02x", c);
884 yield += 4;
885 }
886 }
887
888 return yield;
889 }
890
891
892
893 /*************************************************
894 * Callout function *
895 *************************************************/
896
897 /* Called from PCRE as a result of the (?C) item. We print out where we are in
898 the match. Yield zero unless more callouts than the fail count, or the callout
899 data is not zero. */
900
901 static int callout(pcre_callout_block *cb)
902 {
903 FILE *f = (first_callout | callout_extra)? outfile : NULL;
904 int i, pre_start, post_start, subject_length;
905
906 if (callout_extra)
907 {
908 fprintf(f, "Callout %d: last capture = %d\n",
909 cb->callout_number, cb->capture_last);
910
911 for (i = 0; i < cb->capture_top * 2; i += 2)
912 {
913 if (cb->offset_vector[i] < 0)
914 fprintf(f, "%2d: <unset>\n", i/2);
915 else
916 {
917 fprintf(f, "%2d: ", i/2);
918 (void)pchars((unsigned char *)cb->subject + cb->offset_vector[i],
919 cb->offset_vector[i+1] - cb->offset_vector[i], f);
920 fprintf(f, "\n");
921 }
922 }
923 }
924
925 /* Re-print the subject in canonical form, the first time or if giving full
926 datails. On subsequent calls in the same match, we use pchars just to find the
927 printed lengths of the substrings. */
928
929 if (f != NULL) fprintf(f, "--->");
930
931 pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
932 post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
933 cb->current_position - cb->start_match, f);
934
935 subject_length = pchars((unsigned char *)cb->subject, cb->subject_length, NULL);
936
937 (void)pchars((unsigned char *)(cb->subject + cb->current_position),
938 cb->subject_length - cb->current_position, f);
939
940 if (f != NULL) fprintf(f, "\n");
941
942 /* Always print appropriate indicators, with callout number if not already
943 shown. For automatic callouts, show the pattern offset. */
944
945 if (cb->callout_number == 255)
946 {
947 fprintf(outfile, "%+3d ", cb->pattern_position);
948 if (cb->pattern_position > 99) fprintf(outfile, "\n ");
949 }
950 else
951 {
952 if (callout_extra) fprintf(outfile, " ");
953 else fprintf(outfile, "%3d ", cb->callout_number);
954 }
955
956 for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
957 fprintf(outfile, "^");
958
959 if (post_start > 0)
960 {
961 for (i = 0; i < post_start - 1; i++) fprintf(outfile, " ");
962 fprintf(outfile, "^");
963 }
964
965 for (i = 0; i < subject_length - pre_start - post_start + 4; i++)
966 fprintf(outfile, " ");
967
968 fprintf(outfile, "%.*s", (cb->next_item_length == 0)? 1 : cb->next_item_length,
969 pbuffer + cb->pattern_position);
970
971 fprintf(outfile, "\n");
972 first_callout = 0;
973
974 if (cb->mark != last_callout_mark)
975 {
976 fprintf(outfile, "Latest Mark: %s\n",
977 (cb->mark == NULL)? "<unset>" : (char *)(cb->mark));
978 last_callout_mark = cb->mark;
979 }
980
981 if (cb->callout_data != NULL)
982 {
983 int callout_data = *((int *)(cb->callout_data));
984 if (callout_data != 0)
985 {
986 fprintf(outfile, "Callout data = %d\n", callout_data);
987 return callout_data;
988 }
989 }
990
991 return (cb->callout_number != callout_fail_id)? 0 :
992 (++callout_count >= callout_fail_count)? 1 : 0;
993 }
994
995
996 /*************************************************
997 * Local malloc functions *
998 *************************************************/
999
1000 /* Alternative malloc function, to test functionality and save the size of a
1001 compiled re. The show_malloc variable is set only during matching. */
1002
1003 static void *new_malloc(size_t size)
1004 {
1005 void *block = malloc(size);
1006 gotten_store = size;
1007 if (show_malloc)
1008 fprintf(outfile, "malloc %3d %p\n", (int)size, block);
1009 return block;
1010 }
1011
1012 static void new_free(void *block)
1013 {
1014 if (show_malloc)
1015 fprintf(outfile, "free %p\n", block);
1016 free(block);
1017 }
1018
1019 /* For recursion malloc/free, to test stacking calls */
1020
1021 static void *stack_malloc(size_t size)
1022 {
1023 void *block = malloc(size);
1024 if (show_malloc)
1025 fprintf(outfile, "stack_malloc %3d %p\n", (int)size, block);
1026 return block;
1027 }
1028
1029 static void stack_free(void *block)
1030 {
1031 if (show_malloc)
1032 fprintf(outfile, "stack_free %p\n", block);
1033 free(block);
1034 }
1035
1036
1037 /*************************************************
1038 * Call pcre_fullinfo() *
1039 *************************************************/
1040
1041 /* Get one piece of information from the pcre_fullinfo() function */
1042
1043 static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
1044 {
1045 int rc;
1046 if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
1047 fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
1048 }
1049
1050
1051
1052 /*************************************************
1053 * Byte flipping function *
1054 *************************************************/
1055
/* Reverse the byte order of a 2-byte or 4-byte quantity.

Arguments:
  value       the value to flip; only the low 16 (n == 2) or low 32 bits
                (any other n) take part
  n           size of the quantity in bytes; 2 selects the 16-bit swap,
                every other value selects the 32-bit swap

Returns:      the byte-reversed value
*/

static unsigned long int
byteflip(unsigned long int value, int n)
{
unsigned long int b0 = value & 0xff;          /* least significant byte */
unsigned long int b1 = (value >> 8) & 0xff;
unsigned long int b2, b3;

if (n == 2) return (b0 << 8) | b1;

b2 = (value >> 16) & 0xff;
b3 = (value >> 24) & 0xff;                    /* most significant byte */

return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
}
1065
1066
1067
1068
1069 /*************************************************
1070 * Check match or recursion limit *
1071 *************************************************/
1072
1073 static int
1074 check_match_limit(pcre *re, pcre_extra *extra, pcre_uint8 *bptr, int len,
1075 int start_offset, int options, int *use_offsets, int use_size_offsets,
1076 int flag, unsigned long int *limit, int errnumber, const char *msg)
1077 {
1078 int count;
1079 int min = 0;
1080 int mid = 64;
1081 int max = -1;
1082
1083 extra->flags |= flag;
1084
1085 for (;;)
1086 {
1087 *limit = mid;
1088
1089 count = pcre_exec(re, extra, (char *)bptr, len, start_offset, options,
1090 use_offsets, use_size_offsets);
1091
1092 if (count == errnumber)
1093 {
1094 /* fprintf(outfile, "Testing %s limit = %d\n", msg, mid); */
1095 min = mid;
1096 mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
1097 }
1098
1099 else if (count >= 0 || count == PCRE_ERROR_NOMATCH ||
1100 count == PCRE_ERROR_PARTIAL)
1101 {
1102 if (mid == min + 1)
1103 {
1104 fprintf(outfile, "Minimum %s limit = %d\n", msg, mid);
1105 break;
1106 }
1107 /* fprintf(outfile, "Testing %s limit = %d\n", msg, mid); */
1108 max = mid;
1109 mid = (min + mid)/2;
1110 }
1111 else break; /* Some other error */
1112 }
1113
1114 extra->flags &= ~flag;
1115 return count;
1116 }
1117
1118
1119
1120 /*************************************************
1121 * Case-independent strncmp() function *
1122 *************************************************/
1123
1124 /*
1125 Arguments:
1126 s first string
1127 t second string
1128 n number of characters to compare
1129
1130 Returns: < 0, = 0, or > 0, according to the comparison
1131 */
1132
1133 static int
1134 strncmpic(pcre_uint8 *s, pcre_uint8 *t, int n)
1135 {
1136 while (n--)
1137 {
1138 int c = tolower(*s++) - tolower(*t++);
1139 if (c) return c;
1140 }
1141 return 0;
1142 }
1143
1144
1145
1146 /*************************************************
1147 * Check newline indicator *
1148 *************************************************/
1149
1150 /* This is used both at compile and run-time to check for <xxx> escapes. Print
1151 a message and return 0 if there is no match.
1152
1153 Arguments:
1154 p points after the leading '<'
1155 f file for error message
1156
1157 Returns: appropriate PCRE_NEWLINE_xxx flags, or 0
1158 */
1159
1160 static int
1161 check_newline(pcre_uint8 *p, FILE *f)
1162 {
1163 if (strncmpic(p, (pcre_uint8 *)"cr>", 3) == 0) return PCRE_NEWLINE_CR;
1164 if (strncmpic(p, (pcre_uint8 *)"lf>", 3) == 0) return PCRE_NEWLINE_LF;
1165 if (strncmpic(p, (pcre_uint8 *)"crlf>", 5) == 0) return PCRE_NEWLINE_CRLF;
1166 if (strncmpic(p, (pcre_uint8 *)"anycrlf>", 8) == 0) return PCRE_NEWLINE_ANYCRLF;
1167 if (strncmpic(p, (pcre_uint8 *)"any>", 4) == 0) return PCRE_NEWLINE_ANY;
1168 if (strncmpic(p, (pcre_uint8 *)"bsr_anycrlf>", 12) == 0) return PCRE_BSR_ANYCRLF;
1169 if (strncmpic(p, (pcre_uint8 *)"bsr_unicode>", 12) == 0) return PCRE_BSR_UNICODE;
1170 fprintf(f, "Unknown newline type at: <%s\n", p);
1171 return 0;
1172 }
1173
1174
1175
1176 /*************************************************
1177 * Usage function *
1178 *************************************************/
1179
/* Write the command-line help text to stdout. Which lines appear depends on
the build: the readline note on SUPPORT_LIBREADLINE, the -dfa line on NODFA,
and the -p line on NOPOSIX. */

static void
usage(void)
{
static const char *const text[] = {
  "Usage: pcretest [options] [<input file> [<output file>]]\n\n",
  "Input and output default to stdin and stdout.\n",
#ifdef SUPPORT_LIBREADLINE
  "If input is a terminal, readline() is used to read from it.\n",
#else
  "This version of pcretest is not linked with readline().\n",
#endif
  "\nOptions:\n",
  " -b show compiled code (bytecode)\n",
  " -C show PCRE compile-time options and exit\n",
  " -d debug: show compiled code and information (-b and -i)\n",
#if !defined NODFA
  " -dfa force DFA matching for all subjects\n",
#endif
  " -help show usage information\n",
  " -i show information about compiled patterns\n",
  " -M find MATCH_LIMIT minimum for each subject\n",
  " -m output memory used information\n",
  " -o <n> set size of offsets vector to <n>\n",
#if !defined NOPOSIX
  " -p use POSIX interface\n",
#endif
  " -q quiet: do not output PCRE version number at start\n",
  " -S <n> set stack size to <n> megabytes\n",
  " -s force each pattern to be studied at basic level\n",
  " -s+ force each pattern to be studied, using JIT if available\n",
  " -t time compilation and execution\n",
  " -t <n> time compilation and execution, repeating <n> times\n",
  " -tm time execution (matching) only\n",
  " -tm <n> time execution (matching) only, repeating <n> times\n"
};
size_t i;

for (i = 0; i < sizeof(text)/sizeof(text[0]); i++) fputs(text[i], stdout);
}
1214
1215
1216
1217 /*************************************************
1218 * Main Program *
1219 *************************************************/
1220
1221 /* Read lines from named file or stdin and write to named file or stdout; lines
1222 consist of a regular expression, in delimiters and optionally followed by
1223 options, followed by a set of test data, terminated by an empty line. */
1224
1225 int main(int argc, char **argv)
1226 {
1227 FILE *infile = stdin;
1228 int options = 0;
1229 int study_options = 0;
1230 int default_find_match_limit = FALSE;
1231 int op = 1;
1232 int timeit = 0;
1233 int timeitm = 0;
1234 int showinfo = 0;
1235 int showstore = 0;
1236 int force_study = -1;
1237 int force_study_options = 0;
1238 int quiet = 0;
1239 int size_offsets = 45;
1240 int size_offsets_max;
1241 int *offsets = NULL;
1242 #if !defined NOPOSIX
1243 int posix = 0;
1244 #endif
1245 int debug = 0;
1246 int done = 0;
1247 int all_use_dfa = 0;
1248 int yield = 0;
1249 int stack_size;
1250
1251 pcre_jit_stack *jit_stack = NULL;
1252
1253
1254 /* These vectors store, end-to-end, a list of captured substring names. Assume
1255 that 1024 is plenty long enough for the few names we'll be testing. */
1256
1257 pcre_uchar copynames[1024];
1258 pcre_uchar getnames[1024];
1259
1260 pcre_uchar *copynamesptr;
1261 pcre_uchar *getnamesptr;
1262
1263 /* Get buffers from malloc() so that Electric Fence will check their misuse
1264 when I am debugging. They grow automatically when very long lines are read. */
1265
1266 buffer = (pcre_uint8 *)malloc(buffer_size);
1267 dbuffer = (pcre_uint8 *)malloc(buffer_size);
1268 pbuffer = (pcre_uint8 *)malloc(buffer_size);
1269
1270 /* The outfile variable is static so that new_malloc can use it. */
1271
1272 outfile = stdout;
1273
1274 /* The following _setmode() stuff is some Windows magic that tells its runtime
1275 library to translate CRLF into a single LF character. At least, that's what
1276 I've been told: never having used Windows I take this all on trust. Originally
1277 it set 0x8000, but then I was advised that _O_BINARY was better. */
1278
1279 #if defined(_WIN32) || defined(WIN32)
1280 _setmode( _fileno( stdout ), _O_BINARY );
1281 #endif
1282
1283 /* Scan options */
1284
1285 while (argc > 1 && argv[op][0] == '-')
1286 {
1287 unsigned char *endptr;
1288
1289 if (strcmp(argv[op], "-m") == 0) showstore = 1;
1290 else if (strcmp(argv[op], "-s") == 0) force_study = 0;
1291 else if (strcmp(argv[op], "-s+") == 0)
1292 {
1293 force_study = 1;
1294 force_study_options = PCRE_STUDY_JIT_COMPILE;
1295 }
1296 else if (strcmp(argv[op], "-q") == 0) quiet = 1;
1297 else if (strcmp(argv[op], "-b") == 0) debug = 1;
1298 else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
1299 else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
1300 else if (strcmp(argv[op], "-M") == 0) default_find_match_limit = TRUE;
1301 #if !defined NODFA
1302 else if (strcmp(argv[op], "-dfa") == 0) all_use_dfa = 1;
1303 #endif
1304 else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
1305 ((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)),
1306 *endptr == 0))
1307 {
1308 op++;
1309 argc--;
1310 }
1311 else if (strcmp(argv[op], "-t") == 0 || strcmp(argv[op], "-tm") == 0)
1312 {
1313 int both = argv[op][2] == 0;
1314 int temp;
1315 if (argc > 2 && (temp = get_value((unsigned char *)argv[op+1], &endptr),
1316 *endptr == 0))
1317 {
1318 timeitm = temp;
1319 op++;
1320 argc--;
1321 }
1322 else timeitm = LOOPREPEAT;
1323 if (both) timeit = timeitm;
1324 }
1325 else if (strcmp(argv[op], "-S") == 0 && argc > 2 &&
1326 ((stack_size = get_value((unsigned char *)argv[op+1], &endptr)),
1327 *endptr == 0))
1328 {
1329 #if defined(_WIN32) || defined(WIN32) || defined(__minix)
1330 printf("PCRE: -S not supported on this OS\n");
1331 exit(1);
1332 #else
1333 int rc;
1334 struct rlimit rlim;
1335 getrlimit(RLIMIT_STACK, &rlim);
1336 rlim.rlim_cur = stack_size * 1024 * 1024;
1337 rc = setrlimit(RLIMIT_STACK, &rlim);
1338 if (rc != 0)
1339 {
1340 printf("PCRE: setrlimit() failed with error %d\n", rc);
1341 exit(1);
1342 }
1343 op++;
1344 argc--;
1345 #endif
1346 }
1347 #if !defined NOPOSIX
1348 else if (strcmp(argv[op], "-p") == 0) posix = 1;
1349 #endif
1350 else if (strcmp(argv[op], "-C") == 0)
1351 {
1352 int rc;
1353 unsigned long int lrc;
1354 printf("PCRE version %s\n", pcre_version());
1355 printf("Compiled with\n");
1356 (void)pcre_config(PCRE_CONFIG_UTF8, &rc);
1357 printf(" %sUTF-8 support\n", rc? "" : "No ");
1358 (void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc);
1359 printf(" %sUnicode properties support\n", rc? "" : "No ");
1360 (void)pcre_config(PCRE_CONFIG_JIT, &rc);
1361 if (rc)
1362 printf(" Just-in-time compiler support\n");
1363 else
1364 printf(" No just-in-time compiler support\n");
1365 (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
1366 /* Note that these values are always the ASCII values, even
1367 in EBCDIC environments. CR is 13 and NL is 10. */
1368 printf(" Newline sequence is %s\n", (rc == 13)? "CR" :
1369 (rc == 10)? "LF" : (rc == (13<<8 | 10))? "CRLF" :
1370 (rc == -2)? "ANYCRLF" :
1371 (rc == -1)? "ANY" : "???");
1372 (void)pcre_config(PCRE_CONFIG_BSR, &rc);
1373 printf(" \\R matches %s\n", rc? "CR, LF, or CRLF only" :
1374 "all Unicode newlines");
1375 (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
1376 printf(" Internal link size = %d\n", rc);
1377 (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
1378 printf(" POSIX malloc threshold = %d\n", rc);
1379 (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT, &lrc);
1380 printf(" Default match limit = %ld\n", lrc);
1381 (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT_RECURSION, &lrc);
1382 printf(" Default recursion depth limit = %ld\n", lrc);
1383 (void)pcre_config(PCRE_CONFIG_STACKRECURSE, &rc);
1384 printf(" Match recursion uses %s\n", rc? "stack" : "heap");
1385 goto EXIT;
1386 }
1387 else if (strcmp(argv[op], "-help") == 0 ||
1388 strcmp(argv[op], "--help") == 0)
1389 {
1390 usage();
1391 goto EXIT;
1392 }
1393 else
1394 {
1395 printf("** Unknown or malformed option %s\n", argv[op]);
1396 usage();
1397 yield = 1;
1398 goto EXIT;
1399 }
1400 op++;
1401 argc--;
1402 }
1403
1404 /* Get the store for the offsets vector, and remember what it was */
1405
1406 size_offsets_max = size_offsets;
1407 offsets = (int *)malloc(size_offsets_max * sizeof(int));
1408 if (offsets == NULL)
1409 {
1410 printf("** Failed to get %d bytes of memory for offsets vector\n",
1411 (int)(size_offsets_max * sizeof(int)));
1412 yield = 1;
1413 goto EXIT;
1414 }
1415
1416 /* Sort out the input and output files */
1417
1418 if (argc > 1)
1419 {
1420 infile = fopen(argv[op], INPUT_MODE);
1421 if (infile == NULL)
1422 {
1423 printf("** Failed to open %s\n", argv[op]);
1424 yield = 1;
1425 goto EXIT;
1426 }
1427 }
1428
1429 if (argc > 2)
1430 {
1431 outfile = fopen(argv[op+1], OUTPUT_MODE);
1432 if (outfile == NULL)
1433 {
1434 printf("** Failed to open %s\n", argv[op+1]);
1435 yield = 1;
1436 goto EXIT;
1437 }
1438 }
1439
1440 /* Set alternative malloc function */
1441
1442 pcre_malloc = new_malloc;
1443 pcre_free = new_free;
1444 pcre_stack_malloc = stack_malloc;
1445 pcre_stack_free = stack_free;
1446
1447 /* Heading line unless quiet, then prompt for first regex if stdin */
1448
1449 if (!quiet) fprintf(outfile, "PCRE version %s\n\n", pcre_version());
1450
1451 /* Main loop */
1452
1453 while (!done)
1454 {
1455 pcre *re = NULL;
1456 pcre_extra *extra = NULL;
1457
1458 #if !defined NOPOSIX /* There are still compilers that require no indent */
1459 regex_t preg;
1460 int do_posix = 0;
1461 #endif
1462
1463 const char *error;
1464 unsigned char *markptr;
1465 unsigned char *p, *pp, *ppp;
1466 unsigned char *to_file = NULL;
1467 const unsigned char *tables = NULL;
1468 unsigned long int true_size, true_study_size = 0;
1469 size_t size, regex_gotten_store;
1470 int do_allcaps = 0;
1471 int do_mark = 0;
1472 int do_study = 0;
1473 int no_force_study = 0;
1474 int do_debug = debug;
1475 int do_G = 0;
1476 int do_g = 0;
1477 int do_showinfo = showinfo;
1478 int do_showrest = 0;
1479 int do_showcaprest = 0;
1480 int do_flip = 0;
1481 int erroroffset, len, delimiter, poffset;
1482
1483 use_utf8 = 0;
1484 debug_lengths = 1;
1485
1486 if (extend_inputline(infile, buffer, " re> ") == NULL) break;
1487 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
1488 fflush(outfile);
1489
1490 p = buffer;
1491 while (isspace(*p)) p++;
1492 if (*p == 0) continue;
1493
1494 /* See if the pattern is to be loaded pre-compiled from a file. */
1495
1496 if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
1497 {
1498 unsigned long int magic, get_options;
1499 pcre_uint8 sbuf[8];
1500 FILE *f;
1501
1502 p++;
1503 pp = p + (int)strlen((char *)p);
1504 while (isspace(pp[-1])) pp--;
1505 *pp = 0;
1506
1507 f = fopen((char *)p, "rb");
1508 if (f == NULL)
1509 {
1510 fprintf(outfile, "Failed to open %s: %s\n", p, strerror(errno));
1511 continue;
1512 }
1513
1514 if (fread(sbuf, 1, 8, f) != 8) goto FAIL_READ;
1515
1516 true_size =
1517 (sbuf[0] << 24) | (sbuf[1] << 16) | (sbuf[2] << 8) | sbuf[3];
1518 true_study_size =
1519 (sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
1520
1521 re = (real_pcre *)new_malloc(true_size);
1522 regex_gotten_store = gotten_store;
1523
1524 if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
1525
1526 magic = ((real_pcre *)re)->magic_number;
1527 if (magic != MAGIC_NUMBER)
1528 {
1529 if (byteflip(magic, sizeof(magic)) == MAGIC_NUMBER)
1530 {
1531 do_flip = 1;
1532 }
1533 else
1534 {
1535 fprintf(outfile, "Data in %s is not a compiled PCRE regex\n", p);
1536 fclose(f);
1537 continue;
1538 }
1539 }
1540
1541 fprintf(outfile, "Compiled pattern%s loaded from %s\n",
1542 do_flip? " (byte-inverted)" : "", p);
1543
1544 /* Need to know if UTF-8 for printing data strings */
1545
1546 new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
1547 use_utf8 = (get_options & PCRE_UTF8) != 0;
1548
1549 /* Now see if there is any following study data. */
1550
1551 if (true_study_size != 0)
1552 {
1553 pcre_study_data *psd;
1554
1555 extra = (pcre_extra *)new_malloc(sizeof(pcre_extra) + true_study_size);
1556 extra->flags = PCRE_EXTRA_STUDY_DATA;
1557
1558 psd = (pcre_study_data *)(((char *)extra) + sizeof(pcre_extra));
1559 extra->study_data = psd;
1560
1561 if (fread(psd, 1, true_study_size, f) != true_study_size)
1562 {
1563 FAIL_READ:
1564 fprintf(outfile, "Failed to read data from %s\n", p);
1565 if (extra != NULL) pcre_free_study(extra);
1566 if (re != NULL) new_free(re);
1567 fclose(f);
1568 continue;
1569 }
1570 fprintf(outfile, "Study data loaded from %s\n", p);
1571 do_study = 1; /* To get the data output if requested */
1572 }
1573 else fprintf(outfile, "No study data\n");
1574
1575 fclose(f);
1576 goto SHOW_INFO;
1577 }
1578
1579 /* In-line pattern (the usual case). Get the delimiter and seek the end of
1580 the pattern; if it isn't complete, read more. */
1581
1582 delimiter = *p++;
1583
1584 if (isalnum(delimiter) || delimiter == '\\')
1585 {
1586 fprintf(outfile, "** Delimiter must not be alphanumeric or \\\n");
1587 goto SKIP_DATA;
1588 }
1589
1590 pp = p;
1591 poffset = (int)(p - buffer);
1592
1593 for(;;)
1594 {
1595 while (*pp != 0)
1596 {
1597 if (*pp == '\\' && pp[1] != 0) pp++;
1598 else if (*pp == delimiter) break;
1599 pp++;
1600 }
1601 if (*pp != 0) break;
1602 if ((pp = extend_inputline(infile, pp, " > ")) == NULL)
1603 {
1604 fprintf(outfile, "** Unexpected EOF\n");
1605 done = 1;
1606 goto CONTINUE;
1607 }
1608 if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
1609 }
1610
1611 /* The buffer may have moved while being extended; reset the start of data
1612 pointer to the correct relative point in the buffer. */
1613
1614 p = buffer + poffset;
1615
1616 /* If the first character after the delimiter is backslash, make
1617 the pattern end with backslash. This is purely to provide a way
1618 of testing for the error message when a pattern ends with backslash. */
1619
1620 if (pp[1] == '\\') *pp++ = '\\';
1621
1622 /* Terminate the pattern at the delimiter, and save a copy of the pattern
1623 for callouts. */
1624
1625 *pp++ = 0;
1626 strcpy((char *)pbuffer, (char *)p);
1627
1628 /* Look for options after final delimiter */
1629
1630 options = 0;
1631 log_store = showstore; /* default from command line */
1632
1633 while (*pp != 0)
1634 {
1635 switch (*pp++)
1636 {
1637 case 'f': options |= PCRE_FIRSTLINE; break;
1638 case 'g': do_g = 1; break;
1639 case 'i': options |= PCRE_CASELESS; break;
1640 case 'm': options |= PCRE_MULTILINE; break;
1641 case 's': options |= PCRE_DOTALL; break;
1642 case 'x': options |= PCRE_EXTENDED; break;
1643
1644 case '+':
1645 if (do_showrest) do_showcaprest = 1; else do_showrest = 1;
1646 break;
1647
1648 case '=': do_allcaps = 1; break;
1649 case 'A': options |= PCRE_ANCHORED; break;
1650 case 'B': do_debug = 1; break;
1651 case 'C': options |= PCRE_AUTO_CALLOUT; break;
1652 case 'D': do_debug = do_showinfo = 1; break;
1653 case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
1654 case 'F': do_flip = 1; break;
1655 case 'G': do_G = 1; break;
1656 case 'I': do_showinfo = 1; break;
1657 case 'J': options |= PCRE_DUPNAMES; break;
1658 case 'K': do_mark = 1; break;
1659 case 'M': log_store = 1; break;
1660 case 'N': options |= PCRE_NO_AUTO_CAPTURE; break;
1661
1662 #if !defined NOPOSIX
1663 case 'P': do_posix = 1; break;
1664 #endif
1665
1666 case 'S':
1667 if (do_study == 0)
1668 {
1669 do_study = 1;
1670 if (*pp == '+')
1671 {
1672 study_options |= PCRE_STUDY_JIT_COMPILE;
1673 pp++;
1674 }
1675 }
1676 else
1677 {
1678 do_study = 0;
1679 no_force_study = 1;
1680 }
1681 break;
1682
1683 case 'U': options |= PCRE_UNGREEDY; break;
1684 case 'W': options |= PCRE_UCP; break;
1685 case 'X': options |= PCRE_EXTRA; break;
1686 case 'Y': options |= PCRE_NO_START_OPTIMISE; break;
1687 case 'Z': debug_lengths = 0; break;
1688 case '8': options |= PCRE_UTF8; use_utf8 = 1; break;
1689 case '?': options |= PCRE_NO_UTF8_CHECK; break;
1690
1691 case 'T':
1692 switch (*pp++)
1693 {
1694 case '0': tables = tables0; break;
1695 case '1': tables = tables1; break;
1696
1697 case '\r':
1698 case '\n':
1699 case ' ':
1700 case 0:
1701 fprintf(outfile, "** Missing table number after /T\n");
1702 goto SKIP_DATA;
1703
1704 default:
1705 fprintf(outfile, "** Bad table number \"%c\" after /T\n", pp[-1]);
1706 goto SKIP_DATA;
1707 }
1708 break;
1709
1710 case 'L':
1711 ppp = pp;
1712 /* The '\r' test here is so that it works on Windows. */
1713 /* The '0' test is just in case this is an unterminated line. */
1714 while (*ppp != 0 && *ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++;
1715 *ppp = 0;
1716 if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
1717 {
1718 fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
1719 goto SKIP_DATA;
1720 }
1721 locale_set = 1;
1722 tables = pcre_maketables();
1723 pp = ppp;
1724 break;
1725
1726 case '>':
1727 to_file = pp;
1728 while (*pp != 0) pp++;
1729 while (isspace(pp[-1])) pp--;
1730 *pp = 0;
1731 break;
1732
1733 case '<':
1734 {
1735 if (strncmpic(pp, (pcre_uint8 *)"JS>", 3) == 0)
1736 {
1737 options |= PCRE_JAVASCRIPT_COMPAT;
1738 pp += 3;
1739 }
1740 else
1741 {
1742 int x = check_newline(pp, outfile);
1743 if (x == 0) goto SKIP_DATA;
1744 options |= x;
1745 while (*pp++ != '>');
1746 }
1747 }
1748 break;
1749
1750 case '\r': /* So that it works in Windows */
1751 case '\n':
1752 case ' ':
1753 break;
1754
1755 default:
1756 fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
1757 goto SKIP_DATA;
1758 }
1759 }
1760
1761 /* Handle compiling via the POSIX interface, which doesn't support the
1762 timing, showing, or debugging options, nor the ability to pass over
1763 local character tables. */
1764
1765 #if !defined NOPOSIX
1766 if (posix || do_posix)
1767 {
1768 int rc;
1769 int cflags = 0;
1770
1771 if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
1772 if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
1773 if ((options & PCRE_DOTALL) != 0) cflags |= REG_DOTALL;
1774 if ((options & PCRE_NO_AUTO_CAPTURE) != 0) cflags |= REG_NOSUB;
1775 if ((options & PCRE_UTF8) != 0) cflags |= REG_UTF8;
1776 if ((options & PCRE_UCP) != 0) cflags |= REG_UCP;
1777 if ((options & PCRE_UNGREEDY) != 0) cflags |= REG_UNGREEDY;
1778
1779 rc = regcomp(&preg, (char *)p, cflags);
1780
1781 /* Compilation failed; go back for another re, skipping to blank line
1782 if non-interactive. */
1783
1784 if (rc != 0)
1785 {
1786 (void)regerror(rc, &preg, (char *)buffer, buffer_size);
1787 fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
1788 goto SKIP_DATA;
1789 }
1790 }
1791
1792 /* Handle compiling via the native interface */
1793
1794 else
1795 #endif /* !defined NOPOSIX */
1796
1797 {
1798 unsigned long int get_options;
1799
1800 if (timeit > 0)
1801 {
1802 register int i;
1803 clock_t time_taken;
1804 clock_t start_time = clock();
1805 for (i = 0; i < timeit; i++)
1806 {
1807 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
1808 if (re != NULL) free(re);
1809 }
1810 time_taken = clock() - start_time;
1811 fprintf(outfile, "Compile time %.4f milliseconds\n",
1812 (((double)time_taken * 1000.0) / (double)timeit) /
1813 (double)CLOCKS_PER_SEC);
1814 }
1815
1816 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
1817
1818 /* Compilation failed; go back for another re, skipping to blank line
1819 if non-interactive. */
1820
1821 if (re == NULL)
1822 {
1823 fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
1824 SKIP_DATA:
1825 if (infile != stdin)
1826 {
1827 for (;;)
1828 {
1829 if (extend_inputline(infile, buffer, NULL) == NULL)
1830 {
1831 done = 1;
1832 goto CONTINUE;
1833 }
1834 len = (int)strlen((char *)buffer);
1835 while (len > 0 && isspace(buffer[len-1])) len--;
1836 if (len == 0) break;
1837 }
1838 fprintf(outfile, "\n");
1839 }
1840 goto CONTINUE;
1841 }
1842
1843 /* Compilation succeeded. It is now possible to set the UTF-8 option from
1844 within the regex; check for this so that we know how to process the data
1845 lines. */
1846
1847 new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
1848 if ((get_options & PCRE_UTF8) != 0) use_utf8 = 1;
1849
1850 /* Print information if required. There are now two info-returning
1851 functions. The old one has a limited interface and returns only limited
1852 data. Check that it agrees with the newer one. */
1853
1854 if (log_store)
1855 fprintf(outfile, "Memory allocation (code space): %d\n",
1856 (int)(gotten_store -
1857 sizeof(real_pcre) -
1858 ((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
1859
1860 /* Extract the size for possible writing before possibly flipping it,
1861 and remember the store that was got. */
1862
1863 true_size = ((real_pcre *)re)->size;
1864 regex_gotten_store = gotten_store;
1865
1866 /* If -s or /S was present, study the regex to generate additional info to
1867 help with the matching, unless the pattern has the SS option, which
1868 suppresses the effect of /S (used for a few test patterns where studying is
1869 never sensible). */
1870
1871 if (do_study || (force_study >= 0 && !no_force_study))
1872 {
1873 if (timeit > 0)
1874 {
1875 register int i;
1876 clock_t time_taken;
1877 clock_t start_time = clock();
1878 for (i = 0; i < timeit; i++)
1879 extra = pcre_study(re, study_options | force_study_options, &error);
1880 time_taken = clock() - start_time;
1881 if (extra != NULL) pcre_free_study(extra);
1882 fprintf(outfile, " Study time %.4f milliseconds\n",
1883 (((double)time_taken * 1000.0) / (double)timeit) /
1884 (double)CLOCKS_PER_SEC);
1885 }
1886 extra = pcre_study(re, study_options | force_study_options, &error);
1887 if (error != NULL)
1888 fprintf(outfile, "Failed to study: %s\n", error);
1889 else if (extra != NULL)
1890 true_study_size = ((pcre_study_data *)(extra->study_data))->size;
1891 }
1892
1893 /* If /K was present, we set up for handling MARK data. */
1894
1895 if (do_mark)
1896 {
1897 if (extra == NULL)
1898 {
1899 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
1900 extra->flags = 0;
1901 }
1902 extra->mark = &markptr;
1903 extra->flags |= PCRE_EXTRA_MARK;
1904 }
1905
1906 /* If the 'F' option was present, we flip the bytes of all the integer
1907 fields in the regex data block and the study block. This is to make it
1908 possible to test PCRE's handling of byte-flipped patterns, e.g. those
1909 compiled on a different architecture. */
1910
1911 if (do_flip)
1912 {
1913 real_pcre *rre = (real_pcre *)re;
1914 rre->magic_number =
1915 byteflip(rre->magic_number, sizeof(rre->magic_number));
1916 rre->size = byteflip(rre->size, sizeof(rre->size));
1917 rre->options = byteflip(rre->options, sizeof(rre->options));
1918 rre->flags = (pcre_uint16)byteflip(rre->flags, sizeof(rre->flags));
1919 rre->top_bracket =
1920 (pcre_uint16)byteflip(rre->top_bracket, sizeof(rre->top_bracket));
1921 rre->top_backref =
1922 (pcre_uint16)byteflip(rre->top_backref, sizeof(rre->top_backref));
1923 rre->first_char =
1924 (pcre_uint16)byteflip(rre->first_char, sizeof(rre->first_char));
1925 rre->req_char =
1926 (pcre_uint16)byteflip(rre->req_char, sizeof(rre->req_char));
1927 rre->name_table_offset = (pcre_uint16)byteflip(rre->name_table_offset,
1928 sizeof(rre->name_table_offset));
1929 rre->name_entry_size = (pcre_uint16)byteflip(rre->name_entry_size,
1930 sizeof(rre->name_entry_size));
1931 rre->name_count = (pcre_uint16)byteflip(rre->name_count,
1932 sizeof(rre->name_count));
1933
1934 if (extra != NULL)
1935 {
1936 pcre_study_data *rsd = (pcre_study_data *)(extra->study_data);
1937 rsd->size = byteflip(rsd->size, sizeof(rsd->size));
1938 rsd->flags = byteflip(rsd->flags, sizeof(rsd->flags));
1939 rsd->minlength = byteflip(rsd->minlength, sizeof(rsd->minlength));
1940 }
1941 }
1942
1943 /* Extract information from the compiled data if required */
1944
1945 SHOW_INFO:
1946
1947 if (do_debug)
1948 {
1949 fprintf(outfile, "------------------------------------------------------------------\n");
1950 pcre_printint(re, outfile, debug_lengths);
1951 }
1952
1953 /* We already have the options in get_options (see above) */
1954
1955 if (do_showinfo)
1956 {
1957 unsigned long int all_options;
1958 #if !defined NOINFOCHECK
1959 int old_first_char, old_options, old_count;
1960 #endif
1961 int count, backrefmax, first_char, need_char, okpartial, jchanged,
1962 hascrorlf;
1963 int nameentrysize, namecount;
1964 const pcre_uchar *nametable;
1965
1966 new_info(re, NULL, PCRE_INFO_SIZE, &size);
1967 new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
1968 new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
1969 new_info(re, NULL, PCRE_INFO_FIRSTBYTE, &first_char);
1970 new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
1971 new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
1972 new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
1973 new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
1974 new_info(re, NULL, PCRE_INFO_OKPARTIAL, &okpartial);
1975 new_info(re, NULL, PCRE_INFO_JCHANGED, &jchanged);
1976 new_info(re, NULL, PCRE_INFO_HASCRORLF, &hascrorlf);
1977
1978 #if !defined NOINFOCHECK
1979 old_count = pcre_info(re, &old_options, &old_first_char);
1980 if (count < 0) fprintf(outfile,
1981 "Error %d from pcre_info()\n", count);
1982 else
1983 {
1984 if (old_count != count) fprintf(outfile,
1985 "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
1986 old_count);
1987
1988 if (old_first_char != first_char) fprintf(outfile,
1989 "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
1990 first_char, old_first_char);
1991
1992 if (old_options != (int)get_options) fprintf(outfile,
1993 "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
1994 get_options, old_options);
1995 }
1996 #endif
1997
1998 if (size != regex_gotten_store) fprintf(outfile,
1999 "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
2000 (int)size, (int)regex_gotten_store);
2001
2002 fprintf(outfile, "Capturing subpattern count = %d\n", count);
2003 if (backrefmax > 0)
2004 fprintf(outfile, "Max back reference = %d\n", backrefmax);
2005
2006 if (namecount > 0)
2007 {
2008 fprintf(outfile, "Named capturing subpatterns:\n");
2009 while (namecount-- > 0)
2010 {
2011 fprintf(outfile, " %s %*s%3d\n", nametable + 2,
2012 nameentrysize - 3 - (int)strlen((char *)nametable + 2), "",
2013 GET2(nametable, 0));
2014 nametable += nameentrysize;
2015 }
2016 }
2017
2018 if (!okpartial) fprintf(outfile, "Partial matching not supported\n");
2019 if (hascrorlf) fprintf(outfile, "Contains explicit CR or LF match\n");
2020
2021 all_options = ((real_pcre *)re)->options;
2022 if (do_flip) all_options = byteflip(all_options, sizeof(all_options));
2023
2024 if (get_options == 0) fprintf(outfile, "No options\n");
2025 else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
2026 ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
2027 ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
2028 ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
2029 ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
2030 ((get_options & PCRE_FIRSTLINE) != 0)? " firstline" : "",
2031 ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
2032 ((get_options & PCRE_BSR_ANYCRLF) != 0)? " bsr_anycrlf" : "",
2033 ((get_options & PCRE_BSR_UNICODE) != 0)? " bsr_unicode" : "",
2034 ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
2035 ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
2036 ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
2037 ((get_options & PCRE_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "",
2038 ((get_options & PCRE_UTF8) != 0)? " utf8" : "",
2039 ((get_options & PCRE_UCP) != 0)? " ucp" : "",
2040 ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",
2041 ((get_options & PCRE_NO_START_OPTIMIZE) != 0)? " no_start_optimize" : "",
2042 ((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");
2043
2044 if (jchanged) fprintf(outfile, "Duplicate name status changes\n");
2045
2046 switch (get_options & PCRE_NEWLINE_BITS)
2047 {
2048 case PCRE_NEWLINE_CR:
2049 fprintf(outfile, "Forced newline sequence: CR\n");
2050 break;
2051
2052 case PCRE_NEWLINE_LF:
2053 fprintf(outfile, "Forced newline sequence: LF\n");
2054 break;
2055
2056 case PCRE_NEWLINE_CRLF:
2057 fprintf(outfile, "Forced newline sequence: CRLF\n");
2058 break;
2059
2060 case PCRE_NEWLINE_ANYCRLF:
2061 fprintf(outfile, "Forced newline sequence: ANYCRLF\n");
2062 break;
2063
2064 case PCRE_NEWLINE_ANY:
2065 fprintf(outfile, "Forced newline sequence: ANY\n");
2066 break;
2067
2068 default:
2069 break;
2070 }
2071
2072 if (first_char == -1)
2073 {
2074 fprintf(outfile, "First char at start or follows newline\n");
2075 }
2076 else if (first_char < 0)
2077 {
2078 fprintf(outfile, "No first char\n");
2079 }
2080 else
2081 {
2082 const char *caseless =
2083 ((((real_pcre *)re)->flags & PCRE_FCH_CASELESS) == 0)?
2084 "" : " (caseless)";
2085
2086 if (PRINTHEX(first_char))
2087 fprintf(outfile, "First char = \'%c\'%s\n", first_char, caseless);
2088 else
2089 fprintf(outfile, "First char = %d%s\n", first_char, caseless);
2090 }
2091
2092 if (need_char < 0)
2093 {
2094 fprintf(outfile, "No need char\n");
2095 }
2096 else
2097 {
2098 const char *caseless =
2099 ((((real_pcre *)re)->flags & PCRE_RCH_CASELESS) == 0)?
2100 "" : " (caseless)";
2101
2102 if (PRINTHEX(need_char))
2103 fprintf(outfile, "Need char = \'%c\'%s\n", need_char, caseless);
2104 else
2105 fprintf(outfile, "Need char = %d%s\n", need_char, caseless);
2106 }
2107
2108 /* Don't output study size; at present it is in any case a fixed
2109 value, but it varies, depending on the computer architecture, and
2110 so messes up the test suite. (And with the /F option, it might be
2111 flipped.) If study was forced by an external -s, don't show this
2112 information unless -i or -d was also present. This means that, except
2113 when auto-callouts are involved, the output from runs with and without
2114 -s should be identical. */
2115
2116 if (do_study || (force_study >= 0 && showinfo && !no_force_study))
2117 {
2118 if (extra == NULL)
2119 fprintf(outfile, "Study returned NULL\n");
2120 else
2121 {
2122 pcre_uint8 *start_bits = NULL;
2123 int minlength;
2124
2125 new_info(re, extra, PCRE_INFO_MINLENGTH, &minlength);
2126 fprintf(outfile, "Subject length lower bound = %d\n", minlength);
2127
2128 new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
2129 if (start_bits == NULL)
2130 fprintf(outfile, "No set of starting bytes\n");
2131 else
2132 {
2133 int i;
2134 int c = 24;
2135 fprintf(outfile, "Starting byte set: ");
2136 for (i = 0; i < 256; i++)
2137 {
2138 if ((start_bits[i/8] & (1<<(i&7))) != 0)
2139 {
2140 if (c > 75)
2141 {
2142 fprintf(outfile, "\n ");
2143 c = 2;
2144 }
2145 if (PRINTHEX(i) && i != ' ')
2146 {
2147 fprintf(outfile, "%c ", i);
2148 c += 2;
2149 }
2150 else
2151 {
2152 fprintf(outfile, "\\x%02x ", i);
2153 c += 5;
2154 }
2155 }
2156 }
2157 fprintf(outfile, "\n");
2158 }
2159 }
2160
2161 /* Show this only if the JIT was set by /S, not by -s. */
2162
2163 if ((study_options & PCRE_STUDY_JIT_COMPILE) != 0)
2164 {
2165 int jit;
2166 new_info(re, extra, PCRE_INFO_JIT, &jit);
2167 if (jit)
2168 fprintf(outfile, "JIT study was successful\n");
2169 else
2170 #ifdef SUPPORT_JIT
2171 fprintf(outfile, "JIT study was not successful\n");
2172 #else
2173 fprintf(outfile, "JIT support is not available in this version of PCRE\n");
2174 #endif
2175 }
2176 }
2177 }
2178
2179 /* If the '>' option was present, we write out the regex to a file, and
2180 that is all. The first 8 bytes of the file are the regex length and then
2181 the study length, in big-endian order. */
2182
2183 if (to_file != NULL)
2184 {
2185 FILE *f = fopen((char *)to_file, "wb");
2186 if (f == NULL)
2187 {
2188 fprintf(outfile, "Unable to open %s: %s\n", to_file, strerror(errno));
2189 }
2190 else
2191 {
2192 pcre_uint8 sbuf[8];
2193 sbuf[0] = (pcre_uint8)((true_size >> 24) & 255);
2194 sbuf[1] = (pcre_uint8)((true_size >> 16) & 255);
2195 sbuf[2] = (pcre_uint8)((true_size >> 8) & 255);
2196 sbuf[3] = (pcre_uint8)((true_size) & 255);
2197
2198 sbuf[4] = (pcre_uint8)((true_study_size >> 24) & 255);
2199 sbuf[5] = (pcre_uint8)((true_study_size >> 16) & 255);
2200 sbuf[6] = (pcre_uint8)((true_study_size >> 8) & 255);
2201 sbuf[7] = (pcre_uint8)((true_study_size) & 255);
2202
2203 if (fwrite(sbuf, 1, 8, f) < 8 ||
2204 fwrite(re, 1, true_size, f) < true_size)
2205 {
2206 fprintf(outfile, "Write error on %s: %s\n", to_file, strerror(errno));
2207 }
2208 else
2209 {
2210 fprintf(outfile, "Compiled pattern written to %s\n", to_file);
2211
2212 /* If there is study data, write it. */
2213
2214 if (extra != NULL)
2215 {
2216 if (fwrite(extra->study_data, 1, true_study_size, f) <
2217 true_study_size)
2218 {
2219 fprintf(outfile, "Write error on %s: %s\n", to_file,
2220 strerror(errno));
2221 }
2222 else fprintf(outfile, "Study data written to %s\n", to_file);
2223 }
2224 }
2225 fclose(f);
2226 }
2227
2228 new_free(re);
2229 if (extra != NULL) pcre_free_study(extra);
2230 if (locale_set)
2231 {
2232 new_free((void *)tables);
2233 setlocale(LC_CTYPE, "C");
2234 locale_set = 0;
2235 }
2236 continue; /* With next regex */
2237 }
2238 } /* End of non-POSIX compile */
2239
2240 /* Read data lines and test them */
2241
2242 for (;;)
2243 {
2244 pcre_uint8 *q;
2245 pcre_uint8 *bptr;
2246 int *use_offsets = offsets;
2247 int use_size_offsets = size_offsets;
2248 int callout_data = 0;
2249 int callout_data_set = 0;
2250 int count, c;
2251 int copystrings = 0;
2252 int find_match_limit = default_find_match_limit;
2253 int getstrings = 0;
2254 int getlist = 0;
2255 int gmatched = 0;
2256 int start_offset = 0;
2257 int start_offset_sign = 1;
2258 int g_notempty = 0;
2259 int use_dfa = 0;
2260
2261 options = 0;
2262
2263 *copynames = 0;
2264 *getnames = 0;
2265
2266 copynamesptr = copynames;
2267 getnamesptr = getnames;
2268
2269 pcre_callout = callout;
2270 first_callout = 1;
2271 last_callout_mark = NULL;
2272 callout_extra = 0;
2273 callout_count = 0;
2274 callout_fail_count = 999999;
2275 callout_fail_id = -1;
2276 show_malloc = 0;
2277
2278 if (extra != NULL) extra->flags &=
2279 ~(PCRE_EXTRA_MATCH_LIMIT|PCRE_EXTRA_MATCH_LIMIT_RECURSION);
2280
2281 len = 0;
2282 for (;;)
2283 {
2284 if (extend_inputline(infile, buffer + len, "data> ") == NULL)
2285 {
2286 if (len > 0) /* Reached EOF without hitting a newline */
2287 {
2288 fprintf(outfile, "\n");
2289 break;
2290 }
2291 done = 1;
2292 goto CONTINUE;
2293 }
2294 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
2295 len = (int)strlen((char *)buffer);
2296 if (buffer[len-1] == '\n') break;
2297 }
2298
2299 while (len > 0 && isspace(buffer[len-1])) len--;
2300 buffer[len] = 0;
2301 if (len == 0) break;
2302
2303 p = buffer;
2304 while (isspace(*p)) p++;
2305
2306 bptr = q = dbuffer;
2307 while ((c = *p++) != 0)
2308 {
2309 int i = 0;
2310 int n = 0;
2311
2312 if (c == '\\') switch ((c = *p++))
2313 {
2314 case 'a': c = 7; break;
2315 case 'b': c = '\b'; break;
2316 case 'e': c = 27; break;
2317 case 'f': c = '\f'; break;
2318 case 'n': c = '\n'; break;
2319 case 'r': c = '\r'; break;
2320 case 't': c = '\t'; break;
2321 case 'v': c = '\v'; break;
2322
2323 case '0': case '1': case '2': case '3':
2324 case '4': case '5': case '6': case '7':
2325 c -= '0';
2326 while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
2327 c = c * 8 + *p++ - '0';
2328
2329 #if !defined NOUTF8
2330 if (use_utf8 && c > 255)
2331 {
2332 unsigned char buff8[8];
2333 int ii, utn;
2334 utn = ord2utf8(c, buff8);
2335 for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
2336 c = buff8[ii]; /* Last byte */
2337 }
2338 #endif
2339 break;
2340
2341 case 'x':
2342
2343 /* Handle \x{..} specially - new Perl thing for utf8 */
2344
2345 #if !defined NOUTF8
2346 if (*p == '{')
2347 {
2348 unsigned char *pt = p;
2349 c = 0;
2350
2351 /* We used to have "while (isxdigit(*(++pt)))" here, but it fails
2352 when isxdigit() is a macro that refers to its argument more than
2353 once. This is banned by the C Standard, but apparently happens in at
2354 least one MacOS environment. */
2355
2356 for (pt++; isxdigit(*pt); pt++)
2357 c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'a' - 10);
2358 if (*pt == '}')
2359 {
2360 unsigned char buff8[8];
2361 int ii, utn;
2362 if (use_utf8)
2363 {
2364 utn = ord2utf8(c, buff8);
2365 for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
2366 c = buff8[ii]; /* Last byte */
2367 }
2368 else
2369 {
2370 if (c > 255)
2371 fprintf(outfile, "** Character \\x{%x} is greater than 255 and "
2372 "UTF-8 mode is not enabled.\n"
2373 "** Truncation will probably give the wrong result.\n", c);
2374 }
2375 p = pt + 1;
2376 break;
2377 }
2378 /* Not correct form; fall through */
2379 }
2380 #endif
2381
2382 /* Ordinary \x */
2383
2384 c = 0;
2385 while (i++ < 2 && isxdigit(*p))
2386 {
2387 c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'a' - 10);
2388 p++;
2389 }
2390 break;
2391
2392 case 0: /* \ followed by EOF allows for an empty line */
2393 p--;
2394 continue;
2395
2396 case '>':
2397 if (*p == '-')
2398 {
2399 start_offset_sign = -1;
2400 p++;
2401 }
2402 while(isdigit(*p)) start_offset = start_offset * 10 + *p++ - '0';
2403 start_offset *= start_offset_sign;
2404 continue;
2405
2406 case 'A': /* Option setting */
2407 options |= PCRE_ANCHORED;
2408 continue;
2409
2410 case 'B':
2411 options |= PCRE_NOTBOL;
2412 continue;
2413
2414 case 'C':
2415 if (isdigit(*p)) /* Set copy string */
2416 {
2417 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2418 copystrings |= 1 << n;
2419 }
2420 else if (isalnum(*p))
2421 {
2422 pcre_uchar *npp = copynamesptr;
2423 while (isalnum(*p)) *npp++ = *p++;
2424 *npp++ = 0;
2425 *npp = 0;
2426 n = pcre_get_stringnumber(re, (char *)copynamesptr);
2427 if (n < 0)
2428 fprintf(outfile, "no parentheses with name \"%s\"\n", copynamesptr);
2429 copynamesptr = npp;
2430 }
2431 else if (*p == '+')
2432 {
2433 callout_extra = 1;
2434 p++;
2435 }
2436 else if (*p == '-')
2437 {
2438 pcre_callout = NULL;
2439 p++;
2440 }
2441 else if (*p == '!')
2442 {
2443 callout_fail_id = 0;
2444 p++;
2445 while(isdigit(*p))
2446 callout_fail_id = callout_fail_id * 10 + *p++ - '0';
2447 callout_fail_count = 0;
2448 if (*p == '!')
2449 {
2450 p++;
2451 while(isdigit(*p))
2452 callout_fail_count = callout_fail_count * 10 + *p++ - '0';
2453 }
2454 }
2455 else if (*p == '*')
2456 {
2457 int sign = 1;
2458 callout_data = 0;
2459 if (*(++p) == '-') { sign = -1; p++; }
2460 while(isdigit(*p))
2461 callout_data = callout_data * 10 + *p++ - '0';
2462 callout_data *= sign;
2463 callout_data_set = 1;
2464 }
2465 continue;
2466
2467 #if !defined NODFA
2468 case 'D':
2469 #if !defined NOPOSIX
2470 if (posix || do_posix)
2471 printf("** Can't use dfa matching in POSIX mode: \\D ignored\n");
2472 else
2473 #endif
2474 use_dfa = 1;
2475 continue;
2476 #endif
2477
2478 #if !defined NODFA
2479 case 'F':
2480 options |= PCRE_DFA_SHORTEST;
2481 continue;
2482 #endif
2483
2484 case 'G':
2485 if (isdigit(*p))
2486 {
2487 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2488 getstrings |= 1 << n;
2489 }
2490 else if (isalnum(*p))
2491 {
2492 pcre_uchar *npp = getnamesptr;
2493 while (isalnum(*p)) *npp++ = *p++;
2494 *npp++ = 0;
2495 *npp = 0;
2496 n = pcre_get_stringnumber(re, (char *)getnamesptr);
2497 if (n < 0)
2498 fprintf(outfile, "no parentheses with name \"%s\"\n", getnamesptr);
2499 getnamesptr = npp;
2500 }
2501 continue;
2502
2503 case 'J':
2504 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2505 if (extra != NULL
2506 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
2507 && extra->executable_jit != NULL)
2508 {
2509 if (jit_stack != NULL) pcre_jit_stack_free(jit_stack);
2510 jit_stack = pcre_jit_stack_alloc(1, n * 1024);
2511 pcre_assign_jit_stack(extra, jit_callback, jit_stack);
2512 }
2513 continue;
2514
2515 case 'L':
2516 getlist = 1;
2517 continue;
2518
2519 case 'M':
2520 find_match_limit = 1;
2521 continue;
2522
2523 case 'N':
2524 if ((options & PCRE_NOTEMPTY) != 0)
2525 options = (options & ~PCRE_NOTEMPTY) | PCRE_NOTEMPTY_ATSTART;
2526 else
2527 options |= PCRE_NOTEMPTY;
2528 continue;
2529
2530 case 'O':
2531 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2532 if (n > size_offsets_max)
2533 {
2534 size_offsets_max = n;
2535 free(offsets);
2536 use_offsets = offsets = (int *)malloc(size_offsets_max * sizeof(int));
2537 if (offsets == NULL)
2538 {
2539 printf("** Failed to get %d bytes of memory for offsets vector\n",
2540 (int)(size_offsets_max * sizeof(int)));
2541 yield = 1;
2542 goto EXIT;
2543 }
2544 }
2545 use_size_offsets = n;
2546 if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
2547 continue;
2548
2549 case 'P':
2550 options |= ((options & PCRE_PARTIAL_SOFT) == 0)?
2551 PCRE_PARTIAL_SOFT : PCRE_PARTIAL_HARD;
2552 continue;
2553
2554 case 'Q':
2555 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2556 if (extra == NULL)
2557 {
2558 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
2559 extra->flags = 0;
2560 }
2561 extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
2562 extra->match_limit_recursion = n;
2563 continue;
2564
2565 case 'q':
2566 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2567 if (extra == NULL)
2568 {
2569 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
2570 extra->flags = 0;
2571 }
2572 extra->flags |= PCRE_EXTRA_MATCH_LIMIT;
2573 extra->match_limit = n;
2574 continue;
2575
2576 #if !defined NODFA
2577 case 'R':
2578 options |= PCRE_DFA_RESTART;
2579 continue;
2580 #endif
2581
2582 case 'S':
2583 show_malloc = 1;
2584 continue;
2585
2586 case 'Y':
2587 options |= PCRE_NO_START_OPTIMIZE;
2588 continue;
2589
2590 case 'Z':
2591 options |= PCRE_NOTEOL;
2592 continue;
2593
2594 case '?':
2595 options |= PCRE_NO_UTF8_CHECK;
2596 continue;
2597
2598 case '<':
2599 {
2600 int x = check_newline(p, outfile);
2601 if (x == 0) goto NEXT_DATA;
2602 options |= x;
2603 while (*p++ != '>');
2604 }
2605 continue;
2606 }
2607 *q++ = c;
2608 }
2609 *q = 0;
2610 len = (int)(q - dbuffer);
2611
2612 /* Move the data to the end of the buffer so that a read over the end of
2613 the buffer will be seen by valgrind, even if it doesn't cause a crash. If
2614 we are using the POSIX interface, we must include the terminating zero. */
2615
2616 #if !defined NOPOSIX
2617 if (posix || do_posix)
2618 {
2619 memmove(bptr + buffer_size - len - 1, bptr, len + 1);
2620 bptr += buffer_size - len - 1;
2621 }
2622 else
2623 #endif
2624 {
2625 memmove(bptr + buffer_size - len, bptr, len);
2626 bptr += buffer_size - len;
2627 }
2628
2629 if ((all_use_dfa || use_dfa) && find_match_limit)
2630 {
2631 printf("**Match limit not relevant for DFA matching: ignored\n");
2632 find_match_limit = 0;
2633 }
2634
2635 /* Handle matching via the POSIX interface, which does not
2636 support timing or playing with the match limit or callout data. */
2637
2638 #if !defined NOPOSIX
2639 if (posix || do_posix)
2640 {
2641 int rc;
2642 int eflags = 0;
2643 regmatch_t *pmatch = NULL;
2644 if (use_size_offsets > 0)
2645 pmatch = (regmatch_t *)malloc(sizeof(regmatch_t) * use_size_offsets);
2646 if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
2647 if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
2648 if ((options & PCRE_NOTEMPTY) != 0) eflags |= REG_NOTEMPTY;
2649
2650 rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
2651
2652 if (rc != 0)
2653 {
2654 (void)regerror(rc, &preg, (char *)buffer, buffer_size);
2655 fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
2656 }
2657 else if ((((const pcre *)preg.re_pcre)->options & PCRE_NO_AUTO_CAPTURE)
2658 != 0)
2659 {
2660 fprintf(outfile, "Matched with REG_NOSUB\n");
2661 }
2662 else
2663 {
2664 size_t i;
2665 for (i = 0; i < (size_t)use_size_offsets; i++)
2666 {
2667 if (pmatch[i].rm_so >= 0)
2668 {
2669 fprintf(outfile, "%2d: ", (int)i);
2670 (void)pchars(dbuffer + pmatch[i].rm_so,
2671 pmatch[i].rm_eo - pmatch[i].rm_so, outfile);
2672 fprintf(outfile, "\n");
2673 if (do_showcaprest || (i == 0 && do_showrest))
2674 {
2675 fprintf(outfile, "%2d+ ", (int)i);
2676 (void)pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo,
2677 outfile);
2678 fprintf(outfile, "\n");
2679 }
2680 }
2681 }
2682 }
2683 free(pmatch);
2684 }
2685
2686 /* Handle matching via the native interface - repeats for /g and /G */
2687
2688 else
2689 #endif /* !defined NOPOSIX */
2690
2691 for (;; gmatched++) /* Loop for /g or /G */
2692 {
2693 markptr = NULL;
2694
2695 if (timeitm > 0)
2696 {
2697 register int i;
2698 clock_t time_taken;
2699 clock_t start_time = clock();
2700
2701 #if !defined NODFA
2702 if (all_use_dfa || use_dfa)
2703 {
2704 int workspace[1000];
2705 for (i = 0; i < timeitm; i++)
2706 count = pcre_dfa_exec(re, extra, (char *)bptr, len, start_offset,
2707 options | g_notempty, use_offsets, use_size_offsets, workspace,
2708 sizeof(workspace)/sizeof(int));
2709 }
2710 else
2711 #endif
2712
2713 for (i = 0; i < timeitm; i++)
2714 count = pcre_exec(re, extra, (char *)bptr, len,
2715 start_offset, options | g_notempty, use_offsets, use_size_offsets);
2716
2717 time_taken = clock() - start_time;
2718 fprintf(outfile, "Execute time %.4f milliseconds\n",
2719 (((double)time_taken * 1000.0) / (double)timeitm) /
2720 (double)CLOCKS_PER_SEC);
2721 }
2722
2723 /* If find_match_limit is set, we want to do repeated matches with
2724 varying limits in order to find the minimum value for the match limit and
2725 for the recursion limit. The match limits are relevant only to the normal
2726 running of pcre_exec(), so disable the JIT optimization. This makes it
2727 possible to run the same set of tests with and without JIT externally
2728 requested. */
2729
2730 if (find_match_limit)
2731 {
2732 if (extra == NULL)
2733 {
2734 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
2735 extra->flags = 0;
2736 }
2737 else extra->flags &= ~PCRE_EXTRA_EXECUTABLE_JIT;
2738
2739 (void)check_match_limit(re, extra, bptr, len, start_offset,
2740 options|g_notempty, use_offsets, use_size_offsets,
2741 PCRE_EXTRA_MATCH_LIMIT, &(extra->match_limit),
2742 PCRE_ERROR_MATCHLIMIT, "match()");
2743
2744 count = check_match_limit(re, extra, bptr, len, start_offset,
2745 options|g_notempty, use_offsets, use_size_offsets,
2746 PCRE_EXTRA_MATCH_LIMIT_RECURSION, &(extra->match_limit_recursion),
2747 PCRE_ERROR_RECURSIONLIMIT, "match() recursion");
2748 }
2749
2750 /* If callout_data is set, use the interface with additional data */
2751
2752 else if (callout_data_set)
2753 {
2754 if (extra == NULL)
2755 {
2756 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
2757 extra->flags = 0;
2758 }
2759 extra->flags |= PCRE_EXTRA_CALLOUT_DATA;
2760 extra->callout_data = &callout_data;
2761 count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
2762 options | g_notempty, use_offsets, use_size_offsets);
2763 extra->flags &= ~PCRE_EXTRA_CALLOUT_DATA;
2764 }
2765
2766 /* The normal case is just to do the match once, with the default
2767 value of match_limit. */
2768
2769 #if !defined NODFA
2770 else if (all_use_dfa || use_dfa)
2771 {
2772 int workspace[1000];
2773 count = pcre_dfa_exec(re, extra, (char *)bptr, len, start_offset,
2774 options | g_notempty, use_offsets, use_size_offsets, workspace,
2775 sizeof(workspace)/sizeof(int));
2776 if (count == 0)
2777 {
2778 fprintf(outfile, "Matched, but too many subsidiary matches\n");
2779 count = use_size_offsets/2;
2780 }
2781 }
2782 #endif
2783
2784 else
2785 {
2786 count = pcre_exec(re, extra, (char *)bptr, len,
2787 start_offset, options | g_notempty, use_offsets, use_size_offsets);
2788 if (count == 0)
2789 {
2790 fprintf(outfile, "Matched, but too many substrings\n");
2791 count = use_size_offsets/3;
2792 }
2793 }
2794
2795 /* Matched */
2796
2797 if (count >= 0)
2798 {
2799 int i, maxcount;
2800
2801 #if !defined NODFA
2802 if (all_use_dfa || use_dfa) maxcount = use_size_offsets/2; else
2803 #endif
2804 maxcount = use_size_offsets/3;
2805
2806 /* This is a check against a lunatic return value. */
2807
2808 if (count > maxcount)
2809 {
2810 fprintf(outfile,
2811 "** PCRE error: returned count %d is too big for offset size %d\n",
2812 count, use_size_offsets);
2813 count = use_size_offsets/3;
2814 if (do_g || do_G)
2815 {
2816 fprintf(outfile, "** /%c loop abandoned\n", do_g? 'g' : 'G');
2817 do_g = do_G = FALSE; /* Break g/G loop */
2818 }
2819 }
2820
2821 /* do_allcaps requests showing of all captures in the pattern, to check
2822 unset ones at the end. */
2823
2824 if (do_allcaps)
2825 {
2826 new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
2827 count++; /* Allow for full match */
2828 if (count * 2 > use_size_offsets) count = use_size_offsets/2;
2829 }
2830
2831 /* Output the captured substrings */
2832
2833 for (i = 0; i < count * 2; i += 2)
2834 {
2835 if (use_offsets[i] < 0)
2836 {
2837 if (use_offsets[i] != -1)
2838 fprintf(outfile, "ERROR: bad negative value %d for offset %d\n",
2839 use_offsets[i], i);
2840 if (use_offsets[i+1] != -1)
2841 fprintf(outfile, "ERROR: bad negative value %d for offset %d\n",
2842 use_offsets[i+1], i+1);
2843 fprintf(outfile, "%2d: <unset>\n", i/2);
2844 }
2845 else
2846 {
2847 fprintf(outfile, "%2d: ", i/2);
2848 (void)pchars(bptr + use_offsets[i],
2849 use_offsets[i+1] - use_offsets[i], outfile);
2850 fprintf(outfile, "\n");
2851 if (do_showcaprest || (i == 0 && do_showrest))
2852 {
2853 fprintf(outfile, "%2d+ ", i/2);
2854 (void)pchars(bptr + use_offsets[i+1], len - use_offsets[i+1],
2855 outfile);
2856 fprintf(outfile, "\n");
2857 }
2858 }
2859 }
2860
2861 if (markptr != NULL) fprintf(outfile, "MK: %s\n", markptr);
2862
2863 for (i = 0; i < 32; i++)
2864 {
2865 if ((copystrings & (1 << i)) != 0)
2866 {
2867 char copybuffer[256];
2868 int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
2869 i, copybuffer, sizeof(copybuffer));
2870 if (rc < 0)
2871 fprintf(outfile, "copy substring %d failed %d\n", i, rc);
2872 else
2873 fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
2874 }
2875 }
2876
2877 for (copynamesptr = copynames;
2878 *copynamesptr != 0;
2879 copynamesptr += (int)strlen((char*)copynamesptr) + 1)
2880 {
2881 char copybuffer[256];
2882 int rc = pcre_copy_named_substring(re, (char *)bptr, use_offsets,
2883 count, (char *)copynamesptr, copybuffer, sizeof(copybuffer));
2884 if (rc < 0)
2885 fprintf(outfile, "copy substring %s failed %d\n", copynamesptr, rc);
2886 else
2887 fprintf(outfile, " C %s (%d) %s\n", copybuffer, rc, copynamesptr);
2888 }
2889
2890 for (i = 0; i < 32; i++)
2891 {
2892 if ((getstrings & (1 << i)) != 0)
2893 {
2894 const char *substring;
2895 int rc = pcre_get_substring((char *)bptr, use_offsets, count,
2896 i, &substring);
2897 if (rc < 0)
2898 fprintf(outfile, "get substring %d failed %d\n", i, rc);
2899 else
2900 {
2901 fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
2902 pcre_free_substring(substring);
2903 }
2904 }
2905 }
2906
2907 for (getnamesptr = getnames;
2908 *getnamesptr != 0;
2909 getnamesptr += (int)strlen((char*)getnamesptr) + 1)
2910 {
2911 const char *substring;
2912 int rc = pcre_get_named_substring(re, (char *)bptr, use_offsets,
2913 count, (char *)getnamesptr, &substring);
2914 if (rc < 0)
2915 fprintf(outfile, "copy substring %s failed %d\n", getnamesptr, rc);
2916 else
2917 {
2918 fprintf(outfile, " G %s (%d) %s\n", substring, rc, getnamesptr);
2919 pcre_free_substring(substring);
2920 }
2921 }
2922
2923 if (getlist)
2924 {
2925 const char **stringlist;
2926 int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
2927 &stringlist);
2928 if (rc < 0)
2929 fprintf(outfile, "get substring list failed %d\n", rc);
2930 else
2931 {
2932 for (i = 0; i < count; i++)
2933 fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
2934 if (stringlist[i] != NULL)
2935 fprintf(outfile, "string list not terminated by NULL\n");
2936 pcre_free_substring_list(stringlist);
2937 }
2938 }
2939 }
2940
2941 /* There was a partial match */
2942
2943 else if (count == PCRE_ERROR_PARTIAL)
2944 {
2945 if (markptr == NULL) fprintf(outfile, "Partial match");
2946 else fprintf(outfile, "Partial match, mark=%s", markptr);
2947 if (use_size_offsets > 1)
2948 {
2949 fprintf(outfile, ": ");
2950 pchars(bptr + use_offsets[0], use_offsets[1] - use_offsets[0],
2951 outfile);
2952 }
2953 fprintf(outfile, "\n");
2954 break; /* Out of the /g loop */
2955 }
2956
2957 /* Failed to match. If this is a /g or /G loop and we previously set
2958 g_notempty after a null match, this is not necessarily the end. We want
2959 to advance the start offset, and continue. We won't be at the end of the
2960 string - that was checked before setting g_notempty.
2961
2962 Complication arises in the case when the newline convention is "any",
2963 "crlf", or "anycrlf". If the previous match was at the end of a line
2964 terminated by CRLF, an advance of one character just passes the \r,
2965 whereas we should prefer the longer newline sequence, as does the code in
2966 pcre_exec(). Fudge the offset value to achieve this. We check for a
2967 newline setting in the pattern; if none was set, use pcre_config() to
2968 find the default.
2969
2970 Otherwise, in the case of UTF-8 matching, the advance must be one
2971 character, not one byte. */
2972
2973 else
2974 {
2975 if (g_notempty != 0)
2976 {
2977 int onechar = 1;
2978 unsigned int obits = ((real_pcre *)re)->options;
2979 use_offsets[0] = start_offset;
2980 if ((obits & PCRE_NEWLINE_BITS) == 0)
2981 {
2982 int d;
2983 (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
2984 /* Note that these values are always the ASCII ones, even in
2985 EBCDIC environments. CR = 13, NL = 10. */
2986 obits = (d == 13)? PCRE_NEWLINE_CR :
2987 (d == 10)? PCRE_NEWLINE_LF :
2988 (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
2989 (d == -2)? PCRE_NEWLINE_ANYCRLF :
2990 (d == -1)? PCRE_NEWLINE_ANY : 0;
2991 }
2992 if (((obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY ||
2993 (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_CRLF ||
2994 (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANYCRLF)
2995 &&
2996 start_offset < len - 1 &&
2997 bptr[start_offset] == '\r' &&
2998 bptr[start_offset+1] == '\n')
2999 onechar++;
3000 else if (use_utf8)
3001 {
3002 while (start_offset + onechar < len)
3003 {
3004 if ((bptr[start_offset+onechar] & 0xc0) != 0x80) break;
3005 onechar++;
3006 }
3007 }
3008 use_offsets[1] = start_offset + onechar;
3009 }
3010 else
3011 {
3012 switch(count)
3013 {
3014 case PCRE_ERROR_NOMATCH:
3015 if (gmatched == 0)
3016 {
3017 if (markptr == NULL) fprintf(outfile, "No match\n");
3018 else fprintf(outfile, "No match, mark = %s\n", markptr);
3019 }
3020 break;
3021
3022 case PCRE_ERROR_BADUTF8:
3023 case PCRE_ERROR_SHORTUTF8:
3024 fprintf(outfile, "Error %d (%s UTF-8 string)", count,
3025 (count == PCRE_ERROR_BADUTF8)? "bad" : "short");
3026 if (use_size_offsets >= 2)
3027 fprintf(outfile, " offset=%d reason=%d", use_offsets[0],
3028 use_offsets[1]);
3029 fprintf(outfile, "\n");
3030 break;
3031
3032 default:
3033 if (count < 0 && (-count) < sizeof(errtexts)/sizeof(const char *))
3034 fprintf(outfile, "Error %d (%s)\n", count, errtexts[-count]);
3035 else
3036 fprintf(outfile, "Error %d (Unexpected value)\n", count);
3037 break;
3038 }
3039
3040 break; /* Out of the /g loop */
3041 }
3042 }
3043
3044 /* If not /g or /G we are done */
3045
3046 if (!do_g && !do_G) break;
3047
3048 /* If we have matched an empty string, first check to see if we are at
3049 the end of the subject. If so, the /g loop is over. Otherwise, mimic what
3050 Perl's /g option does. This turns out to be rather cunning. First we set
3051 PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED and try the match again at the
3052 same point. If this fails (picked up above) we advance to the next
3053 character. */
3054
3055 g_notempty = 0;
3056
3057 if (use_offsets[0] == use_offsets[1])
3058 {
3059 if (use_offsets[0] == len) break;
3060 g_notempty = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
3061 }
3062
3063 /* For /g, update the start offset, leaving the rest alone */
3064
3065 if (do_g) start_offset = use_offsets[1];
3066
3067 /* For /G, update the pointer and length */
3068
3069 else
3070 {
3071 bptr += use_offsets[1];
3072 len -= use_offsets[1];
3073 }
3074 } /* End of loop for /g and /G */
3075
3076 NEXT_DATA: continue;
3077 } /* End of loop for data lines */
3078
3079 CONTINUE:
3080
3081 #if !defined NOPOSIX
3082 if (posix || do_posix) regfree(&preg);
3083 #endif
3084
3085 if (re != NULL) new_free(re);
3086 if (extra != NULL) pcre_free_study(extra);
3087 if (locale_set)
3088 {
3089 new_free((void *)tables);
3090 setlocale(LC_CTYPE, "C");
3091 locale_set = 0;
3092 }
3093 if (jit_stack != NULL)
3094 {
3095 pcre_jit_stack_free(jit_stack);
3096 jit_stack = NULL;
3097 }
3098 }
3099
3100 if (infile == stdin) fprintf(outfile, "\n");
3101
3102 EXIT:
3103
3104 if (infile != NULL && infile != stdin) fclose(infile);
3105 if (outfile != NULL && outfile != stdout) fclose(outfile);
3106
3107 free(buffer);
3108 free(dbuffer);
3109 free(pbuffer);
3110 free(offsets);
3111
3112 return yield;
3113 }
3114
3115 /* End of pcretest.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5