/[pcre]/code/branches/pcre16/pcretest.c
ViewVC logotype

Contents of /code/branches/pcre16/pcretest.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 756 - (show annotations)
Mon Nov 21 10:48:42 2011 UTC (3 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 93968 byte(s)
Error occurred while calculating annotation data.
Apply Zoltan's big patch.
1 /*************************************************
2 * PCRE testing program *
3 *************************************************/
4
5 /* This program was hacked up as a tester for PCRE. I really should have
6 written it more tidily in the first place. Will I ever learn? It has grown and
7 been extended and consequently is now rather, er, *very* untidy in places.
8
9 -----------------------------------------------------------------------------
10 Redistribution and use in source and binary forms, with or without
11 modification, are permitted provided that the following conditions are met:
12
13 * Redistributions of source code must retain the above copyright notice,
14 this list of conditions and the following disclaimer.
15
16 * Redistributions in binary form must reproduce the above copyright
17 notice, this list of conditions and the following disclaimer in the
18 documentation and/or other materials provided with the distribution.
19
20 * Neither the name of the University of Cambridge nor the names of its
21 contributors may be used to endorse or promote products derived from
22 this software without specific prior written permission.
23
24 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
28 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35 -----------------------------------------------------------------------------
36 */
37
38
39 #ifdef HAVE_CONFIG_H
40 #include "config.h"
41 #endif
42
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
50
51 #ifdef SUPPORT_LIBREADLINE
52 #ifdef HAVE_UNISTD_H
53 #include <unistd.h>
54 #endif
55 #include <readline/readline.h>
56 #include <readline/history.h>
57 #endif
58
59
60 /* A number of things vary for Windows builds. Originally, pcretest opened its
61 input and output without "b"; then I was told that "b" was needed in some
62 environments, so it was added for release 5.0 to both the input and output. (It
63 makes no difference on Unix-like systems.) Later I was told that it is wrong
64 for the input on Windows. I've now abstracted the modes into two macros that
65 are set here, to make it easier to fiddle with them, and removed "b" from the
66 input mode under Windows. */
67
68 #if defined(_WIN32) || defined(WIN32)
69 #include <io.h> /* For _setmode() */
70 #include <fcntl.h> /* For _O_BINARY */
71 #define INPUT_MODE "r"
72 #define OUTPUT_MODE "wb"
73
74 #ifndef isatty
75 #define isatty _isatty /* This is what Windows calls them, I'm told, */
76 #endif /* though in some environments they seem to */
77 /* be already defined, hence the #ifndefs. */
78 #ifndef fileno
79 #define fileno _fileno
80 #endif
81
82 /* A user sent this fix for Borland Builder 5 under Windows. */
83
84 #ifdef __BORLANDC__
85 #define _setmode(handle, mode) setmode(handle, mode)
86 #endif
87
88 /* Not Windows */
89
90 #else
91 #include <sys/time.h> /* These two includes are needed */
92 #include <sys/resource.h> /* for setrlimit(). */
93 #define INPUT_MODE "rb"
94 #define OUTPUT_MODE "wb"
95 #endif
96
97
98 /* We have to include pcre_internal.h because we need the internal info for
99 displaying the results of pcre_study() and we also need to know about the
100 internal macros, structures, and other internal data values; pcretest has
101 "inside information" compared to a program that strictly follows the PCRE API.
102
103 Although pcre_internal.h does itself include pcre.h, we explicitly include it
104 here before pcre_internal.h so that the PCRE_EXP_xxx macros get set
105 appropriately for an application, not for building PCRE. */
106
107 #include "pcre.h"
108 #include "pcre_internal.h"
109
110 /* We need access to some of the data tables that PCRE uses. So as not to have
111 to keep two copies, we include the source file here, changing the names of the
112 external symbols to prevent clashes. */
113
114 #define _pcre_ucp_gentype ucp_gentype
115 #define _pcre_ucp_typerange ucp_typerange
116 #define _pcre_utf8_table1 utf8_table1
117 #define _pcre_utf8_table1_size utf8_table1_size
118 #define _pcre_utf8_table2 utf8_table2
119 #define _pcre_utf8_table3 utf8_table3
120 #define _pcre_utf8_table4 utf8_table4
121 #define _pcre_utt utt
122 #define _pcre_utt_size utt_size
123 #define _pcre_utt_names utt_names
124 #define _pcre_OP_lengths OP_lengths
125
126 #include "pcre_tables.c"
127
128 /* We also need the pcre_printint() function for printing out compiled
129 patterns. This function is in a separate file so that it can be included in
130 pcre_compile.c when that module is compiled with debugging enabled. It needs to
131 know which case is being compiled. */
132
133 #define COMPILING_PCRETEST
134 #include "pcre_printint.src"
135
136 /* The definition of the macro PRINTABLE, which determines whether to print an
137 output character as-is or as a hex value when showing compiled patterns, is
138 contained in the printint.src file. We use it here also, in cases when the
139 locale has not been explicitly changed, so as to get consistent output from
140 systems that differ in their output from isprint() even in the "C" locale. */
141
142 #define PRINTHEX(c) (locale_set? isprint(c) : PRINTABLE(c))
143
144 /* It is possible to compile this test program without including support for
145 testing the POSIX interface, though this is not available via the standard
146 Makefile. */
147
148 #if !defined NOPOSIX
149 #include "pcreposix.h"
150 #endif
151
152 /* It is also possible, for the benefit of the version currently imported into
153 Exim, to build pcretest without support for UTF8 (define NOUTF8), without the
154 interface to the DFA matcher (NODFA), and without the doublecheck of the old
155 "info" function (define NOINFOCHECK). In fact, we automatically cut out the
156 UTF8 support if PCRE is built without it. */
157
158 #ifndef SUPPORT_UTF8
159 #ifndef NOUTF8
160 #define NOUTF8
161 #endif
162 #endif
163
164
165 /* Other parameters */
166
167 #ifndef CLOCKS_PER_SEC
168 #ifdef CLK_TCK
169 #define CLOCKS_PER_SEC CLK_TCK
170 #else
171 #define CLOCKS_PER_SEC 100
172 #endif
173 #endif
174
175 /* This is the default loop count for timing. */
176
177 #define LOOPREPEAT 500000
178
179 /* Static variables */
180
181 static FILE *outfile; /* stream that callout() and the allocation wrappers print to */
182 static int log_store = 0;
183 static int callout_count; /* incremented by callout() each time the callout number equals callout_fail_id */
184 static int callout_extra; /* non-zero: callout() also lists the captured substrings */
185 static int callout_fail_count; /* callout() yields 1 once callout_count reaches this value */
186 static int callout_fail_id; /* the callout number that is counted towards callout_fail_count */
187 static int debug_lengths;
188 static int first_callout; /* while non-zero callout() echoes the subject; callout() clears it */
189 static int locale_set = 0; /* non-zero: PRINTHEX uses isprint() instead of PRINTABLE() */
190 static int show_malloc; /* non-zero: the malloc/free wrappers log each call to outfile */
191 static int use_utf8; /* non-zero: pchars() decodes its input as UTF-8 */
192 static size_t gotten_store; /* size passed to the most recent new_malloc() call */
193 static const unsigned char *last_callout_mark = NULL; /* mark value last reported by callout() */
194
195 /* The buffers grow automatically if very long input lines are encountered. */
196
197 static int buffer_size = 50000; /* current size of each buffer; doubled by extend_inputline() */
198 static pcre_uint8 *buffer = NULL; /* input lines are read into this one */
199 static pcre_uint8 *dbuffer = NULL; /* reallocated alongside buffer; contents not preserved on growth */
200 static pcre_uint8 *pbuffer = NULL; /* text that callout() prints pattern items from; preserved on growth */
201
202 /* Textual explanations for runtime error codes */
203
204 static const char *errtexts[] = { /* [n] below is the position in the list; presumably n is the negated PCRE error code - TODO confirm at the lookup site */
205 NULL, /* 0 is no error */
206 NULL, /* NOMATCH is handled specially */
207 "NULL argument passed", /* [2] */
208 "bad option value", /* [3] */
209 "magic number missing", /* [4] */
210 "unknown opcode - pattern overwritten?", /* [5] */
211 "no more memory", /* [6] */
212 NULL, /* never returned by pcre_exec() or pcre_dfa_exec() */
213 "match limit exceeded", /* [8] */
214 "callout error code", /* [9] */
215 NULL, /* BADUTF8 is handled specially */
216 "bad UTF-8 offset", /* [11] */
217 NULL, /* PARTIAL is handled specially */
218 "not used - internal error", /* [13] */
219 "internal error - pattern overwritten?", /* [14] */
220 "bad count value", /* [15] */
221 "item unsupported for DFA matching", /* [16] */
222 "backreference condition or recursion test not supported for DFA matching", /* [17] */
223 "match limit not supported for DFA matching", /* [18] */
224 "workspace size exceeded in DFA matching", /* [19] */
225 "too much recursion for DFA matching", /* [20] */
226 "recursion limit exceeded", /* [21] */
227 "not used - internal error", /* [22] */
228 "invalid combination of newline options", /* [23] */
229 "bad offset value", /* [24] */
230 NULL, /* SHORTUTF8 is handled specially */
231 "nested recursion at the same subject position", /* [26] */
232 "JIT stack limit reached" /* [27] */
233 };
234
235
236 /*************************************************
237 * Alternate character tables *
238 *************************************************/
239
240 /* By default, the "tables" pointer when calling PCRE is set to NULL, thereby
241 using the default tables of the library. However, the T option can be used to
242 select alternate sets of tables, for different kinds of testing. Note also that
243 the L (locale) option also adjusts the tables. */
244
245 /* This is the set of tables distributed as default with PCRE. It recognizes
246 only ASCII characters. */
247
248 static const unsigned char tables0[] = {
249
250 /* This table is a lower casing table. Entry c holds the lower-case form of
byte c: only 'A'-'Z' (65-90) are remapped, to 'a'-'z' (97-122). */
251
252 0, 1, 2, 3, 4, 5, 6, 7,
253 8, 9, 10, 11, 12, 13, 14, 15,
254 16, 17, 18, 19, 20, 21, 22, 23,
255 24, 25, 26, 27, 28, 29, 30, 31,
256 32, 33, 34, 35, 36, 37, 38, 39,
257 40, 41, 42, 43, 44, 45, 46, 47,
258 48, 49, 50, 51, 52, 53, 54, 55,
259 56, 57, 58, 59, 60, 61, 62, 63,
260 64, 97, 98, 99,100,101,102,103,
261 104,105,106,107,108,109,110,111,
262 112,113,114,115,116,117,118,119,
263 120,121,122, 91, 92, 93, 94, 95,
264 96, 97, 98, 99,100,101,102,103,
265 104,105,106,107,108,109,110,111,
266 112,113,114,115,116,117,118,119,
267 120,121,122,123,124,125,126,127,
268 128,129,130,131,132,133,134,135,
269 136,137,138,139,140,141,142,143,
270 144,145,146,147,148,149,150,151,
271 152,153,154,155,156,157,158,159,
272 160,161,162,163,164,165,166,167,
273 168,169,170,171,172,173,174,175,
274 176,177,178,179,180,181,182,183,
275 184,185,186,187,188,189,190,191,
276 192,193,194,195,196,197,198,199,
277 200,201,202,203,204,205,206,207,
278 208,209,210,211,212,213,214,215,
279 216,217,218,219,220,221,222,223,
280 224,225,226,227,228,229,230,231,
281 232,233,234,235,236,237,238,239,
282 240,241,242,243,244,245,246,247,
283 248,249,250,251,252,253,254,255,
284
285 /* This table is a case flipping table. Entry c holds byte c with its case
swapped: 'A'-'Z' map to 'a'-'z' and 'a'-'z' map to 'A'-'Z'. */
286
287 0, 1, 2, 3, 4, 5, 6, 7,
288 8, 9, 10, 11, 12, 13, 14, 15,
289 16, 17, 18, 19, 20, 21, 22, 23,
290 24, 25, 26, 27, 28, 29, 30, 31,
291 32, 33, 34, 35, 36, 37, 38, 39,
292 40, 41, 42, 43, 44, 45, 46, 47,
293 48, 49, 50, 51, 52, 53, 54, 55,
294 56, 57, 58, 59, 60, 61, 62, 63,
295 64, 97, 98, 99,100,101,102,103,
296 104,105,106,107,108,109,110,111,
297 112,113,114,115,116,117,118,119,
298 120,121,122, 91, 92, 93, 94, 95,
299 96, 65, 66, 67, 68, 69, 70, 71,
300 72, 73, 74, 75, 76, 77, 78, 79,
301 80, 81, 82, 83, 84, 85, 86, 87,
302 88, 89, 90,123,124,125,126,127,
303 128,129,130,131,132,133,134,135,
304 136,137,138,139,140,141,142,143,
305 144,145,146,147,148,149,150,151,
306 152,153,154,155,156,157,158,159,
307 160,161,162,163,164,165,166,167,
308 168,169,170,171,172,173,174,175,
309 176,177,178,179,180,181,182,183,
310 184,185,186,187,188,189,190,191,
311 192,193,194,195,196,197,198,199,
312 200,201,202,203,204,205,206,207,
313 208,209,210,211,212,213,214,215,
314 216,217,218,219,220,221,222,223,
315 224,225,226,227,228,229,230,231,
316 232,233,234,235,236,237,238,239,
317 240,241,242,243,244,245,246,247,
318 248,249,250,251,252,253,254,255,
319
320 /* This table contains bit maps for various character classes. Each map is 32
321 bytes long and the bits run from the least significant end of each byte. The
322 classes that have their own maps are: space, xdigit, digit, upper, lower, word,
323 graph, print, punct, and cntrl. Other classes are built from combinations. */
324
325 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, /* space: 9-13 and 32 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
329
330 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* xdigit: 0-9, A-F, a-f */
331 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
334
335 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* digit: 0-9 */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
339
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* upper: A-Z */
341 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
344
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* lower: a-z */
346 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
349
350 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* word: 0-9, A-Z, '_', a-z */
351 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
354
355 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, /* graph: 33-126 */
356 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
359
360 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, /* print: 32-126 */
361 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
364
365 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, /* punct: 33-47, 58-64, 91-96, 123-126 */
366 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
369
370 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, /* cntrl: 0-31 and 127 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
374
375 /* This table identifies various classes of character by individual bits:
376 0x01 white space character
377 0x02 letter
378 0x04 decimal digit
379 0x08 hexadecimal digit
380 0x10 alphanumeric or '_'
381 0x80 regular expression metacharacter or binary zero
382 */
383
384 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
385 0x00,0x01,0x01,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
388 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
389 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
390 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
391 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
392 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
393 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
394 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
395 0x12,0x12,0x12,0x80,0x80,0x00,0x80,0x10, /* X - _ */
396 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
397 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
398 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
399 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
406 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
408 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
414 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
415 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
416
417 /* This is a set of tables that came originally from a Windows user. It seems to
418 be at least an approximation of ISO 8859. In particular, there are characters
419 greater than 128 that are marked as spaces, letters, etc. */
420
421 static const unsigned char tables1[] = {
422 0,1,2,3,4,5,6,7, /* Lower casing table (256 entries): 65-90 map to 97-122, and 192-222 map to 224-254 except 215 */
423 8,9,10,11,12,13,14,15,
424 16,17,18,19,20,21,22,23,
425 24,25,26,27,28,29,30,31,
426 32,33,34,35,36,37,38,39,
427 40,41,42,43,44,45,46,47,
428 48,49,50,51,52,53,54,55,
429 56,57,58,59,60,61,62,63,
430 64,97,98,99,100,101,102,103,
431 104,105,106,107,108,109,110,111,
432 112,113,114,115,116,117,118,119,
433 120,121,122,91,92,93,94,95,
434 96,97,98,99,100,101,102,103,
435 104,105,106,107,108,109,110,111,
436 112,113,114,115,116,117,118,119,
437 120,121,122,123,124,125,126,127,
438 128,129,130,131,132,133,134,135,
439 136,137,138,139,140,141,142,143,
440 144,145,146,147,148,149,150,151,
441 152,153,154,155,156,157,158,159,
442 160,161,162,163,164,165,166,167,
443 168,169,170,171,172,173,174,175,
444 176,177,178,179,180,181,182,183,
445 184,185,186,187,188,189,190,191,
446 224,225,226,227,228,229,230,231,
447 232,233,234,235,236,237,238,239,
448 240,241,242,243,244,245,246,215,
449 248,249,250,251,252,253,254,223,
450 224,225,226,227,228,229,230,231,
451 232,233,234,235,236,237,238,239,
452 240,241,242,243,244,245,246,247,
453 248,249,250,251,252,253,254,255,
454 0,1,2,3,4,5,6,7, /* Case flipping table (256 entries): as above, plus lower-case codes mapped back to upper case */
455 8,9,10,11,12,13,14,15,
456 16,17,18,19,20,21,22,23,
457 24,25,26,27,28,29,30,31,
458 32,33,34,35,36,37,38,39,
459 40,41,42,43,44,45,46,47,
460 48,49,50,51,52,53,54,55,
461 56,57,58,59,60,61,62,63,
462 64,97,98,99,100,101,102,103,
463 104,105,106,107,108,109,110,111,
464 112,113,114,115,116,117,118,119,
465 120,121,122,91,92,93,94,95,
466 96,65,66,67,68,69,70,71,
467 72,73,74,75,76,77,78,79,
468 80,81,82,83,84,85,86,87,
469 88,89,90,123,124,125,126,127,
470 128,129,130,131,132,133,134,135,
471 136,137,138,139,140,141,142,143,
472 144,145,146,147,148,149,150,151,
473 152,153,154,155,156,157,158,159,
474 160,161,162,163,164,165,166,167,
475 168,169,170,171,172,173,174,175,
476 176,177,178,179,180,181,182,183,
477 184,185,186,187,188,189,190,191,
478 224,225,226,227,228,229,230,231,
479 232,233,234,235,236,237,238,239,
480 240,241,242,243,244,245,246,215,
481 248,249,250,251,252,253,254,223,
482 192,193,194,195,196,197,198,199,
483 200,201,202,203,204,205,206,207,
484 208,209,210,211,212,213,214,247,
485 216,217,218,219,220,221,222,255,
486 0,62,0,0,1,0,0,0, /* Character class bit maps, ten maps of 32 bytes each, least significant bit first. This one: space */
487 0,0,0,0,0,0,0,0,
488 32,0,0,0,1,0,0,0,
489 0,0,0,0,0,0,0,0,
490 0,0,0,0,0,0,255,3, /* xdigit */
491 126,0,0,0,126,0,0,0,
492 0,0,0,0,0,0,0,0,
493 0,0,0,0,0,0,0,0,
494 0,0,0,0,0,0,255,3, /* digit (also sets 178, 179 and 185) */
495 0,0,0,0,0,0,0,0,
496 0,0,0,0,0,0,12,2,
497 0,0,0,0,0,0,0,0,
498 0,0,0,0,0,0,0,0, /* upper */
499 254,255,255,7,0,0,0,0,
500 0,0,0,0,0,0,0,0,
501 255,255,127,127,0,0,0,0,
502 0,0,0,0,0,0,0,0, /* lower */
503 0,0,0,0,254,255,255,7,
504 0,0,0,0,0,4,32,4,
505 0,0,0,128,255,255,127,255,
506 0,0,0,0,0,0,255,3, /* word */
507 254,255,255,135,254,255,255,7,
508 0,0,0,0,0,4,44,6,
509 255,255,127,255,255,255,127,255,
510 0,0,0,0,254,255,255,255, /* graph */
511 255,255,255,255,255,255,255,127,
512 0,0,0,0,254,255,255,255,
513 255,255,255,255,255,255,255,255,
514 0,2,0,0,255,255,255,255, /* print */
515 255,255,255,255,255,255,255,127,
516 0,0,0,0,255,255,255,255,
517 255,255,255,255,255,255,255,255,
518 0,0,0,0,254,255,0,252, /* punct */
519 1,0,0,248,1,0,0,120,
520 0,0,0,0,254,255,255,255,
521 0,0,128,0,0,0,128,0,
522 255,255,255,255,0,0,0,0, /* cntrl */
523 0,0,0,0,0,0,0,128,
524 255,255,255,255,0,0,0,0,
525 0,0,0,0,0,0,0,0,
526 128,0,0,0,0,0,0,0, /* Character type flags (256 entries): 1 space, 2 letter, 4 digit, 8 hex digit, 16 alphanumeric or '_', 128 metacharacter or zero */
527 0,1,1,0,1,1,0,0,
528 0,0,0,0,0,0,0,0,
529 0,0,0,0,0,0,0,0,
530 1,0,0,0,128,0,0,0,
531 128,128,128,128,0,0,128,0,
532 28,28,28,28,28,28,28,28,
533 28,28,0,0,0,0,0,128,
534 0,26,26,26,26,26,26,18,
535 18,18,18,18,18,18,18,18,
536 18,18,18,18,18,18,18,18,
537 18,18,18,128,128,0,128,16,
538 0,26,26,26,26,26,26,18,
539 18,18,18,18,18,18,18,18,
540 18,18,18,18,18,18,18,18,
541 18,18,18,128,128,0,0,0,
542 0,0,0,0,0,1,0,0,
543 0,0,0,0,0,0,0,0,
544 0,0,0,0,0,0,0,0,
545 0,0,0,0,0,0,0,0,
546 1,0,0,0,0,0,0,0,
547 0,0,18,0,0,0,0,0,
548 0,0,20,20,0,18,0,0,
549 0,20,18,0,0,0,0,0,
550 18,18,18,18,18,18,18,18,
551 18,18,18,18,18,18,18,18,
552 18,18,18,18,18,18,18,0,
553 18,18,18,18,18,18,18,18,
554 18,18,18,18,18,18,18,18,
555 18,18,18,18,18,18,18,18,
556 18,18,18,18,18,18,18,0,
557 18,18,18,18,18,18,18,18
558 };
559
560
561
562
#ifndef HAVE_STRERROR
/*************************************************
*     Provide strerror() for non-ANSI libraries  *
*************************************************/

/* Fallback for C libraries that lack strerror() (SunOS4 is one example). The
message text is taken from the sys_errlist[] table, whose number of entries is
given by sys_nerr.

Argument:
  n           the error number to describe

Returns:      the sys_errlist[] entry for n, or a fixed "unknown" message when
              n lies outside the range 0 .. sys_nerr-1
*/

extern int sys_nerr;
extern char *sys_errlist[];

char *
strerror(int n)
{
  int in_range = (n >= 0 && n < sys_nerr);
  return in_range ? sys_errlist[n] : "unknown error number";
}
#endif  /* HAVE_STRERROR */
582
583
584 /*************************************************
585 * JIT memory callback *
586 *************************************************/
587
588 static pcre_jit_stack* jit_callback(void *arg)
589 {
590 return (pcre_jit_stack *)arg;
591 }
592
593
594 /*************************************************
595 * Read or extend an input line *
596 *************************************************/
597
598 /* Input lines are read into buffer, but both patterns and data lines can be
599 continued over multiple input lines. In addition, if the buffer fills up, we
600 want to automatically expand it so as to be able to handle extremely large
601 lines that are needed for certain stress tests. When the input buffer is
602 expanded, the other two buffers must also be expanded likewise, and the
603 contents of pbuffer, which are a copy of the input for callouts, must be
604 preserved (for when expansion happens for a data line). This is not the most
605 optimal way of handling this, but hey, this is just a test program!
606
607 Arguments:
608 f the file to read
609 start where in buffer to start (this *must* be within buffer)
610 prompt for stdin or readline()
611
612 Returns: pointer to the start of new data
613 could be a copy of start, or could be moved
614 NULL if no data read and EOF reached
615 */
616
617 static pcre_uint8 *
618 extend_inputline(FILE *f, pcre_uint8 *start, const char *prompt)
619 {
620 pcre_uint8 *here = start;
621
622 for (;;)
623 {
624 int rlen = (int)(buffer_size - (here - buffer));
625
626 if (rlen > 1000)
627 {
628 int dlen;
629
630 /* If libreadline support is required, use readline() to read a line if the
631 input is a terminal. Note that readline() removes the trailing newline, so
632 we must put it back again, to be compatible with fgets(). */
633
634 #ifdef SUPPORT_LIBREADLINE
635 if (isatty(fileno(f)))
636 {
637 size_t len;
638 char *s = readline(prompt);
639 if (s == NULL) return (here == start)? NULL : start;
640 len = strlen(s);
641 if (len > 0) add_history(s);
642 if (len > rlen - 1) len = rlen - 1;
643 memcpy(here, s, len);
644 here[len] = '\n';
645 here[len+1] = 0;
646 free(s);
647 }
648 else
649 #endif
650
651 /* Read the next line by normal means, prompting if the file is stdin. */
652
653 {
654 if (f == stdin) printf("%s", prompt);
655 if (fgets((char *)here, rlen, f) == NULL)
656 return (here == start)? NULL : start;
657 }
658
659 dlen = (int)strlen((char *)here);
660 if (dlen > 0 && here[dlen - 1] == '\n') return start;
661 here += dlen;
662 }
663
664 else
665 {
666 int new_buffer_size = 2*buffer_size;
667 pcre_uint8 *new_buffer = (unsigned char *)malloc(new_buffer_size);
668 pcre_uint8 *new_dbuffer = (unsigned char *)malloc(new_buffer_size);
669 pcre_uint8 *new_pbuffer = (unsigned char *)malloc(new_buffer_size);
670
671 if (new_buffer == NULL || new_dbuffer == NULL || new_pbuffer == NULL)
672 {
673 fprintf(stderr, "pcretest: malloc(%d) failed\n", new_buffer_size);
674 exit(1);
675 }
676
677 memcpy(new_buffer, buffer, buffer_size);
678 memcpy(new_pbuffer, pbuffer, buffer_size);
679
680 buffer_size = new_buffer_size;
681
682 start = new_buffer + (start - buffer);
683 here = new_buffer + (here - buffer);
684
685 free(buffer);
686 free(dbuffer);
687 free(pbuffer);
688
689 buffer = new_buffer;
690 dbuffer = new_dbuffer;
691 pbuffer = new_pbuffer;
692 }
693 }
694
695 return NULL; /* Control never gets here */
696 }
697
698
699
700
701
702
703
704 /*************************************************
705 * Read number from string *
706 *************************************************/
707
/* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
around with conditional compilation, just do the job by hand. It is only used
for unpicking arguments, so just keep it simple.

Leading white space is skipped, then a run of decimal digits is converted. If
the digits describe a number larger than INT_MAX, the result sticks at INT_MAX
(rather than overflowing, which is undefined for a signed int), but all the
digits are still consumed.

Arguments:
  str         string to be converted
  endptr      where to put the end pointer (first character not consumed)

Returns:      the value as a non-negative int; 0 if there are no digits
*/

static int
get_value(unsigned char *str, unsigned char **endptr)
{
  int result = 0;

  while (*str != 0 && isspace(*str)) str++;

  for (; isdigit(*str); str++)
  {
    int digit = *str - '0';

    /* result * 10 + digit fits in an int only if result <= (INT_MAX-digit)/10 */

    if (result > (INT_MAX - digit) / 10)
      result = INT_MAX;
    else
      result = result * 10 + digit;
  }

  *endptr = str;
  return result;
}
728
729
730
731
732 /*************************************************
733 * Convert UTF-8 string to value *
734 *************************************************/
735
736 /* This function takes one or more bytes that represents a UTF-8 character,
737 and returns the value of the character.
738
739 Argument:
740 utf8bytes a pointer to the byte vector
741 vptr a pointer to an int to receive the value
742
743 Returns: > 0 => the number of bytes consumed
744 -6 to 0 => malformed UTF-8 character at offset = (-return)
745 */
746
747 #if !defined NOUTF8
748
749 static int
750 utf82ord(unsigned char *utf8bytes, int *vptr)
751 {
752 int c = *utf8bytes++;
753 int d = c;
754 int i, j, s;
755
756 for (i = -1; i < 6; i++) /* i is number of additional bytes */
757 {
758 if ((d & 0x80) == 0) break;
759 d <<= 1;
760 }
761
762 if (i == -1) { *vptr = c; return 1; } /* ascii character */
763 if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
764
765 /* i now has a value in the range 1-5 */
766
767 s = 6*i;
768 d = (c & utf8_table3[i]) << s;
769
770 for (j = 0; j < i; j++)
771 {
772 c = *utf8bytes++;
773 if ((c & 0xc0) != 0x80) return -(j+1);
774 s -= 6;
775 d |= (c & 0x3f) << s;
776 }
777
778 /* Check that encoding was the correct unique one */
779
780 for (j = 0; j < utf8_table1_size; j++)
781 if (d <= utf8_table1[j]) break;
782 if (j != i) return -(i+1);
783
784 /* Valid value */
785
786 *vptr = d;
787 return i+1;
788 }
789
790 #endif
791
792
793
794 /*************************************************
795 * Convert character value to UTF-8 *
796 *************************************************/
797
798 /* This function takes an integer value in the range 0 - 0x7fffffff
799 and encodes it as a UTF-8 character in 0 to 6 bytes.
800
801 Arguments:
802 cvalue the character value
803 utf8bytes pointer to buffer for result - at least 6 bytes long
804
805 Returns: number of characters placed in the buffer
806 */
807
808 #if !defined NOUTF8
809
810 static int
811 ord2utf8(int cvalue, pcre_uint8 *utf8bytes)
812 {
813 register int i, j;
814 for (i = 0; i < utf8_table1_size; i++)
815 if (cvalue <= utf8_table1[i]) break;
816 utf8bytes += i;
817 for (j = i; j > 0; j--)
818 {
819 *utf8bytes-- = 0x80 | (cvalue & 0x3f);
820 cvalue >>= 6;
821 }
822 *utf8bytes = utf8_table2[i] | cvalue;
823 return i + 1;
824 }
825
826 #endif
827
828
829
830 /*************************************************
831 * Print character string *
832 *************************************************/
833
834 /* Character string printing function. Must handle UTF-8 strings in utf8
835 mode. Yields number of characters printed. If handed a NULL file, just counts
836 chars without printing. */
837
838 static int pchars(unsigned char *p, int length, FILE *f)
839 {
840 int c = 0;
841 int yield = 0;
842
843 while (length-- > 0)
844 {
845 #if !defined NOUTF8
846 if (use_utf8)
847 {
848 int rc = utf82ord(p, &c);
849
850 if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
851 {
852 length -= rc - 1;
853 p += rc;
854 if (PRINTHEX(c))
855 {
856 if (f != NULL) fprintf(f, "%c", c);
857 yield++;
858 }
859 else
860 {
861 int n = 4;
862 if (f != NULL) fprintf(f, "\\x{%02x}", c);
863 yield += (n <= 0x000000ff)? 2 :
864 (n <= 0x00000fff)? 3 :
865 (n <= 0x0000ffff)? 4 :
866 (n <= 0x000fffff)? 5 : 6;
867 }
868 continue;
869 }
870 }
871 #endif
872
873 /* Not UTF-8, or malformed UTF-8 */
874
875 c = *p++;
876 if (PRINTHEX(c))
877 {
878 if (f != NULL) fprintf(f, "%c", c);
879 yield++;
880 }
881 else
882 {
883 if (f != NULL) fprintf(f, "\\x%02x", c);
884 yield += 4;
885 }
886 }
887
888 return yield;
889 }
890
891
892
893 /*************************************************
894 * Callout function *
895 *************************************************/
896
897 /* Called from PCRE as a result of the (?C) item. We print out where we are in
898 the match. Yield zero unless more callouts than the fail count, or the callout
899 data is not zero. */
900
901 static int callout(pcre_callout_block *cb)
902 {
903 FILE *f = (first_callout | callout_extra)? outfile : NULL;
904 int i, pre_start, post_start, subject_length;
905
906 if (callout_extra)
907 {
908 fprintf(f, "Callout %d: last capture = %d\n",
909 cb->callout_number, cb->capture_last);
910
911 for (i = 0; i < cb->capture_top * 2; i += 2)
912 {
913 if (cb->offset_vector[i] < 0)
914 fprintf(f, "%2d: <unset>\n", i/2);
915 else
916 {
917 fprintf(f, "%2d: ", i/2);
918 (void)pchars((unsigned char *)cb->subject + cb->offset_vector[i],
919 cb->offset_vector[i+1] - cb->offset_vector[i], f);
920 fprintf(f, "\n");
921 }
922 }
923 }
924
925 /* Re-print the subject in canonical form, the first time or if giving full
926 datails. On subsequent calls in the same match, we use pchars just to find the
927 printed lengths of the substrings. */
928
929 if (f != NULL) fprintf(f, "--->");
930
931 pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
932 post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
933 cb->current_position - cb->start_match, f);
934
935 subject_length = pchars((unsigned char *)cb->subject, cb->subject_length, NULL);
936
937 (void)pchars((unsigned char *)(cb->subject + cb->current_position),
938 cb->subject_length - cb->current_position, f);
939
940 if (f != NULL) fprintf(f, "\n");
941
942 /* Always print appropriate indicators, with callout number if not already
943 shown. For automatic callouts, show the pattern offset. */
944
945 if (cb->callout_number == 255)
946 {
947 fprintf(outfile, "%+3d ", cb->pattern_position);
948 if (cb->pattern_position > 99) fprintf(outfile, "\n ");
949 }
950 else
951 {
952 if (callout_extra) fprintf(outfile, " ");
953 else fprintf(outfile, "%3d ", cb->callout_number);
954 }
955
956 for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
957 fprintf(outfile, "^");
958
959 if (post_start > 0)
960 {
961 for (i = 0; i < post_start - 1; i++) fprintf(outfile, " ");
962 fprintf(outfile, "^");
963 }
964
965 for (i = 0; i < subject_length - pre_start - post_start + 4; i++)
966 fprintf(outfile, " ");
967
968 fprintf(outfile, "%.*s", (cb->next_item_length == 0)? 1 : cb->next_item_length,
969 pbuffer + cb->pattern_position);
970
971 fprintf(outfile, "\n");
972 first_callout = 0;
973
974 if (cb->mark != last_callout_mark)
975 {
976 fprintf(outfile, "Latest Mark: %s\n",
977 (cb->mark == NULL)? "<unset>" : (char *)(cb->mark));
978 last_callout_mark = cb->mark;
979 }
980
981 if (cb->callout_data != NULL)
982 {
983 int callout_data = *((int *)(cb->callout_data));
984 if (callout_data != 0)
985 {
986 fprintf(outfile, "Callout data = %d\n", callout_data);
987 return callout_data;
988 }
989 }
990
991 return (cb->callout_number != callout_fail_id)? 0 :
992 (++callout_count >= callout_fail_count)? 1 : 0;
993 }
994
995
996 /*************************************************
997 * Local malloc functions *
998 *************************************************/
999
1000 /* Alternative malloc function, to test functionality and save the size of a
1001 compiled re. The show_malloc variable is set only during matching. */
1002
1003 static void *new_malloc(size_t size)
1004 {
1005 void *block = malloc(size);
1006 gotten_store = size;
1007 if (show_malloc)
1008 fprintf(outfile, "malloc %3d %p\n", (int)size, block);
1009 return block;
1010 }
1011
1012 static void new_free(void *block)
1013 {
1014 if (show_malloc)
1015 fprintf(outfile, "free %p\n", block);
1016 free(block);
1017 }
1018
1019 /* For recursion malloc/free, to test stacking calls */
1020
1021 static void *stack_malloc(size_t size)
1022 {
1023 void *block = malloc(size);
1024 if (show_malloc)
1025 fprintf(outfile, "stack_malloc %3d %p\n", (int)size, block);
1026 return block;
1027 }
1028
1029 static void stack_free(void *block)
1030 {
1031 if (show_malloc)
1032 fprintf(outfile, "stack_free %p\n", block);
1033 free(block);
1034 }
1035
1036
1037 /*************************************************
1038 * Call pcre_fullinfo() *
1039 *************************************************/
1040
1041 /* Get one piece of information from the pcre_fullinfo() function */
1042
1043 static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
1044 {
1045 int rc;
1046 if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
1047 fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
1048 }
1049
1050
1051
1052 /*************************************************
1053 * Byte flipping function *
1054 *************************************************/
1055
/* Reverse the byte order of a 2-byte or 4-byte quantity.

Arguments:
  value     the quantity to flip; only its low-order 2 or 4 bytes are used
  n         width in bytes: 2 selects a 16-bit swap, any other value is
              treated as a 32-bit swap

Returns:    the flipped value
*/

static unsigned long int
byteflip(unsigned long int value, int n)
{
  unsigned long int byte0 = value & 0x000000ff;
  unsigned long int byte1 = value & 0x0000ff00;
  unsigned long int byte2;
  unsigned long int byte3;

  if (n == 2)
    {
    return (byte0 << 8) | (byte1 >> 8);
    }

  byte2 = value & 0x00ff0000;
  byte3 = value & 0xff000000;

  return (byte0 << 24) | (byte1 << 8) | (byte2 >> 8) | (byte3 >> 24);
}
1065
1066
1067
1068
1069 /*************************************************
1070 * Check match or recursion limit *
1071 *************************************************/
1072
1073 static int
1074 check_match_limit(pcre *re, pcre_extra *extra, pcre_uint8 *bptr, int len,
1075 int start_offset, int options, int *use_offsets, int use_size_offsets,
1076 int flag, unsigned long int *limit, int errnumber, const char *msg)
1077 {
1078 int count;
1079 int min = 0;
1080 int mid = 64;
1081 int max = -1;
1082
1083 extra->flags |= flag;
1084
1085 for (;;)
1086 {
1087 *limit = mid;
1088
1089 count = pcre_exec(re, extra, (char *)bptr, len, start_offset, options,
1090 use_offsets, use_size_offsets);
1091
1092 if (count == errnumber)
1093 {
1094 /* fprintf(outfile, "Testing %s limit = %d\n", msg, mid); */
1095 min = mid;
1096 mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
1097 }
1098
1099 else if (count >= 0 || count == PCRE_ERROR_NOMATCH ||
1100 count == PCRE_ERROR_PARTIAL)
1101 {
1102 if (mid == min + 1)
1103 {
1104 fprintf(outfile, "Minimum %s limit = %d\n", msg, mid);
1105 break;
1106 }
1107 /* fprintf(outfile, "Testing %s limit = %d\n", msg, mid); */
1108 max = mid;
1109 mid = (min + mid)/2;
1110 }
1111 else break; /* Some other error */
1112 }
1113
1114 extra->flags &= ~flag;
1115 return count;
1116 }
1117
1118
1119
1120 /*************************************************
1121 * Case-independent strncmp() function *
1122 *************************************************/
1123
1124 /*
1125 Arguments:
1126 s first string
1127 t second string
1128 n number of characters to compare
1129
1130 Returns: < 0, = 0, or > 0, according to the comparison
1131 */
1132
1133 static int
1134 strncmpic(pcre_uint8 *s, pcre_uint8 *t, int n)
1135 {
1136 while (n--)
1137 {
1138 int c = tolower(*s++) - tolower(*t++);
1139 if (c) return c;
1140 }
1141 return 0;
1142 }
1143
1144
1145
1146 /*************************************************
1147 * Check newline indicator *
1148 *************************************************/
1149
1150 /* This is used both at compile and run-time to check for <xxx> escapes. Print
1151 a message and return 0 if there is no match.
1152
1153 Arguments:
1154 p points after the leading '<'
1155 f file for error message
1156
1157 Returns: appropriate PCRE_NEWLINE_xxx flags, or 0
1158 */
1159
1160 static int
1161 check_newline(pcre_uint8 *p, FILE *f)
1162 {
1163 if (strncmpic(p, (pcre_uint8 *)"cr>", 3) == 0) return PCRE_NEWLINE_CR;
1164 if (strncmpic(p, (pcre_uint8 *)"lf>", 3) == 0) return PCRE_NEWLINE_LF;
1165 if (strncmpic(p, (pcre_uint8 *)"crlf>", 5) == 0) return PCRE_NEWLINE_CRLF;
1166 if (strncmpic(p, (pcre_uint8 *)"anycrlf>", 8) == 0) return PCRE_NEWLINE_ANYCRLF;
1167 if (strncmpic(p, (pcre_uint8 *)"any>", 4) == 0) return PCRE_NEWLINE_ANY;
1168 if (strncmpic(p, (pcre_uint8 *)"bsr_anycrlf>", 12) == 0) return PCRE_BSR_ANYCRLF;
1169 if (strncmpic(p, (pcre_uint8 *)"bsr_unicode>", 12) == 0) return PCRE_BSR_UNICODE;
1170 fprintf(f, "Unknown newline type at: <%s\n", p);
1171 return 0;
1172 }
1173
1174
1175
1176 /*************************************************
1177 * Usage function *
1178 *************************************************/
1179
/* Write the command-line help text to stdout. Which lines appear depends on
the build: the readline() remark is selected by SUPPORT_LIBREADLINE, and the
-dfa and -p options are listed only when NODFA and NOPOSIX, respectively, are
not defined. */

static void
usage(void)
{
  fputs("Usage: pcretest [options] [<input file> [<output file>]]\n\n"
        "Input and output default to stdin and stdout.\n", stdout);

#ifdef SUPPORT_LIBREADLINE
  fputs("If input is a terminal, readline() is used to read from it.\n",
        stdout);
#else
  fputs("This version of pcretest is not linked with readline().\n", stdout);
#endif

  fputs("\nOptions:\n"
        " -b show compiled code (bytecode)\n"
        " -C show PCRE compile-time options and exit\n"
        " -d debug: show compiled code and information (-b and -i)\n",
        stdout);

#if !defined NODFA
  fputs(" -dfa force DFA matching for all subjects\n", stdout);
#endif

  fputs(" -help show usage information\n"
        " -i show information about compiled patterns\n"
        " -M find MATCH_LIMIT minimum for each subject\n"
        " -m output memory used information\n"
        " -o <n> set size of offsets vector to <n>\n", stdout);

#if !defined NOPOSIX
  fputs(" -p use POSIX interface\n", stdout);
#endif

  fputs(" -q quiet: do not output PCRE version number at start\n"
        " -S <n> set stack size to <n> megabytes\n"
        " -s force each pattern to be studied at basic level\n"
        " -s+ force each pattern to be studied, using JIT if available\n",
        stdout);

  fputs(" -t time compilation and execution\n"
        " -t <n> time compilation and execution, repeating <n> times\n"
        " -tm time execution (matching) only\n"
        " -tm <n> time execution (matching) only, repeating <n> times\n",
        stdout);
}
1214
1215
1216
1217 /*************************************************
1218 * Main Program *
1219 *************************************************/
1220
1221 /* Read lines from named file or stdin and write to named file or stdout; lines
1222 consist of a regular expression, in delimiters and optionally followed by
1223 options, followed by a set of test data, terminated by an empty line. */
1224
1225 int main(int argc, char **argv)
1226 {
1227 FILE *infile = stdin;
1228 int options = 0;
1229 int study_options = 0;
1230 int default_find_match_limit = FALSE;
1231 int op = 1;
1232 int timeit = 0;
1233 int timeitm = 0;
1234 int showinfo = 0;
1235 int showstore = 0;
1236 int force_study = -1;
1237 int force_study_options = 0;
1238 int quiet = 0;
1239 int size_offsets = 45;
1240 int size_offsets_max;
1241 int *offsets = NULL;
1242 #if !defined NOPOSIX
1243 int posix = 0;
1244 #endif
1245 int debug = 0;
1246 int done = 0;
1247 int all_use_dfa = 0;
1248 int yield = 0;
1249 int stack_size;
1250
1251 pcre_jit_stack *jit_stack = NULL;
1252
1253
1254 /* These vectors store, end-to-end, a list of captured substring names. Assume
1255 that 1024 is plenty long enough for the few names we'll be testing. */
1256
1257 pcre_uchar copynames[1024];
1258 pcre_uchar getnames[1024];
1259
1260 pcre_uchar *copynamesptr;
1261 pcre_uchar *getnamesptr;
1262
1263 /* Get buffers from malloc() so that Electric Fence will check their misuse
1264 when I am debugging. They grow automatically when very long lines are read. */
1265
1266 buffer = (pcre_uint8 *)malloc(buffer_size);
1267 dbuffer = (pcre_uint8 *)malloc(buffer_size);
1268 pbuffer = (pcre_uint8 *)malloc(buffer_size);
1269
1270 /* The outfile variable is static so that new_malloc can use it. */
1271
1272 outfile = stdout;
1273
1274 /* The following _setmode() stuff is some Windows magic that tells its runtime
1275 library to translate CRLF into a single LF character. At least, that's what
1276 I've been told: never having used Windows I take this all on trust. Originally
1277 it set 0x8000, but then I was advised that _O_BINARY was better. */
1278
1279 #if defined(_WIN32) || defined(WIN32)
1280 _setmode( _fileno( stdout ), _O_BINARY );
1281 #endif
1282
1283 /* Scan options */
1284
1285 while (argc > 1 && argv[op][0] == '-')
1286 {
1287 unsigned char *endptr;
1288
1289 if (strcmp(argv[op], "-m") == 0) showstore = 1;
1290 else if (strcmp(argv[op], "-s") == 0) force_study = 0;
1291 else if (strcmp(argv[op], "-s+") == 0)
1292 {
1293 force_study = 1;
1294 force_study_options = PCRE_STUDY_JIT_COMPILE;
1295 }
1296 else if (strcmp(argv[op], "-q") == 0) quiet = 1;
1297 else if (strcmp(argv[op], "-b") == 0) debug = 1;
1298 else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
1299 else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
1300 else if (strcmp(argv[op], "-M") == 0) default_find_match_limit = TRUE;
1301 #if !defined NODFA
1302 else if (strcmp(argv[op], "-dfa") == 0) all_use_dfa = 1;
1303 #endif
1304 else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
1305 ((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)),
1306 *endptr == 0))
1307 {
1308 op++;
1309 argc--;
1310 }
1311 else if (strcmp(argv[op], "-t") == 0 || strcmp(argv[op], "-tm") == 0)
1312 {
1313 int both = argv[op][2] == 0;
1314 int temp;
1315 if (argc > 2 && (temp = get_value((unsigned char *)argv[op+1], &endptr),
1316 *endptr == 0))
1317 {
1318 timeitm = temp;
1319 op++;
1320 argc--;
1321 }
1322 else timeitm = LOOPREPEAT;
1323 if (both) timeit = timeitm;
1324 }
1325 else if (strcmp(argv[op], "-S") == 0 && argc > 2 &&
1326 ((stack_size = get_value((unsigned char *)argv[op+1], &endptr)),
1327 *endptr == 0))
1328 {
1329 #if defined(_WIN32) || defined(WIN32) || defined(__minix)
1330 printf("PCRE: -S not supported on this OS\n");
1331 exit(1);
1332 #else
1333 int rc;
1334 struct rlimit rlim;
1335 getrlimit(RLIMIT_STACK, &rlim);
1336 rlim.rlim_cur = stack_size * 1024 * 1024;
1337 rc = setrlimit(RLIMIT_STACK, &rlim);
1338 if (rc != 0)
1339 {
1340 printf("PCRE: setrlimit() failed with error %d\n", rc);
1341 exit(1);
1342 }
1343 op++;
1344 argc--;
1345 #endif
1346 }
1347 #if !defined NOPOSIX
1348 else if (strcmp(argv[op], "-p") == 0) posix = 1;
1349 #endif
1350 else if (strcmp(argv[op], "-C") == 0)
1351 {
1352 int rc;
1353 unsigned long int lrc;
1354 printf("PCRE version %s\n", pcre_version());
1355 printf("Compiled with\n");
1356 (void)pcre_config(PCRE_CONFIG_UTF8, &rc);
1357 printf(" %sUTF-8 support\n", rc? "" : "No ");
1358 (void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc);
1359 printf(" %sUnicode properties support\n", rc? "" : "No ");
1360 (void)pcre_config(PCRE_CONFIG_JIT, &rc);
1361 if (rc)
1362 printf(" Just-in-time compiler support\n");
1363 else
1364 printf(" No just-in-time compiler support\n");
1365 (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
1366 /* Note that these values are always the ASCII values, even
1367 in EBCDIC environments. CR is 13 and NL is 10. */
1368 printf(" Newline sequence is %s\n", (rc == 13)? "CR" :
1369 (rc == 10)? "LF" : (rc == (13<<8 | 10))? "CRLF" :
1370 (rc == -2)? "ANYCRLF" :
1371 (rc == -1)? "ANY" : "???");
1372 (void)pcre_config(PCRE_CONFIG_BSR, &rc);
1373 printf(" \\R matches %s\n", rc? "CR, LF, or CRLF only" :
1374 "all Unicode newlines");
1375 (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
1376 printf(" Internal link size = %d\n", rc);
1377 (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
1378 printf(" POSIX malloc threshold = %d\n", rc);
1379 (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT, &lrc);
1380 printf(" Default match limit = %ld\n", lrc);
1381 (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT_RECURSION, &lrc);
1382 printf(" Default recursion depth limit = %ld\n", lrc);
1383 (void)pcre_config(PCRE_CONFIG_STACKRECURSE, &rc);
1384 printf(" Match recursion uses %s\n", rc? "stack" : "heap");
1385 goto EXIT;
1386 }
1387 else if (strcmp(argv[op], "-help") == 0 ||
1388 strcmp(argv[op], "--help") == 0)
1389 {
1390 usage();
1391 goto EXIT;
1392 }
1393 else
1394 {
1395 printf("** Unknown or malformed option %s\n", argv[op]);
1396 usage();
1397 yield = 1;
1398 goto EXIT;
1399 }
1400 op++;
1401 argc--;
1402 }
1403
1404 /* Get the store for the offsets vector, and remember what it was */
1405
1406 size_offsets_max = size_offsets;
1407 offsets = (int *)malloc(size_offsets_max * sizeof(int));
1408 if (offsets == NULL)
1409 {
1410 printf("** Failed to get %d bytes of memory for offsets vector\n",
1411 (int)(size_offsets_max * sizeof(int)));
1412 yield = 1;
1413 goto EXIT;
1414 }
1415
1416 /* Sort out the input and output files */
1417
1418 if (argc > 1)
1419 {
1420 infile = fopen(argv[op], INPUT_MODE);
1421 if (infile == NULL)
1422 {
1423 printf("** Failed to open %s\n", argv[op]);
1424 yield = 1;
1425 goto EXIT;
1426 }
1427 }
1428
1429 if (argc > 2)
1430 {
1431 outfile = fopen(argv[op+1], OUTPUT_MODE);
1432 if (outfile == NULL)
1433 {
1434 printf("** Failed to open %s\n", argv[op+1]);
1435 yield = 1;
1436 goto EXIT;
1437 }
1438 }
1439
1440 /* Set alternative malloc function */
1441
1442 pcre_malloc = new_malloc;
1443 pcre_free = new_free;
1444 pcre_stack_malloc = stack_malloc;
1445 pcre_stack_free = stack_free;
1446
1447 /* Heading line unless quiet, then prompt for first regex if stdin */
1448
1449 if (!quiet) fprintf(outfile, "PCRE version %s\n\n", pcre_version());
1450
1451 /* Main loop */
1452
1453 while (!done)
1454 {
1455 pcre *re = NULL;
1456 pcre_extra *extra = NULL;
1457
1458 #if !defined NOPOSIX /* There are still compilers that require no indent */
1459 regex_t preg;
1460 int do_posix = 0;
1461 #endif
1462
1463 const char *error;
1464 unsigned char *markptr;
1465 unsigned char *p, *pp, *ppp;
1466 unsigned char *to_file = NULL;
1467 const unsigned char *tables = NULL;
1468 unsigned long int true_size, true_study_size = 0;
1469 size_t size, regex_gotten_store;
1470 int do_allcaps = 0;
1471 int do_mark = 0;
1472 int do_study = 0;
1473 int no_force_study = 0;
1474 int do_debug = debug;
1475 int do_G = 0;
1476 int do_g = 0;
1477 int do_showinfo = showinfo;
1478 int do_showrest = 0;
1479 int do_showcaprest = 0;
1480 int do_flip = 0;
1481 int erroroffset, len, delimiter, poffset;
1482
1483 use_utf8 = 0;
1484 debug_lengths = 1;
1485
1486 if (extend_inputline(infile, buffer, " re> ") == NULL) break;
1487 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
1488 fflush(outfile);
1489
1490 p = buffer;
1491 while (isspace(*p)) p++;
1492 if (*p == 0) continue;
1493
1494 /* See if the pattern is to be loaded pre-compiled from a file. */
1495
1496 if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
1497 {
1498 unsigned long int magic, get_options;
1499 pcre_uint8 sbuf[8];
1500 FILE *f;
1501
1502 p++;
1503 pp = p + (int)strlen((char *)p);
1504 while (isspace(pp[-1])) pp--;
1505 *pp = 0;
1506
1507 f = fopen((char *)p, "rb");
1508 if (f == NULL)
1509 {
1510 fprintf(outfile, "Failed to open %s: %s\n", p, strerror(errno));
1511 continue;
1512 }
1513
1514 if (fread(sbuf, 1, 8, f) != 8) goto FAIL_READ;
1515
1516 true_size =
1517 (sbuf[0] << 24) | (sbuf[1] << 16) | (sbuf[2] << 8) | sbuf[3];
1518 true_study_size =
1519 (sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
1520
1521 re = (real_pcre *)new_malloc(true_size);
1522 regex_gotten_store = gotten_store;
1523
1524 if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
1525
1526 magic = ((real_pcre *)re)->magic_number;
1527 if (magic != MAGIC_NUMBER)
1528 {
1529 if (byteflip(magic, sizeof(magic)) == MAGIC_NUMBER)
1530 {
1531 do_flip = 1;
1532 }
1533 else
1534 {
1535 fprintf(outfile, "Data in %s is not a compiled PCRE regex\n", p);
1536 fclose(f);
1537 continue;
1538 }
1539 }
1540
1541 fprintf(outfile, "Compiled pattern%s loaded from %s\n",
1542 do_flip? " (byte-inverted)" : "", p);
1543
1544 /* Need to know if UTF-8 for printing data strings */
1545
1546 new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
1547 use_utf8 = (get_options & PCRE_UTF8) != 0;
1548
1549 /* Now see if there is any following study data. */
1550
1551 if (true_study_size != 0)
1552 {
1553 pcre_study_data *psd;
1554
1555 extra = (pcre_extra *)new_malloc(sizeof(pcre_extra) + true_study_size);
1556 extra->flags = PCRE_EXTRA_STUDY_DATA;
1557
1558 psd = (pcre_study_data *)(((char *)extra) + sizeof(pcre_extra));
1559 extra->study_data = psd;
1560
1561 if (fread(psd, 1, true_study_size, f) != true_study_size)
1562 {
1563 FAIL_READ:
1564 fprintf(outfile, "Failed to read data from %s\n", p);
1565 if (extra != NULL) pcre_free_study(extra);
1566 if (re != NULL) new_free(re);
1567 fclose(f);
1568 continue;
1569 }
1570 fprintf(outfile, "Study data loaded from %s\n", p);
1571 do_study = 1; /* To get the data output if requested */
1572 }
1573 else fprintf(outfile, "No study data\n");
1574
1575 fclose(f);
1576 goto SHOW_INFO;
1577 }
1578
1579 /* In-line pattern (the usual case). Get the delimiter and seek the end of
1580 the pattern; if it isn't complete, read more. */
1581
1582 delimiter = *p++;
1583
1584 if (isalnum(delimiter) || delimiter == '\\')
1585 {
1586 fprintf(outfile, "** Delimiter must not be alphanumeric or \\\n");
1587 goto SKIP_DATA;
1588 }
1589
1590 pp = p;
1591 poffset = (int)(p - buffer);
1592
1593 for(;;)
1594 {
1595 while (*pp != 0)
1596 {
1597 if (*pp == '\\' && pp[1] != 0) pp++;
1598 else if (*pp == delimiter) break;
1599 pp++;
1600 }
1601 if (*pp != 0) break;
1602 if ((pp = extend_inputline(infile, pp, " > ")) == NULL)
1603 {
1604 fprintf(outfile, "** Unexpected EOF\n");
1605 done = 1;
1606 goto CONTINUE;
1607 }
1608 if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
1609 }
1610
1611 /* The buffer may have moved while being extended; reset the start of data
1612 pointer to the correct relative point in the buffer. */
1613
1614 p = buffer + poffset;
1615
1616 /* If the first character after the delimiter is backslash, make
1617 the pattern end with backslash. This is purely to provide a way
1618 of testing for the error message when a pattern ends with backslash. */
1619
1620 if (pp[1] == '\\') *pp++ = '\\';
1621
1622 /* Terminate the pattern at the delimiter, and save a copy of the pattern
1623 for callouts. */
1624
1625 *pp++ = 0;
1626 strcpy((char *)pbuffer, (char *)p);
1627
1628 /* Look for options after final delimiter */
1629
1630 options = 0;
1631 log_store = showstore; /* default from command line */
1632
1633 while (*pp != 0)
1634 {
1635 switch (*pp++)
1636 {
1637 case 'f': options |= PCRE_FIRSTLINE; break;
1638 case 'g': do_g = 1; break;
1639 case 'i': options |= PCRE_CASELESS; break;
1640 case 'm': options |= PCRE_MULTILINE; break;
1641 case 's': options |= PCRE_DOTALL; break;
1642 case 'x': options |= PCRE_EXTENDED; break;
1643
1644 case '+':
1645 if (do_showrest) do_showcaprest = 1; else do_showrest = 1;
1646 break;
1647
1648 case '=': do_allcaps = 1; break;
1649 case 'A': options |= PCRE_ANCHORED; break;
1650 case 'B': do_debug = 1; break;
1651 case 'C': options |= PCRE_AUTO_CALLOUT; break;
1652 case 'D': do_debug = do_showinfo = 1; break;
1653 case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
1654 case 'F': do_flip = 1; break;
1655 case 'G': do_G = 1; break;
1656 case 'I': do_showinfo = 1; break;
1657 case 'J': options |= PCRE_DUPNAMES; break;
1658 case 'K': do_mark = 1; break;
1659 case 'M': log_store = 1; break;
1660 case 'N': options |= PCRE_NO_AUTO_CAPTURE; break;
1661
1662 #if !defined NOPOSIX
1663 case 'P': do_posix = 1; break;
1664 #endif
1665
1666 case 'S':
1667 if (do_study == 0)
1668 {
1669 do_study = 1;
1670 if (*pp == '+')
1671 {
1672 study_options |= PCRE_STUDY_JIT_COMPILE;
1673 pp++;
1674 }
1675 }
1676 else
1677 {
1678 do_study = 0;
1679 no_force_study = 1;
1680 }
1681 break;
1682
1683 case 'U': options |= PCRE_UNGREEDY; break;
1684 case 'W': options |= PCRE_UCP; break;
1685 case 'X': options |= PCRE_EXTRA; break;
1686 case 'Y': options |= PCRE_NO_START_OPTIMISE; break;
1687 case 'Z': debug_lengths = 0; break;
1688 case '8': options |= PCRE_UTF8; use_utf8 = 1; break;
1689 case '?': options |= PCRE_NO_UTF8_CHECK; break;
1690
1691 case 'T':
1692 switch (*pp++)
1693 {
1694 case '0': tables = tables0; break;
1695 case '1': tables = tables1; break;
1696
1697 case '\r':
1698 case '\n':
1699 case ' ':
1700 case 0:
1701 fprintf(outfile, "** Missing table number after /T\n");
1702 goto SKIP_DATA;
1703
1704 default:
1705 fprintf(outfile, "** Bad table number \"%c\" after /T\n", pp[-1]);
1706 goto SKIP_DATA;
1707 }
1708 break;
1709
1710 case 'L':
1711 ppp = pp;
1712 /* The '\r' test here is so that it works on Windows. */
1713 /* The '0' test is just in case this is an unterminated line. */
1714 while (*ppp != 0 && *ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++;
1715 *ppp = 0;
1716 if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
1717 {
1718 fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
1719 goto SKIP_DATA;
1720 }
1721 locale_set = 1;
1722 tables = pcre_maketables();
1723 pp = ppp;
1724 break;
1725
1726 case '>':
1727 to_file = pp;
1728 while (*pp != 0) pp++;
1729 while (isspace(pp[-1])) pp--;
1730 *pp = 0;
1731 break;
1732
1733 case '<':
1734 {
1735 if (strncmpic(pp, (pcre_uint8 *)"JS>", 3) == 0)
1736 {
1737 options |= PCRE_JAVASCRIPT_COMPAT;
1738 pp += 3;
1739 }
1740 else
1741 {
1742 int x = check_newline(pp, outfile);
1743 if (x == 0) goto SKIP_DATA;
1744 options |= x;
1745 while (*pp++ != '>');
1746 }
1747 }
1748 break;
1749
1750 case '\r': /* So that it works in Windows */
1751 case '\n':
1752 case ' ':
1753 break;
1754
1755 default:
1756 fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
1757 goto SKIP_DATA;
1758 }
1759 }
1760
1761 /* Handle compiling via the POSIX interface, which doesn't support the
1762 timing, showing, or debugging options, nor the ability to pass over
1763 local character tables. */
1764
1765 #if !defined NOPOSIX
1766 if (posix || do_posix)
1767 {
1768 int rc;
1769 int cflags = 0;
1770
1771 if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
1772 if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
1773 if ((options & PCRE_DOTALL) != 0) cflags |= REG_DOTALL;
1774 if ((options & PCRE_NO_AUTO_CAPTURE) != 0) cflags |= REG_NOSUB;
1775 if ((options & PCRE_UTF8) != 0) cflags |= REG_UTF8;
1776 if ((options & PCRE_UCP) != 0) cflags |= REG_UCP;
1777 if ((options & PCRE_UNGREEDY) != 0) cflags |= REG_UNGREEDY;
1778
1779 rc = regcomp(&preg, (char *)p, cflags);
1780
1781 /* Compilation failed; go back for another re, skipping to blank line
1782 if non-interactive. */
1783
1784 if (rc != 0)
1785 {
1786 (void)regerror(rc, &preg, (char *)buffer, buffer_size);
1787 fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
1788 goto SKIP_DATA;
1789 }
1790 }
1791
1792 /* Handle compiling via the native interface */
1793
1794 else
1795 #endif /* !defined NOPOSIX */
1796
1797 {
1798 unsigned long int get_options;
1799
1800 if (timeit > 0)
1801 {
1802 register int i;
1803 clock_t time_taken;
1804 clock_t start_time = clock();
1805 for (i = 0; i < timeit; i++)
1806 {
1807 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
1808 if (re != NULL) free(re);
1809 }
1810 time_taken = clock() - start_time;
1811 fprintf(outfile, "Compile time %.4f milliseconds\n",
1812 (((double)time_taken * 1000.0) / (double)timeit) /
1813 (double)CLOCKS_PER_SEC);
1814 }
1815
1816 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
1817
1818 /* Compilation failed; go back for another re, skipping to blank line
1819 if non-interactive. */
1820
1821 if (re == NULL)
1822 {
1823 fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
1824 SKIP_DATA:
1825 if (infile != stdin)
1826 {
1827 for (;;)
1828 {
1829 if (extend_inputline(infile, buffer, NULL) == NULL)
1830 {
1831 done = 1;
1832 goto CONTINUE;
1833 }
1834 len = (int)strlen((char *)buffer);
1835 while (len > 0 && isspace(buffer[len-1])) len--;
1836 if (len == 0) break;
1837 }
1838 fprintf(outfile, "\n");
1839 }
1840 goto CONTINUE;
1841 }
1842
1843 /* Compilation succeeded. It is now possible to set the UTF-8 option from
1844 within the regex; check for this so that we know how to process the data
1845 lines. */
1846
1847 new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
1848 if ((get_options & PCRE_UTF8) != 0) use_utf8 = 1;
1849
1850 /* Print information if required. There are now two info-returning
1851 functions. The old one has a limited interface and returns only limited
1852 data. Check that it agrees with the newer one. */
1853
1854 if (log_store)
1855 fprintf(outfile, "Memory allocation (code space): %d\n",
1856 (int)(gotten_store -
1857 sizeof(real_pcre) -
1858 ((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
1859
1860 /* Extract the size for possible writing before possibly flipping it,
1861 and remember the store that was got. */
1862
1863 true_size = ((real_pcre *)re)->size;
1864 regex_gotten_store = gotten_store;
1865
1866 /* If -s or /S was present, study the regex to generate additional info to
1867 help with the matching, unless the pattern has the SS option, which
1868 suppresses the effect of /S (used for a few test patterns where studying is
1869 never sensible). */
1870
1871 if (do_study || (force_study >= 0 && !no_force_study))
1872 {
1873 if (timeit > 0)
1874 {
1875 register int i;
1876 clock_t time_taken;
1877 clock_t start_time = clock();
1878 for (i = 0; i < timeit; i++)
1879 extra = pcre_study(re, study_options | force_study_options, &error);
1880 time_taken = clock() - start_time;
1881 if (extra != NULL) pcre_free_study(extra);
1882 fprintf(outfile, " Study time %.4f milliseconds\n",
1883 (((double)time_taken * 1000.0) / (double)timeit) /
1884 (double)CLOCKS_PER_SEC);
1885 }
1886 extra = pcre_study(re, study_options | force_study_options, &error);
1887 if (error != NULL)
1888 fprintf(outfile, "Failed to study: %s\n", error);
1889 else if (extra != NULL)
1890 true_study_size = ((pcre_study_data *)(extra->study_data))->size;
1891 }
1892
1893 /* If /K was present, we set up for handling MARK data. */
1894
1895 if (do_mark)
1896 {
1897 if (extra == NULL)
1898 {
1899 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
1900 extra->flags = 0;
1901 }
1902 extra->mark = &markptr;
1903 extra->flags |= PCRE_EXTRA_MARK;
1904 }
1905
1906 /* If the 'F' option was present, we flip the bytes of all the integer
1907 fields in the regex data block and the study block. This is to make it
1908 possible to test PCRE's handling of byte-flipped patterns, e.g. those
1909 compiled on a different architecture. */
1910
1911 if (do_flip)
1912 {
1913 real_pcre *rre = (real_pcre *)re;
1914 rre->magic_number =
1915 byteflip(rre->magic_number, sizeof(rre->magic_number));
1916 rre->size = byteflip(rre->size, sizeof(rre->size));
1917 rre->options = byteflip(rre->options, sizeof(rre->options));
1918 rre->flags = (pcre_uint16)byteflip(rre->flags, sizeof(rre->flags));
1919 rre->top_bracket =
1920 (pcre_uint16)byteflip(rre->top_bracket, sizeof(rre->top_bracket));
1921 rre->top_backref =
1922 (pcre_uint16)byteflip(rre->top_backref, sizeof(rre->top_backref));
1923 rre->first_byte =
1924 (pcre_uint16)byteflip(rre->first_byte, sizeof(rre->first_byte));
1925 rre->req_byte =
1926 (pcre_uint16)byteflip(rre->req_byte, sizeof(rre->req_byte));
1927 rre->name_table_offset = (pcre_uint16)byteflip(rre->name_table_offset,
1928 sizeof(rre->name_table_offset));
1929 rre->name_entry_size = (pcre_uint16)byteflip(rre->name_entry_size,
1930 sizeof(rre->name_entry_size));
1931 rre->name_count = (pcre_uint16)byteflip(rre->name_count,
1932 sizeof(rre->name_count));
1933
1934 if (extra != NULL)
1935 {
1936 pcre_study_data *rsd = (pcre_study_data *)(extra->study_data);
1937 rsd->size = byteflip(rsd->size, sizeof(rsd->size));
1938 rsd->flags = byteflip(rsd->flags, sizeof(rsd->flags));
1939 rsd->minlength = byteflip(rsd->minlength, sizeof(rsd->minlength));
1940 }
1941 }
1942
1943 /* Extract information from the compiled data if required */
1944
1945 SHOW_INFO:
1946
1947 if (do_debug)
1948 {
1949 fprintf(outfile, "------------------------------------------------------------------\n");
1950 pcre_printint(re, outfile, debug_lengths);
1951 }
1952
1953 /* We already have the options in get_options (see above) */
1954
1955 if (do_showinfo)
1956 {
1957 unsigned long int all_options;
1958 #if !defined NOINFOCHECK
1959 int old_first_char, old_options, old_count;
1960 #endif
1961 int count, backrefmax, first_char, need_char, okpartial, jchanged,
1962 hascrorlf;
1963 int nameentrysize, namecount;
1964 const pcre_uchar *nametable;
1965
1966 new_info(re, NULL, PCRE_INFO_SIZE, &size);
1967 new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
1968 new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
1969 new_info(re, NULL, PCRE_INFO_FIRSTBYTE, &first_char);
1970 new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
1971 new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
1972 new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
1973 new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
1974 new_info(re, NULL, PCRE_INFO_OKPARTIAL, &okpartial);
1975 new_info(re, NULL, PCRE_INFO_JCHANGED, &jchanged);
1976 new_info(re, NULL, PCRE_INFO_HASCRORLF, &hascrorlf);
1977
1978 #if !defined NOINFOCHECK
1979 old_count = pcre_info(re, &old_options, &old_first_char);
1980 if (count < 0) fprintf(outfile,
1981 "Error %d from pcre_info()\n", count);
1982 else
1983 {
1984 if (old_count != count) fprintf(outfile,
1985 "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
1986 old_count);
1987
1988 if (old_first_char != first_char) fprintf(outfile,
1989 "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
1990 first_char, old_first_char);
1991
1992 if (old_options != (int)get_options) fprintf(outfile,
1993 "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
1994 get_options, old_options);
1995 }
1996 #endif
1997
1998 if (size != regex_gotten_store) fprintf(outfile,
1999 "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
2000 (int)size, (int)regex_gotten_store);
2001
2002 fprintf(outfile, "Capturing subpattern count = %d\n", count);
2003 if (backrefmax > 0)
2004 fprintf(outfile, "Max back reference = %d\n", backrefmax);
2005
2006 if (namecount > 0)
2007 {
2008 fprintf(outfile, "Named capturing subpatterns:\n");
2009 while (namecount-- > 0)
2010 {
2011 fprintf(outfile, " %s %*s%3d\n", nametable + 2,
2012 nameentrysize - 3 - (int)strlen((char *)nametable + 2), "",
2013 GET2(nametable, 0));
2014 nametable += nameentrysize;
2015 }
2016 }
2017
2018 if (!okpartial) fprintf(outfile, "Partial matching not supported\n");
2019 if (hascrorlf) fprintf(outfile, "Contains explicit CR or LF match\n");
2020
2021 all_options = ((real_pcre *)re)->options;
2022 if (do_flip) all_options = byteflip(all_options, sizeof(all_options));
2023
2024 if (get_options == 0) fprintf(outfile, "No options\n");
2025 else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
2026 ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
2027 ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
2028 ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
2029 ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
2030 ((get_options & PCRE_FIRSTLINE) != 0)? " firstline" : "",
2031 ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
2032 ((get_options & PCRE_BSR_ANYCRLF) != 0)? " bsr_anycrlf" : "",
2033 ((get_options & PCRE_BSR_UNICODE) != 0)? " bsr_unicode" : "",
2034 ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
2035 ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
2036 ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
2037 ((get_options & PCRE_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "",
2038 ((get_options & PCRE_UTF8) != 0)? " utf8" : "",
2039 ((get_options & PCRE_UCP) != 0)? " ucp" : "",
2040 ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",
2041 ((get_options & PCRE_NO_START_OPTIMIZE) != 0)? " no_start_optimize" : "",
2042 ((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");
2043
2044 if (jchanged) fprintf(outfile, "Duplicate name status changes\n");
2045
2046 switch (get_options & PCRE_NEWLINE_BITS)
2047 {
2048 case PCRE_NEWLINE_CR:
2049 fprintf(outfile, "Forced newline sequence: CR\n");
2050 break;
2051
2052 case PCRE_NEWLINE_LF:
2053 fprintf(outfile, "Forced newline sequence: LF\n");
2054 break;
2055
2056 case PCRE_NEWLINE_CRLF:
2057 fprintf(outfile, "Forced newline sequence: CRLF\n");
2058 break;
2059
2060 case PCRE_NEWLINE_ANYCRLF:
2061 fprintf(outfile, "Forced newline sequence: ANYCRLF\n");
2062 break;
2063
2064 case PCRE_NEWLINE_ANY:
2065 fprintf(outfile, "Forced newline sequence: ANY\n");
2066 break;
2067
2068 default:
2069 break;
2070 }
2071
2072 if (first_char == -1)
2073 {
2074 fprintf(outfile, "First char at start or follows newline\n");
2075 }
2076 else if (first_char < 0)
2077 {
2078 fprintf(outfile, "No first char\n");
2079 }
2080 else
2081 {
2082 int ch = first_char & 255;
2083 const char *caseless = ((first_char & REQ_CASELESS) == 0)?
2084 "" : " (caseless)";
2085 if (PRINTHEX(ch))
2086 fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
2087 else
2088 fprintf(outfile, "First char = %d%s\n", ch, caseless);
2089 }
2090
2091 if (need_char < 0)
2092 {
2093 fprintf(outfile, "No need char\n");
2094 }
2095 else
2096 {
2097 int ch = need_char & 255;
2098 const char *caseless = ((need_char & REQ_CASELESS) == 0)?
2099 "" : " (caseless)";
2100 if (PRINTHEX(ch))
2101 fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
2102 else
2103 fprintf(outfile, "Need char = %d%s\n", ch, caseless);
2104 }
2105
2106 /* Don't output study size; at present it is in any case a fixed
2107 value, but it varies, depending on the computer architecture, and
2108 so messes up the test suite. (And with the /F option, it might be
2109 flipped.) If study was forced by an external -s, don't show this
2110 information unless -i or -d was also present. This means that, except
2111 when auto-callouts are involved, the output from runs with and without
2112 -s should be identical. */
2113
2114 if (do_study || (force_study >= 0 && showinfo && !no_force_study))
2115 {
2116 if (extra == NULL)
2117 fprintf(outfile, "Study returned NULL\n");
2118 else
2119 {
2120 pcre_uint8 *start_bits = NULL;
2121 int minlength;
2122
2123 new_info(re, extra, PCRE_INFO_MINLENGTH, &minlength);
2124 fprintf(outfile, "Subject length lower bound = %d\n", minlength);
2125
2126 new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
2127 if (start_bits == NULL)
2128 fprintf(outfile, "No set of starting bytes\n");
2129 else
2130 {
2131 int i;
2132 int c = 24;
2133 fprintf(outfile, "Starting byte set: ");
2134 for (i = 0; i < 256; i++)
2135 {
2136 if ((start_bits[i/8] & (1<<(i&7))) != 0)
2137 {
2138 if (c > 75)
2139 {
2140 fprintf(outfile, "\n ");
2141 c = 2;
2142 }
2143 if (PRINTHEX(i) && i != ' ')
2144 {
2145 fprintf(outfile, "%c ", i);
2146 c += 2;
2147 }
2148 else
2149 {
2150 fprintf(outfile, "\\x%02x ", i);
2151 c += 5;
2152 }
2153 }
2154 }
2155 fprintf(outfile, "\n");
2156 }
2157 }
2158
2159 /* Show this only if the JIT was set by /S, not by -s. */
2160
2161 if ((study_options & PCRE_STUDY_JIT_COMPILE) != 0)
2162 {
2163 int jit;
2164 new_info(re, extra, PCRE_INFO_JIT, &jit);
2165 if (jit)
2166 fprintf(outfile, "JIT study was successful\n");
2167 else
2168 #ifdef SUPPORT_JIT
2169 fprintf(outfile, "JIT study was not successful\n");
2170 #else
2171 fprintf(outfile, "JIT support is not available in this version of PCRE\n");
2172 #endif
2173 }
2174 }
2175 }
2176
2177 /* If the '>' option was present, we write out the regex to a file, and
2178 that is all. The first 8 bytes of the file are the regex length and then
2179 the study length, in big-endian order. */
2180
2181 if (to_file != NULL)
2182 {
2183 FILE *f = fopen((char *)to_file, "wb");
2184 if (f == NULL)
2185 {
2186 fprintf(outfile, "Unable to open %s: %s\n", to_file, strerror(errno));
2187 }
2188 else
2189 {
2190 pcre_uint8 sbuf[8];
2191 sbuf[0] = (pcre_uint8)((true_size >> 24) & 255);
2192 sbuf[1] = (pcre_uint8)((true_size >> 16) & 255);
2193 sbuf[2] = (pcre_uint8)((true_size >> 8) & 255);
2194 sbuf[3] = (pcre_uint8)((true_size) & 255);
2195
2196 sbuf[4] = (pcre_uint8)((true_study_size >> 24) & 255);
2197 sbuf[5] = (pcre_uint8)((true_study_size >> 16) & 255);
2198 sbuf[6] = (pcre_uint8)((true_study_size >> 8) & 255);
2199 sbuf[7] = (pcre_uint8)((true_study_size) & 255);
2200
2201 if (fwrite(sbuf, 1, 8, f) < 8 ||
2202 fwrite(re, 1, true_size, f) < true_size)
2203 {
2204 fprintf(outfile, "Write error on %s: %s\n", to_file, strerror(errno));
2205 }
2206 else
2207 {
2208 fprintf(outfile, "Compiled pattern written to %s\n", to_file);
2209
2210 /* If there is study data, write it. */
2211
2212 if (extra != NULL)
2213 {
2214 if (fwrite(extra->study_data, 1, true_study_size, f) <
2215 true_study_size)
2216 {
2217 fprintf(outfile, "Write error on %s: %s\n", to_file,
2218 strerror(errno));
2219 }
2220 else fprintf(outfile, "Study data written to %s\n", to_file);
2221 }
2222 }
2223 fclose(f);
2224 }
2225
2226 new_free(re);
2227 if (extra != NULL) pcre_free_study(extra);
2228 if (locale_set)
2229 {
2230 new_free((void *)tables);
2231 setlocale(LC_CTYPE, "C");
2232 locale_set = 0;
2233 }
2234 continue; /* With next regex */
2235 }
2236 } /* End of non-POSIX compile */
2237
2238 /* Read data lines and test them */
2239
2240 for (;;)
2241 {
2242 pcre_uint8 *q;
2243 pcre_uint8 *bptr;
2244 int *use_offsets = offsets;
2245 int use_size_offsets = size_offsets;
2246 int callout_data = 0;
2247 int callout_data_set = 0;
2248 int count, c;
2249 int copystrings = 0;
2250 int find_match_limit = default_find_match_limit;
2251 int getstrings = 0;
2252 int getlist = 0;
2253 int gmatched = 0;
2254 int start_offset = 0;
2255 int start_offset_sign = 1;
2256 int g_notempty = 0;
2257 int use_dfa = 0;
2258
2259 options = 0;
2260
2261 *copynames = 0;
2262 *getnames = 0;
2263
2264 copynamesptr = copynames;
2265 getnamesptr = getnames;
2266
2267 pcre_callout = callout;
2268 first_callout = 1;
2269 last_callout_mark = NULL;
2270 callout_extra = 0;
2271 callout_count = 0;
2272 callout_fail_count = 999999;
2273 callout_fail_id = -1;
2274 show_malloc = 0;
2275
2276 if (extra != NULL) extra->flags &=
2277 ~(PCRE_EXTRA_MATCH_LIMIT|PCRE_EXTRA_MATCH_LIMIT_RECURSION);
2278
2279 len = 0;
2280 for (;;)
2281 {
2282 if (extend_inputline(infile, buffer + len, "data> ") == NULL)
2283 {
2284 if (len > 0) /* Reached EOF without hitting a newline */
2285 {
2286 fprintf(outfile, "\n");
2287 break;
2288 }
2289 done = 1;
2290 goto CONTINUE;
2291 }
2292 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
2293 len = (int)strlen((char *)buffer);
2294 if (buffer[len-1] == '\n') break;
2295 }
2296
2297 while (len > 0 && isspace(buffer[len-1])) len--;
2298 buffer[len] = 0;
2299 if (len == 0) break;
2300
2301 p = buffer;
2302 while (isspace(*p)) p++;
2303
2304 bptr = q = dbuffer;
2305 while ((c = *p++) != 0)
2306 {
2307 int i = 0;
2308 int n = 0;
2309
2310 if (c == '\\') switch ((c = *p++))
2311 {
2312 case 'a': c = 7; break;
2313 case 'b': c = '\b'; break;
2314 case 'e': c = 27; break;
2315 case 'f': c = '\f'; break;
2316 case 'n': c = '\n'; break;
2317 case 'r': c = '\r'; break;
2318 case 't': c = '\t'; break;
2319 case 'v': c = '\v'; break;
2320
2321 case '0': case '1': case '2': case '3':
2322 case '4': case '5': case '6': case '7':
2323 c -= '0';
2324 while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
2325 c = c * 8 + *p++ - '0';
2326
2327 #if !defined NOUTF8
2328 if (use_utf8 && c > 255)
2329 {
2330 unsigned char buff8[8];
2331 int ii, utn;
2332 utn = ord2utf8(c, buff8);
2333 for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
2334 c = buff8[ii]; /* Last byte */
2335 }
2336 #endif
2337 break;
2338
2339 case 'x':
2340
2341 /* Handle \x{..} specially - new Perl thing for utf8 */
2342
2343 #if !defined NOUTF8
2344 if (*p == '{')
2345 {
2346 unsigned char *pt = p;
2347 c = 0;
2348
2349 /* We used to have "while (isxdigit(*(++pt)))" here, but it fails
2350 when isxdigit() is a macro that refers to its argument more than
2351 once. This is banned by the C Standard, but apparently happens in at
2352 least one MacOS environment. */
2353
2354 for (pt++; isxdigit(*pt); pt++)
2355 c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'a' - 10);
2356 if (*pt == '}')
2357 {
2358 unsigned char buff8[8];
2359 int ii, utn;
2360 if (use_utf8)
2361 {
2362 utn = ord2utf8(c, buff8);
2363 for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
2364 c = buff8[ii]; /* Last byte */
2365 }
2366 else
2367 {
2368 if (c > 255)
2369 fprintf(outfile, "** Character \\x{%x} is greater than 255 and "
2370 "UTF-8 mode is not enabled.\n"
2371 "** Truncation will probably give the wrong result.\n", c);
2372 }
2373 p = pt + 1;
2374 break;
2375 }
2376 /* Not correct form; fall through */
2377 }
2378 #endif
2379
2380 /* Ordinary \x */
2381
2382 c = 0;
2383 while (i++ < 2 && isxdigit(*p))
2384 {
2385 c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'a' - 10);
2386 p++;
2387 }
2388 break;
2389
2390 case 0: /* \ followed by EOF allows for an empty line */
2391 p--;
2392 continue;
2393
2394 case '>':
2395 if (*p == '-')
2396 {
2397 start_offset_sign = -1;
2398 p++;
2399 }
2400 while(isdigit(*p)) start_offset = start_offset * 10 + *p++ - '0';
2401 start_offset *= start_offset_sign;
2402 continue;
2403
2404 case 'A': /* Option setting */
2405 options |= PCRE_ANCHORED;
2406 continue;
2407
2408 case 'B':
2409 options |= PCRE_NOTBOL;
2410 continue;
2411
2412 case 'C':
2413 if (isdigit(*p)) /* Set copy string */
2414 {
2415 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2416 copystrings |= 1 << n;
2417 }
2418 else if (isalnum(*p))
2419 {
2420 pcre_uchar *npp = copynamesptr;
2421 while (isalnum(*p)) *npp++ = *p++;
2422 *npp++ = 0;
2423 *npp = 0;
2424 n = pcre_get_stringnumber(re, (char *)copynamesptr);
2425 if (n < 0)
2426 fprintf(outfile, "no parentheses with name \"%s\"\n", copynamesptr);
2427 copynamesptr = npp;
2428 }
2429 else if (*p == '+')
2430 {
2431 callout_extra = 1;
2432 p++;
2433 }
2434 else if (*p == '-')
2435 {
2436 pcre_callout = NULL;
2437 p++;
2438 }
2439 else if (*p == '!')
2440 {
2441 callout_fail_id = 0;
2442 p++;
2443 while(isdigit(*p))
2444 callout_fail_id = callout_fail_id * 10 + *p++ - '0';
2445 callout_fail_count = 0;
2446 if (*p == '!')
2447 {
2448 p++;
2449 while(isdigit(*p))
2450 callout_fail_count = callout_fail_count * 10 + *p++ - '0';
2451 }
2452 }
2453 else if (*p == '*')
2454 {
2455 int sign = 1;
2456 callout_data = 0;
2457 if (*(++p) == '-') { sign = -1; p++; }
2458 while(isdigit(*p))
2459 callout_data = callout_data * 10 + *p++ - '0';
2460 callout_data *= sign;
2461 callout_data_set = 1;
2462 }
2463 continue;
2464
2465 #if !defined NODFA
2466 case 'D':
2467 #if !defined NOPOSIX
2468 if (posix || do_posix)
2469 printf("** Can't use dfa matching in POSIX mode: \\D ignored\n");
2470 else
2471 #endif
2472 use_dfa = 1;
2473 continue;
2474 #endif
2475
2476 #if !defined NODFA
2477 case 'F':
2478 options |= PCRE_DFA_SHORTEST;
2479 continue;
2480 #endif
2481
2482 case 'G':
2483 if (isdigit(*p))
2484 {
2485 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2486 getstrings |= 1 << n;
2487 }
2488 else if (isalnum(*p))
2489 {
2490 pcre_uchar *npp = getnamesptr;
2491 while (isalnum(*p)) *npp++ = *p++;
2492 *npp++ = 0;
2493 *npp = 0;
2494 n = pcre_get_stringnumber(re, (char *)getnamesptr);
2495 if (n < 0)
2496 fprintf(outfile, "no parentheses with name \"%s\"\n", getnamesptr);
2497 getnamesptr = npp;
2498 }
2499 continue;
2500
2501 case 'J':
2502 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2503 if (extra != NULL
2504 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
2505 && extra->executable_jit != NULL)
2506 {
2507 if (jit_stack != NULL) pcre_jit_stack_free(jit_stack);
2508 jit_stack = pcre_jit_stack_alloc(1, n * 1024);
2509 pcre_assign_jit_stack(extra, jit_callback, jit_stack);
2510 }
2511 continue;
2512
2513 case 'L':
2514 getlist = 1;
2515 continue;
2516
2517 case 'M':
2518 find_match_limit = 1;
2519 continue;
2520
2521 case 'N':
2522 if ((options & PCRE_NOTEMPTY) != 0)
2523 options = (options & ~PCRE_NOTEMPTY) | PCRE_NOTEMPTY_ATSTART;
2524 else
2525 options |= PCRE_NOTEMPTY;
2526 continue;
2527
2528 case 'O':
2529 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2530 if (n > size_offsets_max)
2531 {
2532 size_offsets_max = n;
2533 free(offsets);
2534 use_offsets = offsets = (int *)malloc(size_offsets_max * sizeof(int));
2535 if (offsets == NULL)
2536 {
2537 printf("** Failed to get %d bytes of memory for offsets vector\n",
2538 (int)(size_offsets_max * sizeof(int)));
2539 yield = 1;
2540 goto EXIT;
2541 }
2542 }
2543 use_size_offsets = n;
2544 if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
2545 continue;
2546
2547 case 'P':
2548 options |= ((options & PCRE_PARTIAL_SOFT) == 0)?
2549 PCRE_PARTIAL_SOFT : PCRE_PARTIAL_HARD;
2550 continue;
2551
2552 case 'Q':
2553 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2554 if (extra == NULL)
2555 {
2556 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
2557 extra->flags = 0;
2558 }
2559 extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
2560 extra->match_limit_recursion = n;
2561 continue;
2562
2563 case 'q':
2564 while(isdigit(*p)) n = n * 10 + *p++ - '0';
2565 if (extra == NULL)
2566 {
2567 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
2568 extra->flags = 0;
2569 }
2570 extra->flags |= PCRE_EXTRA_MATCH_LIMIT;
2571 extra->match_limit = n;
2572 continue;
2573
2574 #if !defined NODFA
2575 case 'R':
2576 options |= PCRE_DFA_RESTART;
2577 continue;
2578 #endif
2579
2580 case 'S':
2581 show_malloc = 1;
2582 continue;
2583
2584 case 'Y':
2585 options |= PCRE_NO_START_OPTIMIZE;
2586 continue;
2587
2588 case 'Z':
2589 options |= PCRE_NOTEOL;
2590 continue;
2591
2592 case '?':
2593 options |= PCRE_NO_UTF8_CHECK;
2594 continue;
2595
2596 case '<':
2597 {
2598 int x = check_newline(p, outfile);
2599 if (x == 0) goto NEXT_DATA;
2600 options |= x;
2601 while (*p++ != '>');
2602 }
2603 continue;
2604 }
2605 *q++ = c;
2606 }
2607 *q = 0;
2608 len = (int)(q - dbuffer);
2609
2610 /* Move the data to the end of the buffer so that a read over the end of
2611 the buffer will be seen by valgrind, even if it doesn't cause a crash. If
2612 we are using the POSIX interface, we must include the terminating zero. */
2613
2614 #if !defined NOPOSIX
2615 if (posix || do_posix)
2616 {
2617 memmove(bptr + buffer_size - len - 1, bptr, len + 1);
2618 bptr += buffer_size - len - 1;
2619 }
2620 else
2621 #endif
2622 {
2623 memmove(bptr + buffer_size - len, bptr, len);
2624 bptr += buffer_size - len;
2625 }
2626
2627 if ((all_use_dfa || use_dfa) && find_match_limit)
2628 {
2629 printf("**Match limit not relevant for DFA matching: ignored\n");
2630 find_match_limit = 0;
2631 }
2632
2633 /* Handle matching via the POSIX interface, which does not
2634 support timing or playing with the match limit or callout data. */
2635
2636 #if !defined NOPOSIX
2637 if (posix || do_posix)
2638 {
2639 int rc;
2640 int eflags = 0;
2641 regmatch_t *pmatch = NULL;
2642 if (use_size_offsets > 0)
2643 pmatch = (regmatch_t *)malloc(sizeof(regmatch_t) * use_size_offsets);
2644 if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
2645 if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
2646 if ((options & PCRE_NOTEMPTY) != 0) eflags |= REG_NOTEMPTY;
2647
2648 rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
2649
2650 if (rc != 0)
2651 {
2652 (void)regerror(rc, &preg, (char *)buffer, buffer_size);
2653 fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
2654 }
2655 else if ((((const pcre *)preg.re_pcre)->options & PCRE_NO_AUTO_CAPTURE)
2656 != 0)
2657 {
2658 fprintf(outfile, "Matched with REG_NOSUB\n");
2659 }
2660 else
2661 {
2662 size_t i;
2663 for (i = 0; i < (size_t)use_size_offsets; i++)
2664 {
2665 if (pmatch[i].rm_so >= 0)
2666 {
2667 fprintf(outfile, "%2d: ", (int)i);
2668 (void)pchars(dbuffer + pmatch[i].rm_so,
2669 pmatch[i].rm_eo - pmatch[i].rm_so, outfile);
2670 fprintf(outfile, "\n");
2671 if (do_showcaprest || (i == 0 && do_showrest))
2672 {
2673 fprintf(outfile, "%2d+ ", (int)i);
2674 (void)pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo,
2675 outfile);
2676 fprintf(outfile, "\n");
2677 }
2678 }
2679 }
2680 }
2681 free(pmatch);
2682 }
2683
2684 /* Handle matching via the native interface - repeats for /g and /G */
2685
2686 else
2687 #endif /* !defined NOPOSIX */
2688
2689 for (;; gmatched++) /* Loop for /g or /G */
2690 {
2691 markptr = NULL;
2692
2693 if (timeitm > 0)
2694 {
2695 register int i;
2696 clock_t time_taken;
2697 clock_t start_time = clock();
2698
2699 #if !defined NODFA
2700 if (all_use_dfa || use_dfa)
2701 {
2702 int workspace[1000];
2703 for (i = 0; i < timeitm; i++)
2704 count = pcre_dfa_exec(re, extra, (char *)bptr, len, start_offset,
2705 options | g_notempty, use_offsets, use_size_offsets, workspace,
2706 sizeof(workspace)/sizeof(int));
2707 }
2708 else
2709 #endif
2710
2711 for (i = 0; i < timeitm; i++)
2712 count = pcre_exec(re, extra, (char *)bptr, len,
2713 start_offset, options | g_notempty, use_offsets, use_size_offsets);
2714
2715 time_taken = clock() - start_time;
2716 fprintf(outfile, "Execute time %.4f milliseconds\n",
2717 (((double)time_taken * 1000.0) / (double)timeitm) /
2718 (double)CLOCKS_PER_SEC);
2719 }
2720
2721 /* If find_match_limit is set, we want to do repeated matches with
2722 varying limits in order to find the minimum value for the match limit and
2723 for the recursion limit. The match limits are relevant only to the normal
2724 running of pcre_exec(), so disable the JIT optimization. This makes it
2725 possible to run the same set of tests with and without JIT externally
2726 requested. */
2727
2728 if (find_match_limit)
2729 {
2730 if (extra == NULL)
2731 {
2732 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
2733 extra->flags = 0;
2734 }
2735 else extra->flags &= ~PCRE_EXTRA_EXECUTABLE_JIT;
2736
2737 (void)check_match_limit(re, extra, bptr, len, start_offset,
2738 options|g_notempty, use_offsets, use_size_offsets,
2739 PCRE_EXTRA_MATCH_LIMIT, &(extra->match_limit),
2740 PCRE_ERROR_MATCHLIMIT, "match()");
2741
2742 count = check_match_limit(re, extra, bptr, len, start_offset,
2743 options|g_notempty, use_offsets, use_size_offsets,
2744 PCRE_EXTRA_MATCH_LIMIT_RECURSION, &(extra->match_limit_recursion),
2745 PCRE_ERROR_RECURSIONLIMIT, "match() recursion");
2746 }
2747
2748 /* If callout_data is set, use the interface with additional data */
2749
2750 else if (callout_data_set)
2751 {
2752 if (extra == NULL)
2753 {
2754 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
2755 extra->flags = 0;
2756 }
2757 extra->flags |= PCRE_EXTRA_CALLOUT_DATA;
2758 extra->callout_data = &callout_data;
2759 count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
2760 options | g_notempty, use_offsets, use_size_offsets);
2761 extra->flags &= ~PCRE_EXTRA_CALLOUT_DATA;
2762 }
2763
2764 /* The normal case is just to do the match once, with the default
2765 value of match_limit. */
2766
2767 #if !defined NODFA
2768 else if (all_use_dfa || use_dfa)
2769 {
2770 int workspace[1000];
2771 count = pcre_dfa_exec(re, extra, (char *)bptr, len, start_offset,
2772 options | g_notempty, use_offsets, use_size_offsets, workspace,
2773 sizeof(workspace)/sizeof(int));
2774 if (count == 0)
2775 {
2776 fprintf(outfile, "Matched, but too many subsidiary matches\n");
2777 count = use_size_offsets/2;
2778 }
2779 }
2780 #endif
2781
2782 else
2783 {
2784 count = pcre_exec(re, extra, (char *)bptr, len,
2785 start_offset, options | g_notempty, use_offsets, use_size_offsets);
2786 if (count == 0)
2787 {
2788 fprintf(outfile, "Matched, but too many substrings\n");
2789 count = use_size_offsets/3;
2790 }
2791 }
2792
2793 /* Matched */
2794
2795 if (count >= 0)
2796 {
2797 int i, maxcount;
2798
2799 #if !defined NODFA
2800 if (all_use_dfa || use_dfa) maxcount = use_size_offsets/2; else
2801 #endif
2802 maxcount = use_size_offsets/3;
2803
2804 /* This is a check against a lunatic return value. */
2805
2806 if (count > maxcount)
2807 {
2808 fprintf(outfile,
2809 "** PCRE error: returned count %d is too big for offset size %d\n",
2810 count, use_size_offsets);
2811 count = use_size_offsets/3;
2812 if (do_g || do_G)
2813 {
2814 fprintf(outfile, "** /%c loop abandoned\n", do_g? 'g' : 'G');
2815 do_g = do_G = FALSE; /* Break g/G loop */
2816 }
2817 }
2818
2819 /* do_allcaps requests showing of all captures in the pattern, to check
2820 unset ones at the end. */
2821
2822 if (do_allcaps)
2823 {
2824 new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
2825 count++; /* Allow for full match */
2826 if (count * 2 > use_size_offsets) count = use_size_offsets/2;
2827 }
2828
2829 /* Output the captured substrings */
2830
2831 for (i = 0; i < count * 2; i += 2)
2832 {
2833 if (use_offsets[i] < 0)
2834 {
2835 if (use_offsets[i] != -1)
2836 fprintf(outfile, "ERROR: bad negative value %d for offset %d\n",
2837 use_offsets[i], i);
2838 if (use_offsets[i+1] != -1)
2839 fprintf(outfile, "ERROR: bad negative value %d for offset %d\n",
2840 use_offsets[i+1], i+1);
2841 fprintf(outfile, "%2d: <unset>\n", i/2);
2842 }
2843 else
2844 {
2845 fprintf(outfile, "%2d: ", i/2);
2846 (void)pchars(bptr + use_offsets[i],
2847 use_offsets[i+1] - use_offsets[i], outfile);
2848 fprintf(outfile, "\n");
2849 if (do_showcaprest || (i == 0 && do_showrest))
2850 {
2851 fprintf(outfile, "%2d+ ", i/2);
2852 (void)pchars(bptr + use_offsets[i+1], len - use_offsets[i+1],
2853 outfile);
2854 fprintf(outfile, "\n");
2855 }
2856 }
2857 }
2858
2859 if (markptr != NULL) fprintf(outfile, "MK: %s\n", markptr);
2860
2861 for (i = 0; i < 32; i++)
2862 {
2863 if ((copystrings & (1 << i)) != 0)
2864 {
2865 char copybuffer[256];
2866 int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
2867 i, copybuffer, sizeof(copybuffer));
2868 if (rc < 0)
2869 fprintf(outfile, "copy substring %d failed %d\n", i, rc);
2870 else
2871 fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
2872 }
2873 }
2874
2875 for (copynamesptr = copynames;
2876 *copynamesptr != 0;
2877 copynamesptr += (int)strlen((char*)copynamesptr) + 1)
2878 {
2879 char copybuffer[256];
2880 int rc = pcre_copy_named_substring(re, (char *)bptr, use_offsets,
2881 count, (char *)copynamesptr, copybuffer, sizeof(copybuffer));
2882 if (rc < 0)
2883 fprintf(outfile, "copy substring %s failed %d\n", copynamesptr, rc);
2884 else
2885 fprintf(outfile, " C %s (%d) %s\n", copybuffer, rc, copynamesptr);
2886 }
2887
2888 for (i = 0; i < 32; i++)
2889 {
2890 if ((getstrings & (1 << i)) != 0)
2891 {
2892 const char *substring;
2893 int rc = pcre_get_substring((char *)bptr, use_offsets, count,
2894 i, &substring);
2895 if (rc < 0)
2896 fprintf(outfile, "get substring %d failed %d\n", i, rc);
2897 else
2898 {
2899 fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
2900 pcre_free_substring(substring);
2901 }
2902 }
2903 }
2904
2905 for (getnamesptr = getnames;
2906 *getnamesptr != 0;
2907 getnamesptr += (int)strlen((char*)getnamesptr) + 1)
2908 {
2909 const char *substring;
2910 int rc = pcre_get_named_substring(re, (char *)bptr, use_offsets,
2911 count, (char *)getnamesptr, &substring);
2912 if (rc < 0)
2913 fprintf(outfile, "copy substring %s failed %d\n", getnamesptr, rc);
2914 else
2915 {
2916 fprintf(outfile, " G %s (%d) %s\n", substring, rc, getnamesptr);
2917 pcre_free_substring(substring);
2918 }
2919 }
2920
2921 if (getlist)
2922 {
2923 const char **stringlist;
2924 int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
2925 &stringlist);
2926 if (rc < 0)
2927 fprintf(outfile, "get substring list failed %d\n", rc);
2928 else
2929 {
2930 for (i = 0; i < count; i++)
2931 fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
2932 if (stringlist[i] != NULL)
2933 fprintf(outfile, "string list not terminated by NULL\n");
2934 pcre_free_substring_list(stringlist);
2935 }
2936 }
2937 }
2938
2939 /* There was a partial match */
2940
2941 else if (count == PCRE_ERROR_PARTIAL)
2942 {
2943 if (markptr == NULL) fprintf(outfile, "Partial match");
2944 else fprintf(outfile, "Partial match, mark=%s", markptr);
2945 if (use_size_offsets > 1)
2946 {
2947 fprintf(outfile, ": ");
2948 pchars(bptr + use_offsets[0], use_offsets[1] - use_offsets[0],
2949 outfile);
2950 }
2951 fprintf(outfile, "\n");
2952 break; /* Out of the /g loop */
2953 }
2954
2955 /* Failed to match. If this is a /g or /G loop and we previously set
2956 g_notempty after a null match, this is not necessarily the end. We want
2957 to advance the start offset, and continue. We won't be at the end of the
2958 string - that was checked before setting g_notempty.
2959
2960 Complication arises in the case when the newline convention is "any",
2961 "crlf", or "anycrlf". If the previous match was at the end of a line
2962 terminated by CRLF, an advance of one character just passes the \r,
2963 whereas we should prefer the longer newline sequence, as does the code in
2964 pcre_exec(). Fudge the offset value to achieve this. We check for a
2965 newline setting in the pattern; if none was set, use pcre_config() to
2966 find the default.
2967
2968 Otherwise, in the case of UTF-8 matching, the advance must be one
2969 character, not one byte. */
2970
2971 else
2972 {
2973 if (g_notempty != 0)
2974 {
2975 int onechar = 1;
2976 unsigned int obits = ((real_pcre *)re)->options;
2977 use_offsets[0] = start_offset;
2978 if ((obits & PCRE_NEWLINE_BITS) == 0)
2979 {
2980 int d;
2981 (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
2982 /* Note that these values are always the ASCII ones, even in
2983 EBCDIC environments. CR = 13, NL = 10. */
2984 obits = (d == 13)? PCRE_NEWLINE_CR :
2985 (d == 10)? PCRE_NEWLINE_LF :
2986 (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
2987 (d == -2)? PCRE_NEWLINE_ANYCRLF :
2988 (d == -1)? PCRE_NEWLINE_ANY : 0;
2989 }
2990 if (((obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY ||
2991 (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_CRLF ||
2992 (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANYCRLF)
2993 &&
2994 start_offset < len - 1 &&
2995 bptr[start_offset] == '\r' &&
2996 bptr[start_offset+1] == '\n')
2997 onechar++;
2998 else if (use_utf8)
2999 {
3000 while (start_offset + onechar < len)
3001 {
3002 if ((bptr[start_offset+onechar] & 0xc0) != 0x80) break;
3003 onechar++;
3004 }
3005 }
3006 use_offsets[1] = start_offset + onechar;
3007 }
3008 else
3009 {
3010 switch(count)
3011 {
3012 case PCRE_ERROR_NOMATCH:
3013 if (gmatched == 0)
3014 {
3015 if (markptr == NULL) fprintf(outfile, "No match\n");
3016 else fprintf(outfile, "No match, mark = %s\n", markptr);
3017 }
3018 break;
3019
3020 case PCRE_ERROR_BADUTF8:
3021 case PCRE_ERROR_SHORTUTF8:
3022 fprintf(outfile, "Error %d (%s UTF-8 string)", count,
3023 (count == PCRE_ERROR_BADUTF8)? "bad" : "short");
3024 if (use_size_offsets >= 2)
3025 fprintf(outfile, " offset=%d reason=%d", use_offsets[0],
3026 use_offsets[1]);
3027 fprintf(outfile, "\n");
3028 break;
3029
3030 default:
3031 if (count < 0 && (-count) < sizeof(errtexts)/sizeof(const char *))
3032 fprintf(outfile, "Error %d (%s)\n", count, errtexts[-count]);
3033 else
3034 fprintf(outfile, "Error %d (Unexpected value)\n", count);
3035 break;
3036 }
3037
3038 break; /* Out of the /g loop */
3039 }
3040 }
3041
3042 /* If not /g or /G we are done */
3043
3044 if (!do_g && !do_G) break;
3045
3046 /* If we have matched an empty string, first check to see if we are at
3047 the end of the subject. If so, the /g loop is over. Otherwise, mimic what
3048 Perl's /g option does. This turns out to be rather cunning. First we set
3049 PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED and try the match again at the
3050 same point. If this fails (picked up above) we advance to the next
3051 character. */
3052
3053 g_notempty = 0;
3054
3055 if (use_offsets[0] == use_offsets[1])
3056 {
3057 if (use_offsets[0] == len) break;
3058 g_notempty = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
3059 }
3060
3061 /* For /g, update the start offset, leaving the rest alone */
3062
3063 if (do_g) start_offset = use_offsets[1];
3064
3065 /* For /G, update the pointer and length */
3066
3067 else
3068 {
3069 bptr += use_offsets[1];
3070 len -= use_offsets[1];
3071 }
3072 } /* End of loop for /g and /G */
3073
3074 NEXT_DATA: continue;
3075 } /* End of loop for data lines */
3076
3077 CONTINUE:
3078
3079 #if !defined NOPOSIX
3080 if (posix || do_posix) regfree(&preg);
3081 #endif
3082
3083 if (re != NULL) new_free(re);
3084 if (extra != NULL) pcre_free_study(extra);
3085 if (locale_set)
3086 {
3087 new_free((void *)tables);
3088 setlocale(LC_CTYPE, "C");
3089 locale_set = 0;
3090 }
3091 if (jit_stack != NULL)
3092 {
3093 pcre_jit_stack_free(jit_stack);
3094 jit_stack = NULL;
3095 }
3096 }
3097
3098 if (infile == stdin) fprintf(outfile, "\n");
3099
3100 EXIT:
3101
3102 if (infile != NULL && infile != stdin) fclose(infile);
3103 if (outfile != NULL && outfile != stdout) fclose(outfile);
3104
3105 free(buffer);
3106 free(dbuffer);
3107 free(pbuffer);
3108 free(offsets);
3109
3110 return yield;
3111 }
3112
3113 /* End of pcretest.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5