/[pcre]/code/branches/pcre16/pcre_printint.src
ViewVC logotype

Contents of /code/branches/pcre16/pcre_printint.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 785 - (show annotations)
Mon Dec 5 20:12:24 2011 UTC (8 years, 4 months ago) by zherczeg
File MIME type: application/x-wais-source
File size: 18543 byte(s)
Improving UTF-16 support by fixing a lot of issues.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains a PCRE private debugging function for printing out the
42 internal form of a compiled regular expression, along with some supporting
43 local functions. This source file is used in two places:
44
45 (1) It is #included by pcre_compile.c when it is compiled in debugging mode
46 (PCRE_DEBUG defined in pcre_internal.h). It is not included in production
47 compiles.
48
49 (2) It is always #included by pcretest.c, which can be asked to print out a
50 compiled regex for debugging purposes. */
51
52
/* PRINTABLE(c) is true when character code c is to be written literally and
false when it has to be shown in hexadecimal. isprint() is deliberately not
used: its result can differ between systems (even without locales), and the
output must be identical everywhere so that tests are reproducible. The macro
is shared with pcretest. Note that the argument is evaluated more than once.

The accepted range is [PRINTABLE_FIRST, PRINTABLE_LIMIT): 64..254 when EBCDIC
is defined, 32..126 otherwise. */

#ifdef EBCDIC
#define PRINTABLE_FIRST 64
#define PRINTABLE_LIMIT 255
#else
#define PRINTABLE_FIRST 32
#define PRINTABLE_LIMIT 127
#endif

#define PRINTABLE(c) ((c) >= PRINTABLE_FIRST && (c) < PRINTABLE_LIMIT)
63
64 /* The table of operator names. It is indexed by opcode value (see the
OP_names[*code] lookups in pcre_printint()) and its contents come from the
OP_NAME_LIST macro. When COMPILING_PCRETEST is defined, pcre_printint()
contains a compile-time check that the table has OP_TABLE_LENGTH entries. */
65
66 static const char *OP_names[] = { OP_NAME_LIST };
67
68
69
70 /*************************************************
71 * Print single- or multi-byte character *
72 *************************************************/
73
74 static int
75 print_char(FILE *f, pcre_uchar *ptr, BOOL utf)
76 {
77 int c = *ptr;
78
79 #ifndef SUPPORT_UTF
80 (void)utf; /* Avoid compiler warning */
81 if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
82 return 0;
83
84 #else
85
86 #ifdef COMPILE_PCRE8
87
88 if (!utf || (c & 0xc0) != 0xc0)
89 {
90 if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
91 return 0;
92 }
93 else
94 {
95 int i;
96 int a = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */
97 int s = 6*a;
98 c = (c & PRIV(utf8_table3)[a]) << s;
99 for (i = 1; i <= a; i++)
100 {
101 /* This is a check for malformed UTF-8; it should only occur if the sanity
102 check has been turned off. Rather than swallow random bytes, just stop if
103 we hit a bad one. Print it with \X instead of \x as an indication. */
104
105 if ((ptr[i] & 0xc0) != 0x80)
106 {
107 fprintf(f, "\\X{%x}", c);
108 return i - 1;
109 }
110
111 /* The byte is OK */
112
113 s -= 6;
114 c |= (ptr[i] & 0x3f) << s;
115 }
116 fprintf(f, "\\x{%x}", c);
117 return a;
118 }
119
120 #else
121
122 #ifdef COMPILE_PCRE16
123
124 if (!utf || (c & 0xfc00) != 0xd800)
125 {
126 if (PRINTABLE(c)) fprintf(f, "%c", c);
127 else if (c <= 0xff) fprintf(f, "\\x%02x", c);
128 else fprintf(f, "\\x{%x}", c);
129 return 0;
130 }
131 else
132 {
133 /* This is a check for malformed UTF-16; it should only occur if the sanity
134 check has been turned off. Rather than swallow a low surrogate, just stop if
135 we hit a bad one. Print it with \X instead of \x as an indication. */
136
137 if ((ptr[1] & 0xfc00) != 0xdc00)
138 {
139 fprintf(f, "\\X{%x}", c);
140 return 0;
141 }
142
143 c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000;
144 fprintf(f, "\\x{%x}", c);
145 return 1;
146 }
147
148 #endif /* COMPILE_PCRE16 */
149
150 #endif /* COMPILE_PCRE8 */
151
152 #endif /* SUPPORT_UTF */
153 }
154
155 /*************************************************
156 * Print uchar string (regardless of utf) *
157 *************************************************/
158
159 static void
160 print_puchar(FILE *f, PCRE_PUCHAR ptr)
161 {
162 while (*ptr != '\0')
163 {
164 register int c = *ptr++;
165 if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x{%x}", c);
166 }
167 }
168
/*************************************************
*          Find Unicode property name            *
*************************************************/

/* Looks up the printable name of a Unicode property.

Arguments:
  ptype    the property type
  pvalue   the property value

Returns: when SUPPORT_UCP is defined, a pointer into PRIV(utt_names) for the
         PRIV(utt) entry whose type and value both match; the table is
         searched from its last entry towards its first. If no entry matches,
         or SUPPORT_UCP is not defined, the string "??" is returned.
*/

static const char *
get_ucpname(int ptype, int pvalue)
{
#ifdef SUPPORT_UCP
  int i;

  for (i = PRIV(utt_size) - 1; i >= 0; i--)
  {
    if (ptype == PRIV(utt)[i].type && pvalue == PRIV(utt)[i].value)
      return PRIV(utt_names) + PRIV(utt)[i].name_offset;
  }
  return "??";
#else
  /* The arguments are unused in this configuration. Discard them explicitly
  instead of doing arithmetic on them: a signed multiplication can overflow,
  which is undefined behaviour. */
  (void)ptype;
  (void)pvalue;
  return "??";
#endif
}
189
190
191
192 /*************************************************
193 * Print compiled regex *
194 *************************************************/
195
196 /* Make this function work for a regex with integers either byte order.
197 However, we assume that what we are passed is a compiled regex. The
198 print_lengths flag controls whether offsets and lengths of items are printed.
199 They can be turned off from pcretest so that automatic tests on bytecode can be
200 written that do not depend on the value of LINK_SIZE. */
201
202 static void
203 pcre_printint(pcre *external_re, FILE *f, BOOL print_lengths)
204 {
205 real_pcre *re = (real_pcre *)external_re;
206 pcre_uchar *codestart, *code;
207 BOOL utf;
208
209 unsigned int options = re->options;
210 int offset = re->name_table_offset;
211 int count = re->name_count;
212 int size = re->name_entry_size;
213
214 if (re->magic_number != MAGIC_NUMBER)
215 {
216 offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff);
217 count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff);
218 size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff);
219 options = ((options << 24) & 0xff000000) |
220 ((options << 8) & 0x00ff0000) |
221 ((options >> 8) & 0x0000ff00) |
222 ((options >> 24) & 0x000000ff);
223 }
224
225 code = codestart = (pcre_uchar *)re + offset + count * size;
226 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
227 utf = (options & PCRE_UTF8) != 0;
228
229 for(;;)
230 {
231 pcre_uchar *ccode;
232 const char *flag = " ";
233 int c;
234 int extra = 0;
235
236 if (print_lengths)
237 fprintf(f, "%3d ", (int)(code - codestart));
238 else
239 fprintf(f, " ");
240
241 switch(*code)
242 {
243 /* ========================================================================== */
244 /* These cases are never obeyed. This is a fudge that causes a compile-
245 time error if the vectors OP_names or PRIV(OP_lengths), which are indexed
246 by opcode, are not the correct length. It seems to be the only way to do
247 such a check at compile time, as the sizeof() operator does not work in
248 the C preprocessor. We do this while compiling pcretest, because that
249 #includes pcre_tables.c, which holds PRIV(OP_lengths). We can't do this
250 when building pcre_compile.c with PCRE_DEBUG set, because it doesn't then
251 know the size of PRIV(OP_lengths). */
252
253 #ifdef COMPILING_PCRETEST
254 case OP_TABLE_LENGTH:
255 case OP_TABLE_LENGTH +
256 ((sizeof(OP_names)/sizeof(const char *) == OP_TABLE_LENGTH) &&
257 (sizeof(PRIV(OP_lengths)) == OP_TABLE_LENGTH)):
258 break;
259 #endif
260 /* ========================================================================== */
261
262 case OP_END:
263 fprintf(f, " %s\n", OP_names[*code]);
264 fprintf(f, "------------------------------------------------------------------\n");
265 return;
266
267 case OP_CHAR:
268 fprintf(f, " ");
269 do
270 {
271 code++;
272 code += 1 + print_char(f, code, utf);
273 }
274 while (*code == OP_CHAR);
275 fprintf(f, "\n");
276 continue;
277
278 case OP_CHARI:
279 fprintf(f, " /i ");
280 do
281 {
282 code++;
283 code += 1 + print_char(f, code, utf);
284 }
285 while (*code == OP_CHARI);
286 fprintf(f, "\n");
287 continue;
288
289 case OP_CBRA:
290 case OP_CBRAPOS:
291 case OP_SCBRA:
292 case OP_SCBRAPOS:
293 if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
294 else fprintf(f, " ");
295 fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
296 break;
297
298 case OP_BRA:
299 case OP_BRAPOS:
300 case OP_SBRA:
301 case OP_SBRAPOS:
302 case OP_KETRMAX:
303 case OP_KETRMIN:
304 case OP_KETRPOS:
305 case OP_ALT:
306 case OP_KET:
307 case OP_ASSERT:
308 case OP_ASSERT_NOT:
309 case OP_ASSERTBACK:
310 case OP_ASSERTBACK_NOT:
311 case OP_ONCE:
312 case OP_ONCE_NC:
313 case OP_COND:
314 case OP_SCOND:
315 case OP_REVERSE:
316 if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
317 else fprintf(f, " ");
318 fprintf(f, "%s", OP_names[*code]);
319 break;
320
321 case OP_CLOSE:
322 fprintf(f, " %s %d", OP_names[*code], GET2(code, 1));
323 break;
324
325 case OP_CREF:
326 case OP_NCREF:
327 fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
328 break;
329
330 case OP_RREF:
331 c = GET2(code, 1);
332 if (c == RREF_ANY)
333 fprintf(f, " Cond recurse any");
334 else
335 fprintf(f, " Cond recurse %d", c);
336 break;
337
338 case OP_NRREF:
339 c = GET2(code, 1);
340 if (c == RREF_ANY)
341 fprintf(f, " Cond nrecurse any");
342 else
343 fprintf(f, " Cond nrecurse %d", c);
344 break;
345
346 case OP_DEF:
347 fprintf(f, " Cond def");
348 break;
349
350 case OP_STARI:
351 case OP_MINSTARI:
352 case OP_POSSTARI:
353 case OP_PLUSI:
354 case OP_MINPLUSI:
355 case OP_POSPLUSI:
356 case OP_QUERYI:
357 case OP_MINQUERYI:
358 case OP_POSQUERYI:
359 flag = "/i";
360 /* Fall through */
361 case OP_STAR:
362 case OP_MINSTAR:
363 case OP_POSSTAR:
364 case OP_PLUS:
365 case OP_MINPLUS:
366 case OP_POSPLUS:
367 case OP_QUERY:
368 case OP_MINQUERY:
369 case OP_POSQUERY:
370 case OP_TYPESTAR:
371 case OP_TYPEMINSTAR:
372 case OP_TYPEPOSSTAR:
373 case OP_TYPEPLUS:
374 case OP_TYPEMINPLUS:
375 case OP_TYPEPOSPLUS:
376 case OP_TYPEQUERY:
377 case OP_TYPEMINQUERY:
378 case OP_TYPEPOSQUERY:
379 fprintf(f, " %s ", flag);
380 if (*code >= OP_TYPESTAR)
381 {
382 fprintf(f, "%s", OP_names[code[1]]);
383 if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
384 {
385 fprintf(f, " %s ", get_ucpname(code[2], code[3]));
386 extra = 2;
387 }
388 }
389 else extra = print_char(f, code+1, utf);
390 fprintf(f, "%s", OP_names[*code]);
391 break;
392
393 case OP_EXACTI:
394 case OP_UPTOI:
395 case OP_MINUPTOI:
396 case OP_POSUPTOI:
397 flag = "/i";
398 /* Fall through */
399 case OP_EXACT:
400 case OP_UPTO:
401 case OP_MINUPTO:
402 case OP_POSUPTO:
403 fprintf(f, " %s ", flag);
404 extra = print_char(f, code + 1 + IMM2_SIZE, utf);
405 fprintf(f, "{");
406 if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
407 fprintf(f, "%d}", GET2(code,1));
408 if (*code == OP_MINUPTO || *code == OP_MINUPTOI) fprintf(f, "?");
409 else if (*code == OP_POSUPTO || *code == OP_POSUPTOI) fprintf(f, "+");
410 break;
411
412 case OP_TYPEEXACT:
413 case OP_TYPEUPTO:
414 case OP_TYPEMINUPTO:
415 case OP_TYPEPOSUPTO:
416 fprintf(f, " %s", OP_names[code[1 + IMM2_SIZE]]);
417 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
418 {
419 fprintf(f, " %s ", get_ucpname(code[1 + IMM2_SIZE + 1],
420 code[1 + IMM2_SIZE + 2]));
421 extra = 2;
422 }
423 fprintf(f, "{");
424 if (*code != OP_TYPEEXACT) fprintf(f, "0,");
425 fprintf(f, "%d}", GET2(code,1));
426 if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
427 else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
428 break;
429
430 case OP_NOTI:
431 flag = "/i";
432 /* Fall through */
433 case OP_NOT:
434 c = code[1];
435 if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
436 else fprintf(f, " %s [^\\x%02x]", flag, c);
437 break;
438
439 case OP_NOTSTARI:
440 case OP_NOTMINSTARI:
441 case OP_NOTPOSSTARI:
442 case OP_NOTPLUSI:
443 case OP_NOTMINPLUSI:
444 case OP_NOTPOSPLUSI:
445 case OP_NOTQUERYI:
446 case OP_NOTMINQUERYI:
447 case OP_NOTPOSQUERYI:
448 flag = "/i";
449 /* Fall through */
450
451 case OP_NOTSTAR:
452 case OP_NOTMINSTAR:
453 case OP_NOTPOSSTAR:
454 case OP_NOTPLUS:
455 case OP_NOTMINPLUS:
456 case OP_NOTPOSPLUS:
457 case OP_NOTQUERY:
458 case OP_NOTMINQUERY:
459 case OP_NOTPOSQUERY:
460 c = code[1];
461 if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
462 else fprintf(f, " %s [^\\x%02x]", flag, c);
463 fprintf(f, "%s", OP_names[*code]);
464 break;
465
466 case OP_NOTEXACTI:
467 case OP_NOTUPTOI:
468 case OP_NOTMINUPTOI:
469 case OP_NOTPOSUPTOI:
470 flag = "/i";
471 /* Fall through */
472
473 case OP_NOTEXACT:
474 case OP_NOTUPTO:
475 case OP_NOTMINUPTO:
476 case OP_NOTPOSUPTO:
477 c = code[1 + IMM2_SIZE];
478 if (PRINTABLE(c)) fprintf(f, " %s [^%c]{", flag, c);
479 else fprintf(f, " %s [^\\x%02x]{", flag, c);
480 if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,");
481 fprintf(f, "%d}", GET2(code,1));
482 if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?");
483 else
484 if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+");
485 break;
486
487 case OP_RECURSE:
488 if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
489 else fprintf(f, " ");
490 fprintf(f, "%s", OP_names[*code]);
491 break;
492
493 case OP_REFI:
494 flag = "/i";
495 /* Fall through */
496 case OP_REF:
497 fprintf(f, " %s \\%d", flag, GET2(code,1));
498 ccode = code + PRIV(OP_lengths)[*code];
499 goto CLASS_REF_REPEAT;
500
501 case OP_CALLOUT:
502 fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2),
503 GET(code, 2 + LINK_SIZE));
504 break;
505
506 case OP_PROP:
507 case OP_NOTPROP:
508 fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1], code[2]));
509 break;
510
511 /* OP_XCLASS can only occur in UTF or PCRE16 modes. However, there's no
512 harm in having this code always here, and it makes it less messy without
513 all those #ifdefs. */
514
515 case OP_CLASS:
516 case OP_NCLASS:
517 case OP_XCLASS:
518 {
519 int i, min, max;
520 BOOL printmap;
521 pcre_uint8 *map;
522
523 fprintf(f, " [");
524
525 if (*code == OP_XCLASS)
526 {
527 extra = GET(code, 1);
528 ccode = code + LINK_SIZE + 1;
529 printmap = (*ccode & XCL_MAP) != 0;
530 if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^");
531 }
532 else
533 {
534 printmap = TRUE;
535 ccode = code + 1;
536 }
537
538 /* Print a bit map */
539
540 if (printmap)
541 {
542 map = (pcre_uint8 *)ccode;
543 for (i = 0; i < 256; i++)
544 {
545 if ((map[i/8] & (1 << (i&7))) != 0)
546 {
547 int j;
548 for (j = i+1; j < 256; j++)
549 if ((map[j/8] & (1 << (j&7))) == 0) break;
550 if (i == '-' || i == ']') fprintf(f, "\\");
551 if (PRINTABLE(i)) fprintf(f, "%c", i);
552 else fprintf(f, "\\x%02x", i);
553 if (--j > i)
554 {
555 if (j != i + 1) fprintf(f, "-");
556 if (j == '-' || j == ']') fprintf(f, "\\");
557 if (PRINTABLE(j)) fprintf(f, "%c", j);
558 else fprintf(f, "\\x%02x", j);
559 }
560 i = j;
561 }
562 }
563 ccode += 32 / sizeof(pcre_uchar);
564 }
565
566 /* For an XCLASS there is always some additional data */
567
568 if (*code == OP_XCLASS)
569 {
570 int ch;
571 while ((ch = *ccode++) != XCL_END)
572 {
573 if (ch == XCL_PROP)
574 {
575 int ptype = *ccode++;
576 int pvalue = *ccode++;
577 fprintf(f, "\\p{%s}", get_ucpname(ptype, pvalue));
578 }
579 else if (ch == XCL_NOTPROP)
580 {
581 int ptype = *ccode++;
582 int pvalue = *ccode++;
583 fprintf(f, "\\P{%s}", get_ucpname(ptype, pvalue));
584 }
585 else
586 {
587 ccode += 1 + print_char(f, ccode, TRUE);
588 if (ch == XCL_RANGE)
589 {
590 fprintf(f, "-");
591 ccode += 1 + print_char(f, ccode, TRUE);
592 }
593 }
594 }
595 }
596
597 /* Indicate a non-UTF class which was created by negation */
598
599 fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
600
601 /* Handle repeats after a class or a back reference */
602
603 CLASS_REF_REPEAT:
604 switch(*ccode)
605 {
606 case OP_CRSTAR:
607 case OP_CRMINSTAR:
608 case OP_CRPLUS:
609 case OP_CRMINPLUS:
610 case OP_CRQUERY:
611 case OP_CRMINQUERY:
612 fprintf(f, "%s", OP_names[*ccode]);
613 extra += PRIV(OP_lengths)[*ccode];
614 break;
615
616 case OP_CRRANGE:
617 case OP_CRMINRANGE:
618 min = GET2(ccode,1);
619 max = GET2(ccode,1 + IMM2_SIZE);
620 if (max == 0) fprintf(f, "{%d,}", min);
621 else fprintf(f, "{%d,%d}", min, max);
622 if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
623 extra += PRIV(OP_lengths)[*ccode];
624 break;
625
626 /* Do nothing if it's not a repeat; this code stops picky compilers
627 warning about the lack of a default code path. */
628
629 default:
630 break;
631 }
632 }
633 break;
634
635 case OP_MARK:
636 case OP_PRUNE_ARG:
637 case OP_SKIP_ARG:
638 case OP_THEN_ARG:
639 fprintf(f, " %s ", OP_names[*code]);
640 print_puchar(f, code + 2);
641 extra += code[1];
642 break;
643
644 case OP_THEN:
645 fprintf(f, " %s", OP_names[*code]);
646 break;
647
648 case OP_CIRCM:
649 case OP_DOLLM:
650 flag = "/m";
651 /* Fall through */
652
653 /* Anything else is just an item with no data, but possibly a flag. */
654
655 default:
656 fprintf(f, " %s %s", flag, OP_names[*code]);
657 break;
658 }
659
660 code += PRIV(OP_lengths)[*code] + extra;
661 fprintf(f, "\n");
662 }
663 }
664
665 /* End of pcre_printint.src */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5