/[pcre]/code/trunk/internal.h
ViewVC logotype

Diff of /code/trunk/internal.h

Parent Directory | Revision Log | View Patch

revision 74 by nigel, Sat Feb 24 21:40:30 2007 UTC revision 75 by nigel, Sat Feb 24 21:40:37 2007 UTC
# Line 5  Line 5 
5    
6  /* This is a library of functions to support regular expressions whose syntax  /* This is a library of functions to support regular expressions whose syntax
7  and semantics are as close as possible to those of the Perl 5 language. See  and semantics are as close as possible to those of the Perl 5 language. See
8  the file Tech.Notes for some information on the internals.  the file doc/Tech.Notes for some information on the internals.
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-2003 University of Cambridge             Copyright (c) 1997-2004 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Redistribution and use in source and binary forms, with or without
16  computer system, and to redistribute it freely, subject to the following  modification, are permitted provided that the following conditions are met:
17  restrictions:  
18        * Redistributions of source code must retain the above copyright notice,
19  1. This software is distributed in the hope that it will be useful,        this list of conditions and the following disclaimer.
20     but WITHOUT ANY WARRANTY; without even the implied warranty of  
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.      * Redistributions in binary form must reproduce the above copyright
22          notice, this list of conditions and the following disclaimer in the
23  2. The origin of this software must not be misrepresented, either by        documentation and/or other materials provided with the distribution.
24     explicit claim or by omission.  
25        * Neither the name of the University of Cambridge nor the names of its
26  3. Altered versions must be plainly marked as such, and must not be        contributors may be used to endorse or promote products derived from
27     misrepresented as being the original software.        this software without specific prior written permission.
28    
29  4. If PCRE is embedded in any software that is released under the GNU  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30     General Purpose Licence (GPL), then the terms of that licence shall  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31     supersede any condition above with which it is incompatible.  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39    POSSIBILITY OF SUCH DAMAGE.
40  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
41  */  */
42    
# Line 55  setjmp and stdarg are used is when NO_RE Line 63  setjmp and stdarg are used is when NO_RE
63  #define PCRE_DEFINITION       /* Win32 __declspec(export) trigger for .dll */  #define PCRE_DEFINITION       /* Win32 __declspec(export) trigger for .dll */
64  #endif  #endif
65    
66    /* We need to have types that specify unsigned 16-bit and 32-bit integers. We
67    cannot determine these outside the compilation (e.g. by running a program as
68    part of "configure") because PCRE is often cross-compiled for use on other
69    systems. Instead we make use of the maximum sizes that are available at
70    preprocessor time in standard C environments. */
71    
72    #if USHRT_MAX == 65535
73      typedef unsigned short pcre_uint16;
74    #elif UINT_MAX == 65535
75      typedef unsigned int pcre_uint16;
76    #else
77      #error Cannot determine a type for 16-bit unsigned integers
78    #endif
79    
80    #if UINT_MAX == 4294967295
81      typedef unsigned int pcre_uint32;
82    #elif ULONG_MAX == 4294967295
83      typedef unsigned long int pcre_uint32;
84    #else
85      #error Cannot determine a type for 32-bit unsigned integers
86    #endif
87    
88    /* All character handling must be done as unsigned characters. Otherwise there
89    are problems with top-bit-set characters and functions such as isspace().
90    However, we leave the interface to the outside world as char *, because that
91    should make things easier for callers. We define a short type for unsigned char
92    to save lots of typing. I tried "uchar", but it causes problems on Digital
93    Unix, where it is defined in sys/types, so use "uschar" instead. */
94    
95    typedef unsigned char uschar;
96    
97    /* Include the public PCRE header */
98    
99  #include "pcre.h"  #include "pcre.h"
100    
101  /* When compiling for use with the Virtual Pascal compiler, these functions  /* When compiling for use with the Virtual Pascal compiler, these functions
# Line 94  for (i = 0; i < n; ++i) *(--dest) =  *(- Line 135  for (i = 0; i < n; ++i) *(--dest) =  *(-
135  #endif   /* not VPCOMPAT */  #endif   /* not VPCOMPAT */
136    
137    
138  /* PCRE keeps offsets in its compiled code as 2-byte quantities by default.  /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
139  These are used, for example, to link from the start of a subpattern to its  in big-endian order) by default. These are used, for example, to link from the
140  alternatives and its end. The use of 2 bytes per offset limits the size of the  start of a subpattern to its alternatives and its end. The use of 2 bytes per
141  compiled regex to around 64K, which is big enough for almost everybody.  offset limits the size of the compiled regex to around 64K, which is big enough
142  However, I received a request for an even bigger limit. For this reason, and  for almost everybody. However, I received a request for an even bigger limit.
143  also to make the code easier to maintain, the storing and loading of offsets  For this reason, and also to make the code easier to maintain, the storing and
144  from the byte string is now handled by the macros that are defined here.  loading of offsets from the byte string is now handled by the macros that are
145    defined here.
146    
147  The macros are controlled by the value of LINK_SIZE. This defaults to 2 in  The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
148  the config.h file, but can be overridden by using -D on the command line. This  the config.h file, but can be overridden by using -D on the command line. This
# Line 176  Standard C system should have one. */ Line 218  Standard C system should have one. */
218  #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))  #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
219  #endif  #endif
220    
221    
222  /* These are the public options that can change during matching. */  /* These are the public options that can change during matching. */
223    
224  #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)  #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
# Line 184  Standard C system should have one. */ Line 227  Standard C system should have one. */
227  but skip the top bit so we can use ints for convenience without getting tangled  but skip the top bit so we can use ints for convenience without getting tangled
228  with negative values. The public options defined in pcre.h start at the least  with negative values. The public options defined in pcre.h start at the least
229  significant end. Make sure they don't overlap, though now that we have expanded  significant end. Make sure they don't overlap, though now that we have expanded
230  to four bytes there is plenty of space. */  to four bytes, there is plenty of space. */
231    
232  #define PCRE_FIRSTSET      0x40000000  /* first_byte is set */  #define PCRE_FIRSTSET      0x40000000  /* first_byte is set */
233  #define PCRE_REQCHSET      0x20000000  /* req_byte is set */  #define PCRE_REQCHSET      0x20000000  /* req_byte is set */
234  #define PCRE_STARTLINE     0x10000000  /* start after \n for multiline */  #define PCRE_STARTLINE     0x10000000  /* start after \n for multiline */
235  #define PCRE_ICHANGED      0x08000000  /* i option changes within regex */  #define PCRE_ICHANGED      0x08000000  /* i option changes within regex */
236    #define PCRE_NOPARTIAL     0x04000000  /* can't use partial with this regex */
237    
238  /* Options for the "extra" block produced by pcre_study(). */  /* Options for the "extra" block produced by pcre_study(). */
239    
# Line 201  time, run time or study time, respective Line 245  time, run time or study time, respective
245  #define PUBLIC_OPTIONS \  #define PUBLIC_OPTIONS \
246    (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \    (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
247     PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \     PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
248     PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK)     PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT)
249    
250  #define PUBLIC_EXEC_OPTIONS \  #define PUBLIC_EXEC_OPTIONS \
251    (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK)    (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
252       PCRE_PARTIAL)
253    
254  #define PUBLIC_STUDY_OPTIONS 0   /* None defined */  #define PUBLIC_STUDY_OPTIONS 0   /* None defined */
255    
# Line 264  definitions below, up to ESC_z. There's Line 309  definitions below, up to ESC_z. There's
309  corresponds to "." rather than an escape sequence. The final one must be  corresponds to "." rather than an escape sequence. The final one must be
310  ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two  ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
311  tests in the code for an escape greater than ESC_b and less than ESC_Z to  tests in the code for an escape greater than ESC_b and less than ESC_Z to
312  detect the types that may be repeated. These are the types that consume a  detect the types that may be repeated. These are the types that consume
313  character. If any new escapes are put in between that don't consume a  characters. If any new escapes are put in between that don't consume a
314  character, that code will have to change. */  character, that code will have to change. */
315    
316  enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,  enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
317         ESC_w, ESC_dum1, ESC_C, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF };         ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
318           ESC_Q, ESC_REF };
319    
320  /* Flag bits and data types for the extended class (OP_XCLASS) for classes that  /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
321  contain UTF-8 characters with values greater than 255. */  contain UTF-8 characters with values greater than 255. */
# Line 280  contain UTF-8 characters with values gre Line 326  contain UTF-8 characters with values gre
326  #define XCL_END       0    /* Marks end of individual items */  #define XCL_END       0    /* Marks end of individual items */
327  #define XCL_SINGLE    1    /* Single item (one multibyte char) follows */  #define XCL_SINGLE    1    /* Single item (one multibyte char) follows */
328  #define XCL_RANGE     2    /* A range (two multibyte chars) follows */  #define XCL_RANGE     2    /* A range (two multibyte chars) follows */
329    #define XCL_PROP      3    /* Unicode property (one property code) follows */
330    #define XCL_NOTPROP   4    /* Unicode inverted property (ditto) */
331    
332    
333  /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets  /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
# Line 305  enum { Line 353  enum {
353    OP_WORDCHAR,           /* 10 \w */    OP_WORDCHAR,           /* 10 \w */
354    OP_ANY,            /* 11 Match any character */    OP_ANY,            /* 11 Match any character */
355    OP_ANYBYTE,        /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */    OP_ANYBYTE,        /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
356    OP_EODN,           /* 13 End of data or \n at end of data: \Z. */    OP_NOTPROP,        /* 13 \P (not Unicode property) */
357    OP_EOD,            /* 14 End of data: \z */    OP_PROP,           /* 14 \p (Unicode property) */
358      OP_EXTUNI,         /* 15 \X (extended Unicode sequence) */
359    OP_OPT,            /* 15 Set runtime options */    OP_EODN,           /* 16 End of data or \n at end of data: \Z. */
360    OP_CIRC,           /* 16 Start of line - varies with multiline switch */    OP_EOD,            /* 17 End of data: \z */
361    OP_DOLL,           /* 17 End of line - varies with multiline switch */  
362    OP_CHARS,          /* 18 Match string of characters */    OP_OPT,            /* 18 Set runtime options */
363    OP_NOT,            /* 19 Match anything but the following char */    OP_CIRC,           /* 19 Start of line - varies with multiline switch */
364      OP_DOLL,           /* 20 End of line - varies with multiline switch */
365    OP_STAR,           /* 20 The maximizing and minimizing versions of */    OP_CHAR,           /* 21 Match one character, casefully */
366    OP_MINSTAR,        /* 21 all these opcodes must come in pairs, with */    OP_CHARNC,         /* 22 Match one character, caselessly */
367    OP_PLUS,           /* 22 the minimizing one second. */    OP_NOT,            /* 23 Match anything but the following char */
368    OP_MINPLUS,        /* 23 This first set applies to single characters */  
369    OP_QUERY,          /* 24 */    OP_STAR,           /* 24 The maximizing and minimizing versions of */
370    OP_MINQUERY,       /* 25 */    OP_MINSTAR,        /* 25 all these opcodes must come in pairs, with */
371    OP_UPTO,           /* 26 From 0 to n matches */    OP_PLUS,           /* 26 the minimizing one second. */
372    OP_MINUPTO,        /* 27 */    OP_MINPLUS,        /* 27 This first set applies to single characters */
373    OP_EXACT,          /* 28 Exactly n matches */    OP_QUERY,          /* 28 */
374      OP_MINQUERY,       /* 29 */
375    OP_NOTSTAR,        /* 29 The maximizing and minimizing versions of */    OP_UPTO,           /* 30 From 0 to n matches */
376    OP_NOTMINSTAR,     /* 30 all these opcodes must come in pairs, with */    OP_MINUPTO,        /* 31 */
377    OP_NOTPLUS,        /* 31 the minimizing one second. */    OP_EXACT,          /* 32 Exactly n matches */
378    OP_NOTMINPLUS,     /* 32 This set applies to "not" single characters */  
379    OP_NOTQUERY,       /* 33 */    OP_NOTSTAR,        /* 33 The maximizing and minimizing versions of */
380    OP_NOTMINQUERY,    /* 34 */    OP_NOTMINSTAR,     /* 34 all these opcodes must come in pairs, with */
381    OP_NOTUPTO,        /* 35 From 0 to n matches */    OP_NOTPLUS,        /* 35 the minimizing one second. */
382    OP_NOTMINUPTO,     /* 36 */    OP_NOTMINPLUS,     /* 36 This set applies to "not" single characters */
383    OP_NOTEXACT,       /* 37 Exactly n matches */    OP_NOTQUERY,       /* 37 */
384      OP_NOTMINQUERY,    /* 38 */
385    OP_TYPESTAR,       /* 38 The maximizing and minimizing versions of */    OP_NOTUPTO,        /* 39 From 0 to n matches */
386    OP_TYPEMINSTAR,    /* 39 all these opcodes must come in pairs, with */    OP_NOTMINUPTO,     /* 40 */
387    OP_TYPEPLUS,       /* 40 the minimizing one second. These codes must */    OP_NOTEXACT,       /* 41 Exactly n matches */
388    OP_TYPEMINPLUS,    /* 41 be in exactly the same order as those above. */  
389    OP_TYPEQUERY,      /* 42 This set applies to character types such as \d */    OP_TYPESTAR,       /* 42 The maximizing and minimizing versions of */
390    OP_TYPEMINQUERY,   /* 43 */    OP_TYPEMINSTAR,    /* 43 all these opcodes must come in pairs, with */
391    OP_TYPEUPTO,       /* 44 From 0 to n matches */    OP_TYPEPLUS,       /* 44 the minimizing one second. These codes must */
392    OP_TYPEMINUPTO,    /* 45 */    OP_TYPEMINPLUS,    /* 45 be in exactly the same order as those above. */
393    OP_TYPEEXACT,      /* 46 Exactly n matches */    OP_TYPEQUERY,      /* 46 This set applies to character types such as \d */
394      OP_TYPEMINQUERY,   /* 47 */
395    OP_CRSTAR,         /* 47 The maximizing and minimizing versions of */    OP_TYPEUPTO,       /* 48 From 0 to n matches */
396    OP_CRMINSTAR,      /* 48 all these opcodes must come in pairs, with */    OP_TYPEMINUPTO,    /* 49 */
397    OP_CRPLUS,         /* 49 the minimizing one second. These codes must */    OP_TYPEEXACT,      /* 50 Exactly n matches */
398    OP_CRMINPLUS,      /* 50 be in exactly the same order as those above. */  
399    OP_CRQUERY,        /* 51 These are for character classes and back refs */    OP_CRSTAR,         /* 51 The maximizing and minimizing versions of */
400    OP_CRMINQUERY,     /* 52 */    OP_CRMINSTAR,      /* 52 all these opcodes must come in pairs, with */
401    OP_CRRANGE,        /* 53 These are different to the three seta above. */    OP_CRPLUS,         /* 53 the minimizing one second. These codes must */
402    OP_CRMINRANGE,     /* 54 */    OP_CRMINPLUS,      /* 54 be in exactly the same order as those above. */
403      OP_CRQUERY,        /* 55 These are for character classes and back refs */
404      OP_CRMINQUERY,     /* 56 */
405      OP_CRRANGE,        /* 57 These are different to the three sets above. */
406      OP_CRMINRANGE,     /* 58 */
407    
408    OP_CLASS,          /* 55 Match a character class, chars < 256 only */    OP_CLASS,          /* 59 Match a character class, chars < 256 only */
409    OP_NCLASS,         /* 56 Same, but the bitmap was created from a negative    OP_NCLASS,         /* 60 Same, but the bitmap was created from a negative
410                             class - the difference is relevant only when a UTF-8                             class - the difference is relevant only when a UTF-8
411                             character > 255 is encountered. */                             character > 255 is encountered. */
412    
413    OP_XCLASS,         /* 57 Extended class for handling UTF-8 chars within the    OP_XCLASS,         /* 61 Extended class for handling UTF-8 chars within the
414                             class. This does both positive and negative. */                             class. This does both positive and negative. */
415    
416    OP_REF,            /* 58 Match a back reference */    OP_REF,            /* 62 Match a back reference */
417    OP_RECURSE,        /* 59 Match a numbered subpattern (possibly recursive) */    OP_RECURSE,        /* 63 Match a numbered subpattern (possibly recursive) */
418    OP_CALLOUT,        /* 60 Call out to external function if provided */    OP_CALLOUT,        /* 64 Call out to external function if provided */
419    
420    OP_ALT,            /* 61 Start of alternation */    OP_ALT,            /* 65 Start of alternation */
421    OP_KET,            /* 62 End of group that doesn't have an unbounded repeat */    OP_KET,            /* 66 End of group that doesn't have an unbounded repeat */
422    OP_KETRMAX,        /* 63 These two must remain together and in this */    OP_KETRMAX,        /* 67 These two must remain together and in this */
423    OP_KETRMIN,        /* 64 order. They are for groups that repeat for ever. */    OP_KETRMIN,        /* 68 order. They are for groups that repeat for ever. */
424    
425    /* The assertions must come before ONCE and COND */    /* The assertions must come before ONCE and COND */
426    
427    OP_ASSERT,         /* 65 Positive lookahead */    OP_ASSERT,         /* 69 Positive lookahead */
428    OP_ASSERT_NOT,     /* 66 Negative lookahead */    OP_ASSERT_NOT,     /* 70 Negative lookahead */
429    OP_ASSERTBACK,     /* 67 Positive lookbehind */    OP_ASSERTBACK,     /* 71 Positive lookbehind */
430    OP_ASSERTBACK_NOT, /* 68 Negative lookbehind */    OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
431    OP_REVERSE,        /* 69 Move pointer back - used in lookbehind assertions */    OP_REVERSE,        /* 73 Move pointer back - used in lookbehind assertions */
432    
433    /* ONCE and COND must come after the assertions, with ONCE first, as there's    /* ONCE and COND must come after the assertions, with ONCE first, as there's
434    a test for >= ONCE for a subpattern that isn't an assertion. */    a test for >= ONCE for a subpattern that isn't an assertion. */
435    
436    OP_ONCE,           /* 70 Once matched, don't back up into the subpattern */    OP_ONCE,           /* 74 Once matched, don't back up into the subpattern */
437    OP_COND,           /* 71 Conditional group */    OP_COND,           /* 75 Conditional group */
438    OP_CREF,           /* 72 Used to hold an extraction string number (cond ref) */    OP_CREF,           /* 76 Used to hold an extraction string number (cond ref) */
439    
440    OP_BRAZERO,        /* 73 These two must remain together and in this */    OP_BRAZERO,        /* 77 These two must remain together and in this */
441    OP_BRAMINZERO,     /* 74 order. */    OP_BRAMINZERO,     /* 78 order. */
442    
443    OP_BRANUMBER,      /* 75 Used for extracting brackets whose number is greater    OP_BRANUMBER,      /* 79 Used for extracting brackets whose number is greater
444                             than can fit into an opcode. */                             than can fit into an opcode. */
445    
446    OP_BRA             /* 76 This and greater values are used for brackets that    OP_BRA             /* 80 This and greater values are used for brackets that
447                             extract substrings up to a basic limit. After that,                             extract substrings up to EXTRACT_BASIC_MAX. After
448                             use is made of OP_BRANUMBER. */                             that, use is made of OP_BRANUMBER. */
449  };  };
450    
451  /* WARNING: There is an implicit assumption in study.c that all opcodes are  /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
452  less than 128 in value. This makes handling UTF-8 character sequences easier.  study.c that all opcodes are less than 128 in value. This makes handling UTF-8
453  */  character sequences easier. */
454    
455    /* The highest extraction number before we have to start using additional
456    bytes. (Originally PCRE didn't have support for extraction counts higher than
457    this number.) The value is limited by the number of opcodes left after OP_BRA,
458    i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
459    opcodes. */
460    
461    #define EXTRACT_BASIC_MAX  100
462    
463    
464  /* This macro defines textual names for all the opcodes. These are used only  /* This macro defines textual names for all the opcodes. These are used only
# Line 407  macro is referenced only in printint.c. Line 467  macro is referenced only in printint.c.
467    
468  #define OP_NAME_LIST \  #define OP_NAME_LIST \
469    "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",                \    "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",                \
470    "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z",     \    "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte",                   \
471    "Opt", "^", "$", "chars", "not",                                \    "notprop", "prop", "extuni",                                    \
472      "\\Z", "\\z",                                                   \
473      "Opt", "^", "$", "char", "charnc", "not",                       \
474    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
475    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
476    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
# Line 431  in UTF-8 mode. The code that uses this t Line 493  in UTF-8 mode. The code that uses this t
493  #define OP_LENGTHS \  #define OP_LENGTHS \
494    1,                             /* End                                    */ \    1,                             /* End                                    */ \
495    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
496    1, 1, 1, 1, 2, 1, 1,           /* Any, Anybyte, \Z, \z, Opt, ^, $        */ \    1, 1,                          /* Any, Anybyte                           */ \
497    2,                             /* Chars - the minimum length             */ \    2, 2, 1,                       /* NOTPROP, PROP, EXTUNI                  */ \
498      1, 1, 2, 1, 1,                 /* \Z, \z, Opt, ^, $                      */ \
499      2,                             /* Char  - the minimum length             */ \
500      2,                             /* Charnc  - the minimum length           */ \
501    2,                             /* not                                    */ \    2,                             /* not                                    */ \
502    /* Positive single-char repeats                            ** These are  */ \    /* Positive single-char repeats                            ** These are  */ \
503    2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??      ** minima in  */ \    2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??      ** minima in  */ \
# Line 451  in UTF-8 mode. The code that uses this t Line 516  in UTF-8 mode. The code that uses this t
516    0,                             /* XCLASS - variable length               */ \    0,                             /* XCLASS - variable length               */ \
517    3,                             /* REF                                    */ \    3,                             /* REF                                    */ \
518    1+LINK_SIZE,                   /* RECURSE                                */ \    1+LINK_SIZE,                   /* RECURSE                                */ \
519    2,                             /* CALLOUT                                */ \    2+2*LINK_SIZE,                 /* CALLOUT                                */ \
520    1+LINK_SIZE,                   /* Alt                                    */ \    1+LINK_SIZE,                   /* Alt                                    */ \
521    1+LINK_SIZE,                   /* Ket                                    */ \    1+LINK_SIZE,                   /* Ket                                    */ \
522    1+LINK_SIZE,                   /* KetRmax                                */ \    1+LINK_SIZE,                   /* KetRmax                                */ \
# Line 469  in UTF-8 mode. The code that uses this t Line 534  in UTF-8 mode. The code that uses this t
534    1+LINK_SIZE                    /* BRA                                    */ \    1+LINK_SIZE                    /* BRA                                    */ \
535    
536    
 /* The highest extraction number before we have to start using additional  
 bytes. (Originally PCRE didn't have support for extraction counts higher than  
 this number.) The value is limited by the number of opcodes left after OP_BRA,  
 i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional  
 opcodes. */  
   
 #define EXTRACT_BASIC_MAX  150  
   
537  /* A magic value for OP_CREF to indicate the "in recursion" condition. */  /* A magic value for OP_CREF to indicate the "in recursion" condition. */
538    
539  #define CREF_RECURSE  0xffff  #define CREF_RECURSE  0xffff
# Line 522  just to accommodate the POSIX wrapper. * Line 579  just to accommodate the POSIX wrapper. *
579  #define ERR34 "character value in \\x{...} sequence is too large"  #define ERR34 "character value in \\x{...} sequence is too large"
580  #define ERR35 "invalid condition (?(0)"  #define ERR35 "invalid condition (?(0)"
581  #define ERR36 "\\C not allowed in lookbehind assertion"  #define ERR36 "\\C not allowed in lookbehind assertion"
582  #define ERR37 "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X"  #define ERR37 "PCRE does not support \\L, \\l, \\N, \\U, or \\u"
583  #define ERR38 "number after (?C is > 255"  #define ERR38 "number after (?C is > 255"
584  #define ERR39 "closing ) for (?C expected"  #define ERR39 "closing ) for (?C expected"
585  #define ERR40 "recursive call could loop indefinitely"  #define ERR40 "recursive call could loop indefinitely"
# Line 530  just to accommodate the POSIX wrapper. * Line 587  just to accommodate the POSIX wrapper. *
587  #define ERR42 "syntax error after (?P"  #define ERR42 "syntax error after (?P"
588  #define ERR43 "two named groups have the same name"  #define ERR43 "two named groups have the same name"
589  #define ERR44 "invalid UTF-8 string"  #define ERR44 "invalid UTF-8 string"
590    #define ERR45 "support for \\P, \\p, and \\X has not been compiled"
591  /* All character handling must be done as unsigned characters. Otherwise there  #define ERR46 "malformed \\P or \\p sequence"
592  are problems with top-bit-set characters and functions such as isspace().  #define ERR47 "unknown property name after \\P or \\p"
 However, we leave the interface to the outside world as char *, because that  
 should make things easier for callers. We define a short type for unsigned char  
 to save lots of typing. I tried "uchar", but it causes problems on Digital  
 Unix, where it is defined in sys/types, so use "uschar" instead. */  
   
 typedef unsigned char uschar;  
593    
594  /* The real format of the start of the pcre block; the index of names and the  /* The real format of the start of the pcre block; the index of names and the
595  code vector run on as long as necessary after the end. */  code vector run on as long as necessary after the end. We store an explicit
596    offset to the name table so that if a regex is compiled on one host, saved, and
597    then run on another where the size of pointers is different, all might still
598    be well. For the case of compiled-on-4 and run-on-8, we include an extra
599    pointer that is always NULL. For future-proofing, we also include a few dummy
600    fields - even though you can never get this planning right!
601    
602    NOTE NOTE NOTE:
603    Because people can now save and re-use compiled patterns, any additions to this
604    structure should be made at the end, and something earlier (e.g. a new
605    flag in the options or one of the dummy fields) should indicate that the new
606    fields are present. Currently PCRE always sets the dummy fields to zero.
607    NOTE NOTE NOTE:
608    */
609    
610  typedef struct real_pcre {  typedef struct real_pcre {
611    unsigned long int magic_number;    pcre_uint32 magic_number;
612    size_t size;                        /* Total that was malloced */    pcre_uint32 size;               /* Total that was malloced */
613    const unsigned char *tables;        /* Pointer to tables */    pcre_uint32 options;
614    unsigned long int options;    pcre_uint32 dummy1;             /* For future use, maybe */
615    unsigned short int top_bracket;  
616    unsigned short int top_backref;    pcre_uint16 top_bracket;
617    unsigned short int first_byte;    pcre_uint16 top_backref;
618    unsigned short int req_byte;    pcre_uint16 first_byte;
619    unsigned short int name_entry_size; /* Size of any name items; 0 => none */    pcre_uint16 req_byte;
620    unsigned short int name_count;      /* Number of name items */    pcre_uint16 name_table_offset;  /* Offset to name table that follows */
621      pcre_uint16 name_entry_size;    /* Size of any name items */
622      pcre_uint16 name_count;         /* Number of name items */
623      pcre_uint16 dummy2;             /* For future use, maybe */
624    
625      const unsigned char *tables;    /* Pointer to tables or NULL for std */
626      const unsigned char *nullpad;   /* NULL padding */
627  } real_pcre;  } real_pcre;
628    
629  /* The format of the block used to store data from pcre_study(). */  /* The format of the block used to store data from pcre_study(). The same
630    remark (see NOTE above) about extending this structure applies. */
631    
632  typedef struct pcre_study_data {  typedef struct pcre_study_data {
633    size_t size;                        /* Total that was malloced */    pcre_uint32 size;               /* Total that was malloced */
634    uschar options;    pcre_uint32 options;
635    uschar start_bits[32];    uschar start_bits[32];
636  } pcre_study_data;  } pcre_study_data;
637    
# Line 573  typedef struct compile_data { Line 644  typedef struct compile_data {
644    const uschar *cbits;          /* Points to character type table */    const uschar *cbits;          /* Points to character type table */
645    const uschar *ctypes;         /* Points to table of type maps */    const uschar *ctypes;         /* Points to table of type maps */
646    const uschar *start_code;     /* The start of the compiled code */    const uschar *start_code;     /* The start of the compiled code */
647      const uschar *start_pattern;  /* The start of the pattern */
648    uschar *name_table;           /* The name/number table */    uschar *name_table;           /* The name/number table */
649    int  names_found;             /* Number of entries so far */    int  names_found;             /* Number of entries so far */
650    int  name_entry_size;         /* Size of each entry */    int  name_entry_size;         /* Size of each entry */
651    int  top_backref;             /* Maximum back reference */    int  top_backref;             /* Maximum back reference */
652    unsigned int backref_map;     /* Bitmap of low back refs */    unsigned int backref_map;     /* Bitmap of low back refs */
653    int  req_varyopt;             /* "After variable item" flag for reqbyte */    int  req_varyopt;             /* "After variable item" flag for reqbyte */
654      BOOL nopartial;               /* Set TRUE if partial won't work */
655  } compile_data;  } compile_data;
656    
657  /* Structure for maintaining a chain of pointers to the currently incomplete  /* Structure for maintaining a chain of pointers to the currently incomplete
# Line 628  typedef struct match_data { Line 701  typedef struct match_data {
701    BOOL   utf8;                  /* UTF8 flag */    BOOL   utf8;                  /* UTF8 flag */
702    BOOL   endonly;               /* Dollar not before final \n */    BOOL   endonly;               /* Dollar not before final \n */
703    BOOL   notempty;              /* Empty string match not wanted */    BOOL   notempty;              /* Empty string match not wanted */
704      BOOL   partial;               /* PARTIAL flag */
705      BOOL   hitend;                /* Hit the end of the subject at some point */
706    const uschar *start_code;     /* For use when recursing */    const uschar *start_code;     /* For use when recursing */
707    const uschar *start_subject;  /* Start of the subject string */    const uschar *start_subject;  /* Start of the subject string */
708    const uschar *end_subject;    /* End of the subject string */    const uschar *end_subject;    /* End of the subject string */

Legend:
Removed from v.74  
changed lines
  Added in v.75

  ViewVC Help
Powered by ViewVC 1.1.5