/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 602 by ph10, Wed May 25 08:29:03 2011 UTC revision 1387 by ph10, Sat Nov 2 18:29:05 2013 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2011 University of Cambridge             Copyright (c) 1997-2013 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
69  /* Macro for setting individual bits in class bitmaps. */  /* Macro for setting individual bits in class bitmaps. */
70    
71  #define SETBIT(a,b) a[b/8] |= (1 << (b%8))  #define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))
72    
73  /* Maximum length value to check against when making sure that the integer that  /* Maximum length value to check against when making sure that the integer that
74  holds the compiled pattern length does not overflow. We make it a bit less than  holds the compiled pattern length does not overflow. We make it a bit less than
# Line 73  to check them every time. */ Line 77  to check them every time. */
77    
78  #define OFLOW_MAX (INT_MAX - 20)  #define OFLOW_MAX (INT_MAX - 20)
79    
80    /* Definitions to allow mutual recursion */
81    
82    static int
83      add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84        const pcre_uint32 *, unsigned int);
85    
86    static BOOL
87      compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88        pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89        compile_data *, int *);
90    
91    
92    
93  /*************************************************  /*************************************************
94  *      Code parameters and static tables         *  *      Code parameters and static tables         *
# Line 88  so this number is very generous. Line 104  so this number is very generous.
104  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
105  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
106  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
108    filled up by repetitions of forward references, for example patterns like
109    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110    that the workspace is expanded using malloc() in this situation. The value
111    below is therefore a minimum, and we put a maximum on it for safety. The
112    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113    kicks in at the same number of forward references in all cases. */
114    
115    #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117    
118    /* This value determines the size of the initial vector that is used for
119    remembering named groups during the pre-compile. It is allocated on the stack,
120    but if it is too small, it is expanded using malloc(), in a similar way to the
121    workspace. The value is the number of slots in the list. */
122    
123  #define COMPILE_WORK_SIZE (4096)  #define NAMED_GROUP_LIST_SIZE  20
124    
125  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
126  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
127    
128  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
129    
130    /* Private flags added to firstchar and reqchar. */
131    
132    #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
133    #define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
134    /* Negative values for the firstchar and reqchar flags */
135    #define REQ_UNSET       (-2)
136    #define REQ_NONE        (-1)
137    
138    /* Repeated character flags. */
139    
140    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
141    
142  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 223  static const int verbcount = sizeof(verb Line 264  static const int verbcount = sizeof(verb
264  now all in a single string, to reduce the number of relocations when a shared  now all in a single string, to reduce the number of relocations when a shared
265  library is dynamically loaded. The list of lengths is terminated by a zero  library is dynamically loaded. The list of lengths is terminated by a zero
266  length entry. The first three must be alpha, lower, upper, as this is assumed  length entry. The first three must be alpha, lower, upper, as this is assumed
267  for handling case independence. */  for handling case independence. The indices for graph, print, and punct are
268    needed, so identify them. */
269    
270  static const char posix_names[] =  static const char posix_names[] =
271    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
# Line 231  static const char posix_names[] = Line 273  static const char posix_names[] =
273    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
274    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
275    
276  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
277    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
278    
279    #define PC_GRAPH  8
280    #define PC_PRINT  9
281    #define PC_PUNCT 10
282    
283    
284  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
285  base map, with an optional addition or removal of another map. Then, for some  base map, with an optional addition or removal of another map. Then, for some
286  classes, there is some additional tweaking: for [:blank:] the vertical space  classes, there is some additional tweaking: for [:blank:] the vertical space
# Line 261  static const int posix_class_maps[] = { Line 308  static const int posix_class_maps[] = {
308    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
309  };  };
310    
311  /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class  /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
312  substitutes must be in the order of the names, defined above, and there are  Unicode property escapes. */
 both positive and negative cases. NULL means no substitute. */  
313    
314  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
315  static const uschar *substitutes[] = {  static const pcre_uchar string_PNd[]  = {
316    (uschar *)"\\P{Nd}",    /* \D */    CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
317    (uschar *)"\\p{Nd}",    /* \d */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
318    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */  static const pcre_uchar string_pNd[]  = {
319    (uschar *)"\\p{Xsp}",   /* \s */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
320    (uschar *)"\\P{Xwd}",   /* \W */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
321    (uschar *)"\\p{Xwd}"    /* \w */  static const pcre_uchar string_PXsp[] = {
322      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
323      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
324    static const pcre_uchar string_pXsp[] = {
325      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
326      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
327    static const pcre_uchar string_PXwd[] = {
328      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
329      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
330    static const pcre_uchar string_pXwd[] = {
331      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333    
334    static const pcre_uchar *substitutes[] = {
335      string_PNd,           /* \D */
336      string_pNd,           /* \d */
337      string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
338      string_pXsp,          /* \s */   /* space and POSIX space are the same. */
339      string_PXwd,          /* \W */
340      string_pXwd           /* \w */
341  };  };
342    
343  static const uschar *posix_substitutes[] = {  /* The POSIX class substitutes must be in the order of the POSIX class names,
344    (uschar *)"\\p{L}",     /* alpha */  defined above, and there are both positive and negative cases. NULL means no
345    (uschar *)"\\p{Ll}",    /* lower */  general substitute of a Unicode property escape (\p or \P). However, for some
346    (uschar *)"\\p{Lu}",    /* upper */  POSIX classes (e.g. graph, print, punct) a special property code is compiled
347    (uschar *)"\\p{Xan}",   /* alnum */  directly. */
348    NULL,                   /* ascii */  
349    (uschar *)"\\h",        /* blank */  static const pcre_uchar string_pL[] =   {
350    NULL,                   /* cntrl */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
351    (uschar *)"\\p{Nd}",    /* digit */    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
352    NULL,                   /* graph */  static const pcre_uchar string_pLl[] =  {
353    NULL,                   /* print */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
354    NULL,                   /* punct */    CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
355    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */  static const pcre_uchar string_pLu[] =  {
356    (uschar *)"\\p{Xwd}",   /* word */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
357    NULL,                   /* xdigit */    CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
358    static const pcre_uchar string_pXan[] = {
359      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
360      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
361    static const pcre_uchar string_h[] =    {
362      CHAR_BACKSLASH, CHAR_h, '\0' };
363    static const pcre_uchar string_pXps[] = {
364      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
365      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
366    static const pcre_uchar string_PL[] =   {
367      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
368      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
369    static const pcre_uchar string_PLl[] =  {
370      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
371      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372    static const pcre_uchar string_PLu[] =  {
373      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
374      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
375    static const pcre_uchar string_PXan[] = {
376      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
377      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
378    static const pcre_uchar string_H[] =    {
379      CHAR_BACKSLASH, CHAR_H, '\0' };
380    static const pcre_uchar string_PXps[] = {
381      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
382      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
383    
384    static const pcre_uchar *posix_substitutes[] = {
385      string_pL,            /* alpha */
386      string_pLl,           /* lower */
387      string_pLu,           /* upper */
388      string_pXan,          /* alnum */
389      NULL,                 /* ascii */
390      string_h,             /* blank */
391      NULL,                 /* cntrl */
392      string_pNd,           /* digit */
393      NULL,                 /* graph */
394      NULL,                 /* print */
395      NULL,                 /* punct */
396      string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
397      string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
398      NULL,                 /* xdigit */
399    /* Negated cases */    /* Negated cases */
400    (uschar *)"\\P{L}",     /* ^alpha */    string_PL,            /* ^alpha */
401    (uschar *)"\\P{Ll}",    /* ^lower */    string_PLl,           /* ^lower */
402    (uschar *)"\\P{Lu}",    /* ^upper */    string_PLu,           /* ^upper */
403    (uschar *)"\\P{Xan}",   /* ^alnum */    string_PXan,          /* ^alnum */
404    NULL,                   /* ^ascii */    NULL,                 /* ^ascii */
405    (uschar *)"\\H",        /* ^blank */    string_H,             /* ^blank */
406    NULL,                   /* ^cntrl */    NULL,                 /* ^cntrl */
407    (uschar *)"\\P{Nd}",    /* ^digit */    string_PNd,           /* ^digit */
408    NULL,                   /* ^graph */    NULL,                 /* ^graph */
409    NULL,                   /* ^print */    NULL,                 /* ^print */
410    NULL,                   /* ^punct */    NULL,                 /* ^punct */
411    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
412    (uschar *)"\\P{Xwd}",   /* ^word */    string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
413    NULL                    /* ^xdigit */    NULL                  /* ^xdigit */
414  };  };
415  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
416  #endif  #endif
417    
418  #define STRING(a)  # a  #define STRING(a)  # a
# Line 365  static const char error_texts[] = Line 471  static const char error_texts[] =
471    /* 30 */    /* 30 */
472    "unknown POSIX class name\0"    "unknown POSIX class name\0"
473    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
474    "this version of PCRE is not compiled with PCRE_UTF8 support\0"    "this version of PCRE is compiled without UTF support\0"
475    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
476    "character value in \\x{...} sequence is too large\0"    "character value in \\x{} or \\o{} is too large\0"
477    /* 35 */    /* 35 */
478    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
479    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
# Line 388  static const char error_texts[] = Line 494  static const char error_texts[] =
494    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
495    /* 50 */    /* 50 */
496    "repeated subpattern is too long\0"    /** DEAD **/    "repeated subpattern is too long\0"    /** DEAD **/
497    "octal value is greater than \\377 (not in UTF-8 mode)\0"    "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
498    "internal error: overran compiling workspace\0"    "internal error: overran compiling workspace\0"
499    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
500    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
501    /* 55 */    /* 55 */
502    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
503    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
504    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
505    "a numbered reference must not be zero\0"    "a numbered reference must not be zero\0"
506    "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"    "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
507    /* 60 */    /* 60 */
508    "(*VERB) not recognized\0"    "(*VERB) not recognized or malformed\0"
509    "number is too big\0"    "number is too big\0"
510    "subpattern name expected\0"    "subpattern name expected\0"
511    "digit expected after (?+\0"    "digit expected after (?+\0"
# Line 407  static const char error_texts[] = Line 513  static const char error_texts[] =
513    /* 65 */    /* 65 */
514    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
515    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
516    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with Unicode property support\0"
517    "\\c must be followed by an ASCII character\0"    "\\c must be followed by an ASCII character\0"
518      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
519      /* 70 */
520      "internal error: unknown opcode in find_fixedlength()\0"
521      "\\N is not supported in a class\0"
522      "too many forward references\0"
523      "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
524      "invalid UTF-16 string\0"
525      /* 75 */
526      "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
527      "character value in \\u.... sequence is too large\0"
528      "invalid UTF-32 string\0"
529      "setting UTF is disabled by the application\0"
530      "non-hex character in \\x{} (closing brace missing?)\0"
531      /* 80 */
532      "non-octal character in \\o{} (closing brace missing?)\0"
533      "missing opening brace after \\o\0"
534    ;    ;
535    
536  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 427  For convenience, we use the same bit def Line 549  For convenience, we use the same bit def
549    
550  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
551    
552    /* Using a simple comparison for decimal numbers rather than a memory read
553    is much faster, and the resulting code is simpler (the compiler turns it
554    into a subtraction and unsigned comparison). */
555    
556    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
557    
558  #ifndef EBCDIC  #ifndef EBCDIC
559    
560  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
561  UTF-8 mode. */  UTF-8 mode. */
562    
563  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
564    {    {
565    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
566    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 471  static const unsigned char digitab[] = Line 599  static const unsigned char digitab[] =
599    
600  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
601    
602  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
603    {    {
604    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
605    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 506  static const unsigned char digitab[] = Line 634  static const unsigned char digitab[] =
634    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
635    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
636    
637  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
638    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
639    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
640    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 542  static const unsigned char ebcdic_charta Line 670  static const unsigned char ebcdic_charta
670  #endif  #endif
671    
672    
673  /* Definition to allow mutual recursion */  /* This table is used to check whether auto-possessification is possible
674    between adjacent character-type opcodes. The left-hand (repeated) opcode is
675    used to select the row, and the right-hand opcode is used to select the column.
676    A value of 1 means that auto-possessification is OK. For example, the second
677    value in the first row means that \D+\d can be turned into \D++\d.
678    
679    The Unicode property types (\P and \p) have to be present to fill out the table
680    because of what their opcode values are, but the table values should always be
681    zero because property types are handled separately in the code. The last four
682    columns apply to items that cannot be repeated, so there is no need to have
683    rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
684    *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
685    
686    #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
687    #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
688    
689    static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
690    /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
691      { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
692      { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
693      { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
694      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
695      { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
696      { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
697      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
698      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
699      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
700      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
701      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
702      { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
703      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
704      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
705      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
706      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
707      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
708    };
709    
710    
711  static BOOL  /* This table is used to check whether auto-possessification is possible
712    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,  between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
713      int *, int *, branch_chain *, compile_data *, int *);  left-hand (repeated) opcode is used to select the row, and the right-hand
714    opcode is used to select the column. The values are as follows:
715    
716      0   Always return FALSE (never auto-possessify)
717      1   Character groups are distinct (possessify if both are OP_PROP)
718      2   Check character categories in the same group (general or particular)
719      3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
720    
721      4   Check left general category vs right particular category
722      5   Check right general category vs left particular category
723    
724      6   Left alphanum vs right general category
725      7   Left space vs right general category
726      8   Left word vs right general category
727    
728      9   Right alphanum vs left general category
729     10   Right space vs left general category
730     11   Right word vs left general category
731    
732     12   Left alphanum vs right particular category
733     13   Left space vs right particular category
734     14   Left word vs right particular category
735    
736     15   Right alphanum vs left particular category
737     16   Right space vs left particular category
738     17   Right word vs left particular category
739    */
740    
741    static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
742    /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
743      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
744      { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
745      { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
746      { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
747      { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
748      { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
749      { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
750      { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
751      { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
752      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
753      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
754    };
755    
756    /* This table is used to check whether auto-possessification is possible
757    between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
758    specifies a general category and the other specifies a particular category. The
759    row is selected by the general category and the column by the particular
760    category. The value is 1 if the particular category is not part of the general
761    category. */
762    
763    static const pcre_uint8 catposstab[7][30] = {
764    /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
765      { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
766      { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
767      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
768      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
769      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
770      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
771      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
772    };
773    
774    /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
775    a general or particular category. The properties in each row are those
776    that apply to the character set in question. Duplication means that a little
777    unnecessary work is done when checking, but this keeps things much simpler
778    because they can all use the same code. For more details see the comment where
779    this table is used.
780    
781    Note: SPACE and PXSPACE used to be different because Perl excluded VT from
782    "space", but from Perl 5.18 it's included, so both categories are treated the
783    same here. */
784    
785    static const pcre_uint8 posspropstab[3][4] = {
786      { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
787      { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
788      { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
789    };
790    
791    /* This table is used when converting repeating opcodes into possessified
792    versions as a result of an explicit possessive quantifier such as ++. A zero
793    value means there is no possessified version - in those cases the item in
794    question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
795    because all relevant opcodes are less than that. */
796    
797    static const pcre_uint8 opcode_possessify[] = {
  /* Each slot holds the possessive opcode to use in place of the opcode named
  in the row comment, or 0 when that opcode has no possessive form of its own.
  In every pair below only the first (greedy) opcode has a non-zero entry; the
  minimizing, exact and already-possessive slots are all 0.
  NOTE(review): the layout is positional, so it presumably has to stay in step
  with the opcode numbering defined elsewhere - confirm whenever an opcode is
  added, removed or moved. */
798      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
799      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
800    
801      0,                       /* NOTI */
802      OP_POSSTAR, 0,           /* STAR, MINSTAR */
803      OP_POSPLUS, 0,           /* PLUS, MINPLUS */
804      OP_POSQUERY, 0,          /* QUERY, MINQUERY */
805      OP_POSUPTO, 0,           /* UPTO, MINUPTO */
806      0,                       /* EXACT */
807      0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
808    
809      OP_POSSTARI, 0,          /* STARI, MINSTARI */
810      OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
811      OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
812      OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
813      0,                       /* EXACTI */
814      0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
815    
816      OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
817      OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
818      OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
819      OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
820      0,                       /* NOTEXACT */
821      0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
822    
823      OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
824      OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
825      OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
826      OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
827      0,                       /* NOTEXACTI */
828      0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
829    
830      OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
831      OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
832      OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
833      OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
834      0,                       /* TYPEEXACT */
835      0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
836    
837      OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
838      OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
839      OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
840      OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
841      0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
842    
843      0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
844      0, 0,                    /* REF, REFI */
845      0, 0,                    /* DNREF, DNREFI */
846      0, 0                     /* RECURSE, CALLOUT */
847    };
848    
849    
850    
# Line 569  find_error_text(int n) Line 867  find_error_text(int n)
867  const char *s = error_texts;  const char *s = error_texts;
868  for (; n > 0; n--)  for (; n > 0; n--)
869    {    {
870    while (*s++ != 0) {};    while (*s++ != CHAR_NULL) {};
871    if (*s == 0) return "Error text not found (please report)";    if (*s == CHAR_NULL) return "Error text not found (please report)";
872    }    }
873  return s;  return s;
874  }  }
875    
876    
877    
878    /*************************************************
879    *           Expand the workspace                 *
880    *************************************************/
881    
882    /* This function is called during the second compiling phase, if the number of
883    forward references fills the existing workspace, which is originally a block on
884    the stack. A larger block is obtained from malloc() unless the ultimate limit
885    has been reached or the increase will be rather small.
886    
887    Argument: pointer to the compile data block
888    Returns:  0 if all went well, else an error number
889    */
890    
891    static int
892    expand_workspace(compile_data *cd)
893    {
894    pcre_uchar *newspace;
895    int newsize = cd->workspace_size * 2;
896    
897    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
898    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
899        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
900     return ERR72;
901    
902    newspace = (PUBL(malloc))(IN_UCHARS(newsize));
903    if (newspace == NULL) return ERR21;
904    memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
905    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
906    if (cd->workspace_size > COMPILE_WORK_SIZE)
907      (PUBL(free))((void *)cd->start_workspace);
908    cd->start_workspace = newspace;
909    cd->workspace_size = newsize;
910    return 0;
911    }
912    
913    
914    
915    /*************************************************
916    *            Check for counted repeat            *
917    *************************************************/
918    
919    /* This function is called when a '{' is encountered in a place where it might
920    start a quantifier. It looks ahead to see if it really is a quantifier or not.
921    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
922    where the ddds are digits.
923    
924    Arguments:
925      p         pointer to the first char after '{'
926    
927    Returns:    TRUE or FALSE
928    */
929    
930    static BOOL
931    is_counted_repeat(const pcre_uchar *p)
932    {
933    if (!IS_DIGIT(*p)) return FALSE;
934    p++;
935    while (IS_DIGIT(*p)) p++;
936    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
937    
938    if (*p++ != CHAR_COMMA) return FALSE;
939    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
940    
941    if (!IS_DIGIT(*p)) return FALSE;
942    p++;
943    while (IS_DIGIT(*p)) p++;
944    
945    return (*p == CHAR_RIGHT_CURLY_BRACKET);
946    }
947    
948    
949    
950  /*************************************************  /*************************************************
951  *            Handle escapes                      *  *            Handle escapes                      *
952  *************************************************/  *************************************************/
953    
954  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
955  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or 0 for a data character which
956  encodes one of the more complicated things such as \d. A backreference to group  will be placed in chptr. A backreference to group n is returned as negative n.
957  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When  When UTF-8 is enabled, a positive value greater than 255 may be returned in
958  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
959  ptr is pointing at the \. On exit, it is on the final character of the escape  character of the escape sequence.
 sequence.  
960    
961  Arguments:  Arguments:
962    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
963      chptr          points to a returned data character
964    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
965    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
966    options        the options bits    options        the options bits
967    isclass        TRUE if inside a character class    isclass        TRUE if inside a character class
968    
969  Returns:         zero or positive => a data character  Returns:         zero => a data character
970                   negative => a special escape sequence                   positive => a special escape sequence
971                     negative => a back reference
972                   on error, errorcodeptr is set                   on error, errorcodeptr is set
973  */  */
974    
975  static int  static int
976  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
977    int options, BOOL isclass)    int bracount, int options, BOOL isclass)
978  {  {
979  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
980  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
981  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
982    pcre_uint32 c;
983    int escape = 0;
984    int i;
985    
986  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
987  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
988    
989  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
990    
991  if (c == 0) *errorcodeptr = ERR1;  if (c == CHAR_NULL) *errorcodeptr = ERR1;
992    
993  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
994  in a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
995  Otherwise further processing may be required. */  Otherwise further processing may be required. */
996    
997  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
998  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
999  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if (c < CHAR_0 || c > CHAR_z) {}
1000    else if ((i = escapes[c - CHAR_0]) != 0)
1001      { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1002    
1003  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1004  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
1005  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1006    else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1007  #endif  #endif
1008    
1009  /* Escapes that need further processing, or are illegal. */  /* Escapes that need further processing, or are illegal. */
1010    
1011  else  else
1012    {    {
1013    const uschar *oldptr;    const pcre_uchar *oldptr;
1014    BOOL braced, negated;    BOOL braced, negated, overflow;
1015      int s;
1016    
1017    switch (c)    switch (c)
1018      {      {
# Line 642  else Line 1021  else
1021    
1022      case CHAR_l:      case CHAR_l:
1023      case CHAR_L:      case CHAR_L:
1024        *errorcodeptr = ERR37;
1025        break;
1026    
1027      case CHAR_u:      case CHAR_u:
1028        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1029          {
1030          /* In JavaScript, \u must be followed by four hexadecimal numbers.
1031          Otherwise it is a lowercase u letter. */
1032          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1033            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1034            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1035            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1036            {
1037            c = 0;
1038            for (i = 0; i < 4; ++i)
1039              {
1040              register pcre_uint32 cc = *(++ptr);
1041    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1042              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1043              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1044    #else           /* EBCDIC coding */
1045              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1046              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1047    #endif
1048              }
1049    
1050    #if defined COMPILE_PCRE8
1051            if (c > (utf ? 0x10ffffU : 0xffU))
1052    #elif defined COMPILE_PCRE16
1053            if (c > (utf ? 0x10ffffU : 0xffffU))
1054    #elif defined COMPILE_PCRE32
1055            if (utf && c > 0x10ffffU)
1056    #endif
1057              {
1058              *errorcodeptr = ERR76;
1059              }
1060            else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1061            }
1062          }
1063        else
1064          *errorcodeptr = ERR37;
1065        break;
1066    
1067      case CHAR_U:      case CHAR_U:
1068      *errorcodeptr = ERR37;      /* In JavaScript, \U is an uppercase U letter. */
1069        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1070      break;      break;
1071    
1072      /* \g must be followed by one of a number of specific things:      /* In a character class, \g is just a literal "g". Outside a character
1073        class, \g must be followed by one of a number of specific things:
1074    
1075      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
1076      backreference. If negative, it is a relative backreference. This is a Perl      backreference. If negative, it is a relative backreference. This is a Perl
# Line 661  else Line 1084  else
1084      (3) For Oniguruma compatibility we also support \g followed by a name or a      (3) For Oniguruma compatibility we also support \g followed by a name or a
1085      number either in angle brackets or in single quotes. However, these are      number either in angle brackets or in single quotes. However, these are
1086      (possibly recursive) subroutine calls, _not_ backreferences. Just return      (possibly recursive) subroutine calls, _not_ backreferences. Just return
1087      the -ESC_g code (cf \k). */      the ESC_g code (cf \k). */
1088    
1089      case CHAR_g:      case CHAR_g:
1090        if (isclass) break;
1091      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1092        {        {
1093        c = -ESC_g;        escape = ESC_g;
1094        break;        break;
1095        }        }
1096    
# Line 674  else Line 1098  else
1098    
1099      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1100        {        {
1101        const uschar *p;        const pcre_uchar *p;
1102        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1103          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1104        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1105          {          {
1106          c = -ESC_k;          escape = ESC_k;
1107          break;          break;
1108          }          }
1109        braced = TRUE;        braced = TRUE;
# Line 694  else Line 1118  else
1118        }        }
1119      else negated = FALSE;      else negated = FALSE;
1120    
1121      c = 0;      /* The integer range is limited by the machine's int representation. */
1122      while ((digitab[ptr[1]] & ctype_digit) != 0)      s = 0;
1123        c = c * 10 + *(++ptr) - CHAR_0;      overflow = FALSE;
1124        while (IS_DIGIT(ptr[1]))
     if (c < 0)   /* Integer overflow */  
1125        {        {
1126          if (s > INT_MAX / 10 - 1) /* Integer overflow */
1127            {
1128            overflow = TRUE;
1129            break;
1130            }
1131          s = s * 10 + (int)(*(++ptr) - CHAR_0);
1132          }
1133        if (overflow) /* Integer overflow */
1134          {
1135          while (IS_DIGIT(ptr[1]))
1136            ptr++;
1137        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
1138        break;        break;
1139        }        }
# Line 710  else Line 1144  else
1144        break;        break;
1145        }        }
1146    
1147      if (c == 0)      if (s == 0)
1148        {        {
1149        *errorcodeptr = ERR58;        *errorcodeptr = ERR58;
1150        break;        break;
# Line 718  else Line 1152  else
1152    
1153      if (negated)      if (negated)
1154        {        {
1155        if (c > bracount)        if (s > bracount)
1156          {          {
1157          *errorcodeptr = ERR15;          *errorcodeptr = ERR15;
1158          break;          break;
1159          }          }
1160        c = bracount - (c - 1);        s = bracount - (s - 1);
1161        }        }
1162    
1163      c = -(ESC_REF + c);      escape = -s;
1164      break;      break;
1165    
1166      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1167      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1168      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1169        recommended to avoid the ambiguities in the old syntax.
1170    
1171      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1172      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1173      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1174      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1175      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1176      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1177      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1178    
1179        Inside a character class, \ followed by a digit is always either a literal
1180        8 or 9 or an octal number. */
1181    
1182      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1183      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 747  else Line 1185  else
1185      if (!isclass)      if (!isclass)
1186        {        {
1187        oldptr = ptr;        oldptr = ptr;
1188        c -= CHAR_0;        /* The integer range is limited by the machine's int representation. */
1189        while ((digitab[ptr[1]] & ctype_digit) != 0)        s = (int)(c -CHAR_0);
1190          c = c * 10 + *(++ptr) - CHAR_0;        overflow = FALSE;
1191        if (c < 0)    /* Integer overflow */        while (IS_DIGIT(ptr[1]))
1192            {
1193            if (s > INT_MAX / 10 - 1) /* Integer overflow */
1194              {
1195              overflow = TRUE;
1196              break;
1197              }
1198            s = s * 10 + (int)(*(++ptr) - CHAR_0);
1199            }
1200          if (overflow) /* Integer overflow */
1201          {          {
1202            while (IS_DIGIT(ptr[1]))
1203              ptr++;
1204          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1205          break;          break;
1206          }          }
1207        if (c < 10 || c <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1208          {          {
1209          c = -(ESC_REF + c);          escape = -s;
1210          break;          break;
1211          }          }
1212        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1213        }        }
1214    
1215      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1216      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1217      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1218        changed so as not to insert the binary zero. */
1219    
1220      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1221        {  
1222        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1223    
1224      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1225      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
1226      significant 8 bits of octal numbers (I think this is what early Perls used      significant 8 bits of octal numbers (I think this is what early Perls used
1227      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1228      than 3 octal digits. */      but no more than 3 octal digits. */
1229    
1230      case CHAR_0:      case CHAR_0:
1231      c -= CHAR_0;      c -= CHAR_0;
1232      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1233          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
1234      if (!utf8 && c > 255) *errorcodeptr = ERR51;  #ifdef COMPILE_PCRE8
1235        if (!utf && c > 0xff) *errorcodeptr = ERR51;
1236    #endif
1237      break;      break;
1238    
1239      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1240      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      specifying character codes in octal. The only supported form is \o{ddd}. */
     treated as a data character. */  
1241    
1242      case CHAR_x:      case CHAR_o:
1243      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1244        {        {
1245        const uschar *pt = ptr + 2;        ptr += 2;
       int count = 0;  
   
1246        c = 0;        c = 0;
1247        while ((digitab[*pt] & ctype_xdigit) != 0)        overflow = FALSE;
1248          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1249          {          {
1250          register int cc = *pt++;          register pcre_uint32 cc = *ptr++;
1251          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1252          count++;  #ifdef COMPILE_PCRE32
1253            if (c >= 0x20000000l) { overflow = TRUE; break; }
1254    #endif
1255            c = (c << 3) + cc - CHAR_0 ;
1256    #if defined COMPILE_PCRE8
1257            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1258    #elif defined COMPILE_PCRE16
1259            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1260    #elif defined COMPILE_PCRE32
1261            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1262    #endif
1263            }
1264          if (overflow)
1265            {
1266            while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1267            *errorcodeptr = ERR34;
1268            }
1269          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1270            {
1271            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1272            }
1273          else *errorcodeptr = ERR80;
1274          }
1275        break;
1276    
1277        /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1278        numbers. Otherwise it is a lowercase x letter. */
1279    
1280        case CHAR_x:
1281        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1282          {
1283          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1284            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1285            {
1286            c = 0;
1287            for (i = 0; i < 2; ++i)
1288              {
1289              register pcre_uint32 cc = *(++ptr);
1290  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1291          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1292          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1293  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1294          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1295          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1296  #endif  #endif
1297              }
1298          }          }
1299          }    /* End JavaScript handling */
1300    
1301        /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1302        greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1303        digits. If not, { used to be treated as a data character. However, Perl
1304        seems to read hex digits up to the first non-such, and ignore the rest, so
1305        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1306        now gives an error. */
1307    
1308        if (*pt == CHAR_RIGHT_CURLY_BRACKET)      else
1309          {
1310          if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1311          {          {
1312          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          ptr += 2;
1313          ptr = pt;          c = 0;
1314          break;          overflow = FALSE;
1315          }          while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1316              {
1317              register pcre_uint32 cc = *ptr++;
1318              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1319    
1320        /* If the sequence of hex digits does not end with '}', then we don't  #ifdef COMPILE_PCRE32
1321        recognize this construct; fall through to the normal \x handling. */            if (c >= 0x10000000l) { overflow = TRUE; break; }
1322        }  #endif
1323    
1324    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1325              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1326              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1327    #else           /* EBCDIC coding */
1328              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1329              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1330    #endif
1331    
1332    #if defined COMPILE_PCRE8
1333              if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1334    #elif defined COMPILE_PCRE16
1335              if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1336    #elif defined COMPILE_PCRE32
1337              if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1338    #endif
1339              }
1340    
1341      /* Read just a single-byte hex-defined char */          if (overflow)
1342              {
1343              while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1344              *errorcodeptr = ERR34;
1345              }
1346    
1347      c = 0;          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1348      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)            {
1349        {            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1350        int cc;                                  /* Some compilers don't like */            }
1351        cc = *(++ptr);                           /* ++ in initializers */  
1352            /* If the sequence of hex digits does not end with '}', give an error.
1353            We used just to recognize this construct and fall through to the normal
1354            \x handling, but nowadays Perl gives an error, which seems much more
1355            sensible, so we do too. */
1356    
1357            else *errorcodeptr = ERR79;
1358            }   /* End of \x{} processing */
1359    
1360          /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1361    
1362          else
1363            {
1364            c = 0;
1365            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1366              {
1367              pcre_uint32 cc;                          /* Some compilers don't like */
1368              cc = *(++ptr);                           /* ++ in initializers */
1369  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1370        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1371        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1372  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1373        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */            if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1374        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1375  #endif  #endif
1376        }            }
1377            }     /* End of \xdd handling */
1378          }       /* End of Perl-style \x handling */
1379      break;      break;
1380    
1381      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
# Line 848  else Line 1385  else
1385    
1386      case CHAR_c:      case CHAR_c:
1387      c = *(++ptr);      c = *(++ptr);
1388      if (c == 0)      if (c == CHAR_NULL)
1389        {        {
1390        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
1391        break;        break;
# Line 885  else Line 1422  else
1422    }    }
1423    
1424  /* Perl supports \N{name} for character names, as well as plain \N for "not  /* Perl supports \N{name} for character names, as well as plain \N for "not
1425  newline". PCRE does not support \N{name}. */  newline". PCRE does not support \N{name}. However, it does support
1426    quantification such as \N{2,3}. */
1427    
1428  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)  if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1429         !is_counted_repeat(ptr+2))
1430    *errorcodeptr = ERR37;    *errorcodeptr = ERR37;
1431    
1432  /* If PCRE_UCP is set, we change the values for \d etc. */  /* If PCRE_UCP is set, we change the values for \d etc. */
1433    
1434  if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)  if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1435    c -= (ESC_DU - ESC_D);    escape += (ESC_DU - ESC_D);
1436    
1437  /* Set the pointer to the final character before returning. */  /* Set the pointer to the final character before returning. */
1438    
1439  *ptrptr = ptr;  *ptrptr = ptr;
1440  return c;  *chptr = c;
1441    return escape;
1442  }  }
1443    
1444    
# Line 916  escape sequence. Line 1456  escape sequence.
1456  Argument:  Argument:
1457    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
1458    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
1459    dptr           points to an int that is set to the detailed property value    ptypeptr       points to an unsigned int that is set to the type value
1460      pdataptr       points to an unsigned int that is set to the detailed property value
1461    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
1462    
1463  Returns:         type value from ucp_type_table, or -1 for an invalid type  Returns:         TRUE if the type value was found, or FALSE for an invalid type
1464  */  */
1465    
1466  static int  static BOOL
1467  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1468      unsigned int *pdataptr, int *errorcodeptr)
1469  {  {
1470  int c, i, bot, top;  pcre_uchar c;
1471  const uschar *ptr = *ptrptr;  int i, bot, top;
1472  char name[32];  const pcre_uchar *ptr = *ptrptr;
1473    pcre_uchar name[32];
1474    
1475  c = *(++ptr);  c = *(++ptr);
1476  if (c == 0) goto ERROR_RETURN;  if (c == CHAR_NULL) goto ERROR_RETURN;
1477    
1478  *negptr = FALSE;  *negptr = FALSE;
1479    
# Line 944  if (c == CHAR_LEFT_CURLY_BRACKET) Line 1487  if (c == CHAR_LEFT_CURLY_BRACKET)
1487      *negptr = TRUE;      *negptr = TRUE;
1488      ptr++;      ptr++;
1489      }      }
1490    for (i = 0; i < (int)sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1491      {      {
1492      c = *(++ptr);      c = *(++ptr);
1493      if (c == 0) goto ERROR_RETURN;      if (c == CHAR_NULL) goto ERROR_RETURN;
1494      if (c == CHAR_RIGHT_CURLY_BRACKET) break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1495      name[i] = c;      name[i] = c;
1496      }      }
# Line 968  else Line 1511  else
1511  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1512    
1513  bot = 0;  bot = 0;
1514  top = _pcre_utt_size;  top = PRIV(utt_size);
1515    
1516  while (bot < top)  while (bot < top)
1517    {    {
1518      int r;
1519    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1520    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1521    if (c == 0)    if (r == 0)
1522      {      {
1523      *dptr = _pcre_utt[i].value;      *ptypeptr = PRIV(utt)[i].type;
1524      return _pcre_utt[i].type;      *pdataptr = PRIV(utt)[i].value;
1525        return TRUE;
1526      }      }
1527    if (c > 0) bot = i + 1; else top = i;    if (r > 0) bot = i + 1; else top = i;
1528    }    }
1529    
1530  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
1531  *ptrptr = ptr;  *ptrptr = ptr;
1532  return -1;  return FALSE;
1533    
1534  ERROR_RETURN:  ERROR_RETURN:
1535  *errorcodeptr = ERR46;  *errorcodeptr = ERR46;
1536  *ptrptr = ptr;  *ptrptr = ptr;
1537  return -1;  return FALSE;
1538  }  }
1539  #endif  #endif
1540    
1541    
1542    
   
 /*************************************************  
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if (*p++ != CHAR_COMMA) return FALSE;  
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
   
 return (*p == CHAR_RIGHT_CURLY_BRACKET);  
 }  
   
   
   
1543  /*************************************************  /*************************************************
1544  *         Read repeat counts                     *  *         Read repeat counts                     *
1545  *************************************************/  *************************************************/
# Line 1048  Returns:         pointer to '}' on succe Line 1559  Returns:         pointer to '}' on succe
1559                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1560  */  */
1561    
1562  static const uschar *  static const pcre_uchar *
1563  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1564  {  {
1565  int min = 0;  int min = 0;
1566  int max = -1;  int max = -1;
# Line 1057  int max = -1; Line 1568  int max = -1;
1568  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1569  an integer overflow. */  an integer overflow. */
1570    
1571  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1572  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1573    {    {
1574    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 1072  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1583  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1583    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1584      {      {
1585      max = 0;      max = 0;
1586      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1587      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1588        {        {
1589        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 1097  return p; Line 1608  return p;
1608    
1609    
1610  /*************************************************  /*************************************************
1611  *  Subroutine for finding forward reference      *  *      Find first significant op code            *
1612  *************************************************/  *************************************************/
1613    
1614  /* This recursive function is called only from find_parens() below. The  /* This is called by several functions that scan a compiled expression looking
1615  top-level call starts at the beginning of the pattern. All other calls must  for a fixed first character, or an anchoring op code etc. It skips over things
1616  start at a parenthesis. It scans along a pattern's text looking for capturing  that do not influence this. For some calls, it makes sense to skip negative
1617  subpatterns, and counting them. If it finds a named pattern that matches the  forward and all backward assertions, and also the \b assertion; for others it
1618  name it is given, it returns its number. Alternatively, if the name is NULL, it  does not.
 returns when it reaches a given numbered subpattern. Recursion is used to keep  
 track of subpatterns that reset the capturing group numbers - the (?| feature.  
   
 This function was originally called only from the second pass, in which we know  
 that if (?< or (?' or (?P< is encountered, the name will be correctly  
 terminated because that is checked in the first pass. There is now one call to  
 this function in the first pass, to check for a recursive back reference by  
 name (so that we can make the whole group atomic). In this case, we need check  
 only up to the current position in the pattern, and that is still OK because  
 and previous occurrences will have been checked. To make this work, the test  
 for "end of pattern" is a check against cd->end_pattern in the main loop,  
 instead of looking for a binary zero. This means that the special first-pass  
 call can adjust cd->end_pattern temporarily. (Checks for binary zero while  
 processing items within the loop are OK, because afterwards the main loop will  
 terminate.)  
1619    
1620  Arguments:  Arguments:
1621    ptrptr       address of the current character pointer (updated)    code         pointer to the start of the group
1622    cd           compile background data    skipassert   TRUE if certain assertions are to be skipped
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf8         TRUE if we are in UTF-8 mode  
   count        pointer to the current capturing subpattern number (updated)  
1623    
1624  Returns:       the number of the named subpattern, or -1 if not found  Returns:       pointer to the first significant opcode
1625  */  */
1626    
1627  static int  static const pcre_uchar*
1628  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  first_significant_code(const pcre_uchar *code, BOOL skipassert)
   BOOL xmode, BOOL utf8, int *count)  
1629  {  {
1630  uschar *ptr = *ptrptr;  for (;;)
 int start_count = *count;  
 int hwm_count = start_count;  
 BOOL dup_parens = FALSE;  
   
 /* If the first character is a parenthesis, check on the type of group we are  
 dealing with. The very first call may not start with a parenthesis. */  
   
 if (ptr[0] == CHAR_LEFT_PARENTHESIS)  
1631    {    {
1632    /* Handle specials such as (*SKIP) or (*UTF8) etc. */    switch ((int)*code)
   
   if (ptr[1] == CHAR_ASTERISK) ptr += 2;  
   
   /* Handle a normal, unnamed capturing parenthesis. */  
   
   else if (ptr[1] != CHAR_QUESTION_MARK)  
1633      {      {
1634      *count += 1;      case OP_ASSERT_NOT:
1635      if (name == NULL && *count == lorn) return *count;      case OP_ASSERTBACK:
1636      ptr++;      case OP_ASSERTBACK_NOT:
1637      }      if (!skipassert) return code;
1638        do code += GET(code, 1); while (*code == OP_ALT);
1639    /* All cases now have (? at the start. Remember when we are in a group      code += PRIV(OP_lengths)[*code];
1640    where the parenthesis numbers are duplicated. */      break;
1641    
1642    else if (ptr[2] == CHAR_VERTICAL_LINE)      case OP_WORD_BOUNDARY:
1643      {      case OP_NOT_WORD_BOUNDARY:
1644      ptr += 3;      if (!skipassert) return code;
1645      dup_parens = TRUE;      /* Fall through */
     }  
1646    
1647    /* Handle comments; all characters are allowed until a ket is reached. */      case OP_CALLOUT:
1648        case OP_CREF:
1649        case OP_DNCREF:
1650        case OP_RREF:
1651        case OP_DNRREF:
1652        case OP_DEF:
1653        code += PRIV(OP_lengths)[*code];
1654        break;
1655    
1656    else if (ptr[2] == CHAR_NUMBER_SIGN)      default:
1657      {      return code;
     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;  
     goto FAIL_EXIT;  
1658      }      }
1659      }
1660    /* Control never reaches here */
1661    }
1662    
   /* Handle a condition. If it is an assertion, just carry on so that it  
   is processed as normal. If not, skip to the closing parenthesis of the  
   condition (there can't be any nested parens). */  
1663    
   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)  
     {  
     ptr += 2;  
     if (ptr[1] != CHAR_QUESTION_MARK)  
       {  
       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
       if (*ptr != 0) ptr++;  
       }  
     }  
1664    
1665    /* Start with (? but not a condition. */  /*************************************************
1666    *        Find the fixed length of a branch       *
1667    *************************************************/
1668    
1669    else  /* Scan a branch and compute the fixed length of subject that will match it,
1670      {  if the length is fixed. This is needed for dealing with backward assertions.
1671      ptr += 2;  In UTF8 mode, the result is in characters rather than bytes. The branch is
1672      if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */  temporarily terminated with OP_END when this function is called.
1673    
1674      /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */  This function is called when a backward assertion is encountered, so that if it
1675    fails, the error message can point to the correct place in the pattern.
1676      if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&  However, we cannot do this when the assertion contains subroutine calls,
1677          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)  because they can be forward references. We solve this by remembering this case
1678        {  and doing the check at the end; a flag specifies which mode we are running in.
       int term;  
       const uschar *thisname;  
       *count += 1;  
       if (name == NULL && *count == lorn) return *count;  
       term = *ptr++;  
       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;  
       thisname = ptr;  
       while (*ptr != term) ptr++;  
       if (name != NULL && lorn == ptr - thisname &&  
           strncmp((const char *)name, (const char *)thisname, lorn) == 0)  
         return *count;  
       term++;  
       }  
     }  
   }  
   
 /* Past any initial parenthesis handling, scan for parentheses or vertical  
 bars. Stop if we get to cd->end_pattern. Note that this is important for the  
 first-pass call when this value is temporarily adjusted to stop at the current  
 position. So DO NOT change this to a test for binary zero. */  
   
 for (; ptr < cd->end_pattern; ptr++)  
   {  
   /* Skip over backslashed characters and also entire \Q...\E */  
   
   if (*ptr == CHAR_BACKSLASH)  
     {  
     if (*(++ptr) == 0) goto FAIL_EXIT;  
     if (*ptr == CHAR_Q) for (;;)  
       {  
       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};  
       if (*ptr == 0) goto FAIL_EXIT;  
       if (*(++ptr) == CHAR_E) break;  
       }  
     continue;  
     }  
   
   /* Skip over character classes; this logic must be similar to the way they  
   are handled for real. If the first character is '^', skip it. Also, if the  
   first few characters (either before or after ^) are \Q\E or \E we skip them  
   too. This makes for compatibility with Perl. Note the use of STR macros to  
   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */  
   
   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)  
     {  
     BOOL negate_class = FALSE;  
     for (;;)  
       {  
       if (ptr[1] == CHAR_BACKSLASH)  
         {  
         if (ptr[2] == CHAR_E)  
           ptr+= 2;  
         else if (strncmp((const char *)ptr+2,  
                  STR_Q STR_BACKSLASH STR_E, 3) == 0)  
           ptr += 4;  
         else  
           break;  
         }  
       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)  
         {  
         negate_class = TRUE;  
         ptr++;  
         }  
       else break;  
       }  
   
     /* If the next character is ']', it is a data character that must be  
     skipped, except in JavaScript compatibility mode. */  
   
     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&  
         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)  
       ptr++;  
   
     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)  
       {  
       if (*ptr == 0) return -1;  
       if (*ptr == CHAR_BACKSLASH)  
         {  
         if (*(++ptr) == 0) goto FAIL_EXIT;  
         if (*ptr == CHAR_Q) for (;;)  
           {  
           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};  
           if (*ptr == 0) goto FAIL_EXIT;  
           if (*(++ptr) == CHAR_E) break;  
           }  
         continue;  
         }  
       }  
     continue;  
     }  
   
   /* Skip comments in /x mode */  
   
   if (xmode && *ptr == CHAR_NUMBER_SIGN)  
     {  
     ptr++;  
     while (*ptr != 0)  
       {  
       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }  
       ptr++;  
 #ifdef SUPPORT_UTF8  
       if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;  
 #endif  
       }  
     if (*ptr == 0) goto FAIL_EXIT;  
     continue;  
     }  
   
   /* Check for the special metacharacters */  
   
   if (*ptr == CHAR_LEFT_PARENTHESIS)  
     {  
     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);  
     if (rc > 0) return rc;  
     if (*ptr == 0) goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_RIGHT_PARENTHESIS)  
     {  
     if (dup_parens && *count < hwm_count) *count = hwm_count;  
     goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)  
     {  
     if (*count > hwm_count) hwm_count = *count;  
     *count = start_count;  
     }  
   }  
   
 FAIL_EXIT:  
 *ptrptr = ptr;  
 return -1;  
 }  
   
   
   
   
 /*************************************************  
 *       Find forward referenced subpattern       *  
 *************************************************/  
   
 /* This function scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. This is used for forward  
 references to subpatterns. We used to be able to start this scan from the  
 current compiling point, using the current count value from cd->bracount, and  
 do it all in a single loop, but the addition of the possibility of duplicate  
 subpattern numbers means that we have to scan from the very start, in order to  
 take account of such duplicates, and to use a recursive function to keep track  
 of the different types of group.  
   
 Arguments:  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf8         TRUE if we are in UTF-8 mode  
   
 Returns:       the number of the found subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,  
   BOOL utf8)  
 {  
 uschar *ptr = (uschar *)cd->start_pattern;  
 int count = 0;  
 int rc;  
   
 /* If the pattern does not start with an opening parenthesis, the first call  
 to find_parens_sub() will scan right to the end (if necessary). However, if it  
 does start with a parenthesis, find_parens_sub() will return when it hits the  
 matching closing parens. That is why we have to have a loop. */  
   
 for (;;)  
   {  
   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);  
   if (rc > 0 || *ptr++ == 0) break;  
   }  
   
 return rc;  
 }  
   
   
   
   
 /*************************************************  
 *      Find first significant op code            *  
 *************************************************/  
   
 /* This is called by several functions that scan a compiled expression looking  
 for a fixed first character, or an anchoring op code etc. It skips over things  
 that do not influence this. For some calls, it makes sense to skip negative  
 forward and all backward assertions, and also the \b assertion; for others it  
 does not.  
   
 Arguments:  
   code         pointer to the start of the group  
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
   skipassert   TRUE if certain assertions are to be skipped  
   
 Returns:       pointer to the first significant opcode  
 */  
   
 static const uschar*  
 first_significant_code(const uschar *code, int *options, int optbit,  
   BOOL skipassert)  
 {  
 for (;;)  
   {  
   switch ((int)*code)  
     {  
     case OP_ASSERT_NOT:  
     case OP_ASSERTBACK:  
     case OP_ASSERTBACK_NOT:  
     if (!skipassert) return code;  
     do code += GET(code, 1); while (*code == OP_ALT);  
     code += _pcre_OP_lengths[*code];  
     break;  
   
     case OP_WORD_BOUNDARY:  
     case OP_NOT_WORD_BOUNDARY:  
     if (!skipassert) return code;  
     /* Fall through */  
   
     case OP_CALLOUT:  
     case OP_CREF:  
     case OP_NCREF:  
     case OP_RREF:  
     case OP_NRREF:  
     case OP_DEF:  
     code += _pcre_OP_lengths[*code];  
     break;  
   
     default:  
     return code;  
     }  
   }  
 /* Control never reaches here */  
 }  
   
   
   
   
 /*************************************************  
 *        Find the fixed length of a branch       *  
 *************************************************/  
   
 /* Scan a branch and compute the fixed length of subject that will match it,  
 if the length is fixed. This is needed for dealing with backward assertions.  
 In UTF8 mode, the result is in characters rather than bytes. The branch is  
 temporarily terminated with OP_END when this function is called.  
   
 This function is called when a backward assertion is encountered, so that if it  
 fails, the error message can point to the correct place in the pattern.  
 However, we cannot do this when the assertion contains subroutine calls,  
 because they can be forward references. We solve this by remembering this case  
 and doing the check at the end; a flag specifies which mode we are running in.  
1679    
1680  Arguments:  Arguments:
1681    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1682    options  the compiling options    utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1683    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1684    cd       the "compile data" structure    cd       the "compile data" structure
1685    
1686  Returns:   the fixed length,  Returns:   the fixed length,
1687               or -1 if there is no fixed length,               or -1 if there is no fixed length,
1688               or -2 if \C was encountered               or -2 if \C was encountered (in UTF-8 mode only)
1689               or -3 if an OP_RECURSE item was encountered and atend is FALSE               or -3 if an OP_RECURSE item was encountered and atend is FALSE
1690                 or -4 if an unknown opcode was encountered (internal error)
1691  */  */
1692    
1693  static int  static int
1694  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1695  {  {
1696  int length = -1;  int length = -1;
1697    
1698  register int branchlength = 0;  register int branchlength = 0;
1699  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1700    
1701  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1702  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1492  branch, check the length against that of Line 1704  branch, check the length against that of
1704  for (;;)  for (;;)
1705    {    {
1706    int d;    int d;
1707    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1708    register int op = *cc;    register pcre_uchar op = *cc;
1709    
1710    switch (op)    switch (op)
1711      {      {
1712        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1713        OP_BRA (normal non-capturing bracket) because the other variants of these
1714        opcodes are all concerned with unlimited repeated groups, which of course
1715        are not of fixed length. */
1716    
1717      case OP_CBRA:      case OP_CBRA:
1718      case OP_BRA:      case OP_BRA:
1719      case OP_ONCE:      case OP_ONCE:
1720        case OP_ONCE_NC:
1721      case OP_COND:      case OP_COND:
1722      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1723      if (d < 0) return d;      if (d < 0) return d;
1724      branchlength += d;      branchlength += d;
1725      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1726      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1727      break;      break;
1728    
1729      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1730      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1731      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1732        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1733        because they all imply an unlimited repeat. */
1734    
1735      case OP_ALT:      case OP_ALT:
1736      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1737      case OP_END:      case OP_END:
1738        case OP_ACCEPT:
1739        case OP_ASSERT_ACCEPT:
1740      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1741        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1742      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1529  for (;;) Line 1750  for (;;)
1750    
1751      case OP_RECURSE:      case OP_RECURSE:
1752      if (!atend) return -3;      if (!atend) return -3;
1753      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1754      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1755      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1756      d = find_fixedlength(cs + 2, options, atend, cd);      d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1757      if (d < 0) return d;      if (d < 0) return d;
1758      branchlength += d;      branchlength += d;
1759      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1545  for (;;) Line 1766  for (;;)
1766      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1767      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1768      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1769      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1770        break;
1771    
1772      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1773    
1774      case OP_REVERSE:      case OP_MARK:
1775      case OP_CREF:      case OP_PRUNE_ARG:
1776      case OP_NCREF:      case OP_SKIP_ARG:
1777      case OP_RREF:      case OP_THEN_ARG:
1778      case OP_NRREF:      cc += cc[1] + PRIV(OP_lengths)[*cc];
1779      case OP_DEF:      break;
1780    
1781      case OP_CALLOUT:      case OP_CALLOUT:
     case OP_SOD:  
     case OP_SOM:  
     case OP_SET_SOM:  
     case OP_EOD:  
     case OP_EODN:  
1782      case OP_CIRC:      case OP_CIRC:
1783      case OP_CIRCM:      case OP_CIRCM:
1784        case OP_CLOSE:
1785        case OP_COMMIT:
1786        case OP_CREF:
1787        case OP_DEF:
1788        case OP_DNCREF:
1789        case OP_DNRREF:
1790      case OP_DOLL:      case OP_DOLL:
1791      case OP_DOLLM:      case OP_DOLLM:
1792        case OP_EOD:
1793        case OP_EODN:
1794        case OP_FAIL:
1795      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1796        case OP_PRUNE:
1797        case OP_REVERSE:
1798        case OP_RREF:
1799        case OP_SET_SOM:
1800        case OP_SKIP:
1801        case OP_SOD:
1802        case OP_SOM:
1803        case OP_THEN:
1804      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1805      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1806      break;      break;
1807    
1808      /* Handle literal characters */      /* Handle literal characters */
# Line 1575  for (;;) Line 1810  for (;;)
1810      case OP_CHAR:      case OP_CHAR:
1811      case OP_CHARI:      case OP_CHARI:
1812      case OP_NOT:      case OP_NOT:
1813      case OP_NOTI:      case OP_NOTI:
1814      branchlength++;      branchlength++;
1815      cc += 2;      cc += 2;
1816  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1817      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1818  #endif  #endif
1819      break;      break;
1820    
# Line 1588  for (;;) Line 1822  for (;;)
1822      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1823    
1824      case OP_EXACT:      case OP_EXACT:
1825      branchlength += GET2(cc,1);      case OP_EXACTI:
1826      cc += 4;      case OP_NOTEXACT:
1827  #ifdef SUPPORT_UTF8      case OP_NOTEXACTI:
1828      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      branchlength += (int)GET2(cc,1);
1829        cc += _pcre_utf8_table4[cc[-1] & 0x3f];      cc += 2 + IMM2_SIZE;
1830    #ifdef SUPPORT_UTF
1831        if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1832  #endif  #endif
1833      break;      break;
1834    
1835      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1836      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1837      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1838      cc += 4;        cc += 2;
1839        cc += 1 + IMM2_SIZE + 1;
1840      break;      break;
1841    
1842      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1609  for (;;) Line 1846  for (;;)
1846      cc += 2;      cc += 2;
1847      /* Fall through */      /* Fall through */
1848    
1849        case OP_HSPACE:
1850        case OP_VSPACE:
1851        case OP_NOT_HSPACE:
1852        case OP_NOT_VSPACE:
1853      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1854      case OP_DIGIT:      case OP_DIGIT:
1855      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1621  for (;;) Line 1862  for (;;)
1862      cc++;      cc++;
1863      break;      break;
1864    
1865      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1866        otherwise \C is coded as OP_ALLANY. */
1867    
1868      case OP_ANYBYTE:      case OP_ANYBYTE:
1869      return -2;      return -2;
1870    
1871      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1872    
 #ifdef SUPPORT_UTF8  
     case OP_XCLASS:  
     cc += GET(cc, 1) - 33;  
     /* Fall through */  
 #endif  
   
1873      case OP_CLASS:      case OP_CLASS:
1874      case OP_NCLASS:      case OP_NCLASS:
1875      cc += 33;  #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1876        case OP_XCLASS:
1877        /* The original code caused an unsigned overflow in 64 bit systems,
1878        so now we use a conditional statement. */
1879        if (op == OP_XCLASS)
1880          cc += GET(cc, 1);
1881        else
1882          cc += PRIV(OP_lengths)[OP_CLASS];
1883    #else
1884        cc += PRIV(OP_lengths)[OP_CLASS];
1885    #endif
1886    
1887      switch (*cc)      switch (*cc)
1888        {        {
1889        case OP_CRSTAR:        case OP_CRSTAR:
1890        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1891          case OP_CRPLUS:
1892          case OP_CRMINPLUS:
1893        case OP_CRQUERY:        case OP_CRQUERY:
1894        case OP_CRMINQUERY:        case OP_CRMINQUERY:
1895          case OP_CRPOSSTAR:
1896          case OP_CRPOSPLUS:
1897          case OP_CRPOSQUERY:
1898        return -1;        return -1;
1899    
1900        case OP_CRRANGE:        case OP_CRRANGE:
1901        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1902        if (GET2(cc,1) != GET2(cc,3)) return -1;        case OP_CRPOSRANGE:
1903        branchlength += GET2(cc,1);        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1904        cc += 5;        branchlength += (int)GET2(cc,1);
1905          cc += 1 + 2 * IMM2_SIZE;
1906        break;        break;
1907    
1908        default:        default:
# Line 1660  for (;;) Line 1912  for (;;)
1912    
1913      /* Anything else is variable length */      /* Anything else is variable length */
1914    
1915      default:      case OP_ANYNL:
1916        case OP_BRAMINZERO:
1917        case OP_BRAPOS:
1918        case OP_BRAPOSZERO:
1919        case OP_BRAZERO:
1920        case OP_CBRAPOS:
1921        case OP_EXTUNI:
1922        case OP_KETRMAX:
1923        case OP_KETRMIN:
1924        case OP_KETRPOS:
1925        case OP_MINPLUS:
1926        case OP_MINPLUSI:
1927        case OP_MINQUERY:
1928        case OP_MINQUERYI:
1929        case OP_MINSTAR:
1930        case OP_MINSTARI:
1931        case OP_MINUPTO:
1932        case OP_MINUPTOI:
1933        case OP_NOTMINPLUS:
1934        case OP_NOTMINPLUSI:
1935        case OP_NOTMINQUERY:
1936        case OP_NOTMINQUERYI:
1937        case OP_NOTMINSTAR:
1938        case OP_NOTMINSTARI:
1939        case OP_NOTMINUPTO:
1940        case OP_NOTMINUPTOI:
1941        case OP_NOTPLUS:
1942        case OP_NOTPLUSI:
1943        case OP_NOTPOSPLUS:
1944        case OP_NOTPOSPLUSI:
1945        case OP_NOTPOSQUERY:
1946        case OP_NOTPOSQUERYI:
1947        case OP_NOTPOSSTAR:
1948        case OP_NOTPOSSTARI:
1949        case OP_NOTPOSUPTO:
1950        case OP_NOTPOSUPTOI:
1951        case OP_NOTQUERY:
1952        case OP_NOTQUERYI:
1953        case OP_NOTSTAR:
1954        case OP_NOTSTARI:
1955        case OP_NOTUPTO:
1956        case OP_NOTUPTOI:
1957        case OP_PLUS:
1958        case OP_PLUSI:
1959        case OP_POSPLUS:
1960        case OP_POSPLUSI:
1961        case OP_POSQUERY:
1962        case OP_POSQUERYI:
1963        case OP_POSSTAR:
1964        case OP_POSSTARI:
1965        case OP_POSUPTO:
1966        case OP_POSUPTOI:
1967        case OP_QUERY:
1968        case OP_QUERYI:
1969        case OP_REF:
1970        case OP_REFI:
1971        case OP_DNREF:
1972        case OP_DNREFI:
1973        case OP_SBRA:
1974        case OP_SBRAPOS:
1975        case OP_SCBRA:
1976        case OP_SCBRAPOS:
1977        case OP_SCOND:
1978        case OP_SKIPZERO:
1979        case OP_STAR:
1980        case OP_STARI:
1981        case OP_TYPEMINPLUS:
1982        case OP_TYPEMINQUERY:
1983        case OP_TYPEMINSTAR:
1984        case OP_TYPEMINUPTO:
1985        case OP_TYPEPLUS:
1986        case OP_TYPEPOSPLUS:
1987        case OP_TYPEPOSQUERY:
1988        case OP_TYPEPOSSTAR:
1989        case OP_TYPEPOSUPTO:
1990        case OP_TYPEQUERY:
1991        case OP_TYPESTAR:
1992        case OP_TYPEUPTO:
1993        case OP_UPTO:
1994        case OP_UPTOI:
1995      return -1;      return -1;
1996    
1997        /* Catch unrecognized opcodes so that when new ones are added they
1998        are not forgotten, as has happened in the past. */
1999    
2000        default:
2001        return -4;
2002      }      }
2003    }    }
2004  /* Control never gets here */  /* Control never gets here */
# Line 1669  for (;;) Line 2006  for (;;)
2006    
2007    
2008    
   
2009  /*************************************************  /*************************************************
2010  *    Scan compiled regex for specific bracket    *  *    Scan compiled regex for specific bracket    *
2011  *************************************************/  *************************************************/
# Line 1682  length. Line 2018  length.
2018    
2019  Arguments:  Arguments:
2020    code        points to start of expression    code        points to start of expression
2021    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2022    number      the required bracket number or negative to find a lookbehind    number      the required bracket number or negative to find a lookbehind
2023    
2024  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2025  */  */
2026    
2027  const uschar *  const pcre_uchar *
2028  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2029  {  {
2030  for (;;)  for (;;)
2031    {    {
2032    register int c = *code;    register pcre_uchar c = *code;
2033    
2034    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
2035    
2036    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1706  for (;;) Line 2043  for (;;)
2043    
2044    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
2045      {      {
2046      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
2047      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2048      }      }
2049    
2050    /* Handle capturing bracket */    /* Handle capturing bracket */
2051    
2052    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
2053               c == OP_CBRAPOS || c == OP_SCBRAPOS)
2054      {      {
2055      int n = GET2(code, 1+LINK_SIZE);      int n = (int)GET2(code, 1+LINK_SIZE);
2056      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2057      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2058      }      }
2059    
2060    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
# Line 1744  for (;;) Line 2082  for (;;)
2082        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2083        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2084        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2085        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2086            code += 2;
2087        break;        break;
2088    
2089        case OP_MARK:        case OP_MARK:
2090        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
2091        case OP_SKIP_ARG:        case OP_SKIP_ARG:
       code += code[1];  
       break;  
   
2092        case OP_THEN_ARG:        case OP_THEN_ARG:
2093        code += code[1+LINK_SIZE];        code += code[1];
2094        break;        break;
2095        }        }
2096    
2097      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2098    
2099      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2100    
2101    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2102    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2103    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2104    
2105  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2106      if (utf8) switch(c)      if (utf) switch(c)
2107        {        {
2108        case OP_CHAR:        case OP_CHAR:
2109        case OP_CHARI:        case OP_CHARI:
# Line 1797  for (;;) Line 2133  for (;;)
2133        case OP_MINQUERYI:        case OP_MINQUERYI:
2134        case OP_POSQUERY:        case OP_POSQUERY:
2135        case OP_POSQUERYI:        case OP_POSQUERYI:
2136        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2137        break;        break;
2138        }        }
2139  #else  #else
2140      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2141  #endif  #endif
2142      }      }
2143    }    }
# Line 1818  instance of OP_RECURSE. Line 2154  instance of OP_RECURSE.
2154    
2155  Arguments:  Arguments:
2156    code        points to start of expression    code        points to start of expression
2157    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2158    
2159  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2160  */  */
2161    
2162  static const uschar *  static const pcre_uchar *
2163  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2164  {  {
2165  for (;;)  for (;;)
2166    {    {
2167    register int c = *code;    register pcre_uchar c = *code;
2168    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
2169    if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
2170    
# Line 1863  for (;;) Line 2199  for (;;)
2199        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2200        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2201        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2202        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2203            code += 2;
2204        break;        break;
2205    
2206        case OP_MARK:        case OP_MARK:
2207        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
2208        case OP_SKIP_ARG:        case OP_SKIP_ARG:
       code += code[1];  
       break;  
   
2209        case OP_THEN_ARG:        case OP_THEN_ARG:
2210        code += code[1+LINK_SIZE];        code += code[1];
2211        break;        break;
2212        }        }
2213    
2214      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2215    
2216      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2217    
2218      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2219      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
2220      to arrange to skip the extra bytes. */      to arrange to skip the extra bytes. */
2221    
2222  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2223      if (utf8) switch(c)      if (utf) switch(c)
2224        {        {
2225        case OP_CHAR:        case OP_CHAR:
2226        case OP_CHARI:        case OP_CHARI:
2227          case OP_NOT:
2228          case OP_NOTI:
2229        case OP_EXACT:        case OP_EXACT:
2230        case OP_EXACTI:        case OP_EXACTI:
2231          case OP_NOTEXACT:
2232          case OP_NOTEXACTI:
2233        case OP_UPTO:        case OP_UPTO:
2234        case OP_UPTOI:        case OP_UPTOI:
2235          case OP_NOTUPTO:
2236          case OP_NOTUPTOI:
2237        case OP_MINUPTO:        case OP_MINUPTO:
2238        case OP_MINUPTOI:        case OP_MINUPTOI:
2239          case OP_NOTMINUPTO:
2240          case OP_NOTMINUPTOI:
2241        case OP_POSUPTO:        case OP_POSUPTO:
2242        case OP_POSUPTOI:        case OP_POSUPTOI:
2243          case OP_NOTPOSUPTO:
2244          case OP_NOTPOSUPTOI:
2245        case OP_STAR:        case OP_STAR:
2246        case OP_STARI:        case OP_STARI:
2247          case OP_NOTSTAR:
2248          case OP_NOTSTARI:
2249        case OP_MINSTAR:        case OP_MINSTAR:
2250        case OP_MINSTARI:        case OP_MINSTARI:
2251          case OP_NOTMINSTAR:
2252          case OP_NOTMINSTARI:
2253        case OP_POSSTAR:        case OP_POSSTAR:
2254        case OP_POSSTARI:        case OP_POSSTARI:
2255          case OP_NOTPOSSTAR:
2256          case OP_NOTPOSSTARI:
2257        case OP_PLUS:        case OP_PLUS:
2258        case OP_PLUSI:        case OP_PLUSI:
2259          case OP_NOTPLUS:
2260          case OP_NOTPLUSI:
2261        case OP_MINPLUS:        case OP_MINPLUS:
2262        case OP_MINPLUSI:        case OP_MINPLUSI:
2263          case OP_NOTMINPLUS:
2264          case OP_NOTMINPLUSI:
2265        case OP_POSPLUS:        case OP_POSPLUS:
2266        case OP_POSPLUSI:        case OP_POSPLUSI:
2267          case OP_NOTPOSPLUS:
2268          case OP_NOTPOSPLUSI:
2269        case OP_QUERY:        case OP_QUERY:
2270        case OP_QUERYI:        case OP_QUERYI:
2271          case OP_NOTQUERY:
2272          case OP_NOTQUERYI:
2273        case OP_MINQUERY:        case OP_MINQUERY:
2274        case OP_MINQUERYI:        case OP_MINQUERYI:
2275          case OP_NOTMINQUERY:
2276          case OP_NOTMINQUERYI:
2277        case OP_POSQUERY:        case OP_POSQUERY:
2278        case OP_POSQUERYI:        case OP_POSQUERYI:
2279        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_NOTPOSQUERY:
2280          case OP_NOTPOSQUERYI:
2281          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2282        break;        break;
2283        }        }
2284  #else  #else
2285      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2286  #endif  #endif
2287      }      }
2288    }    }
# Line 1943  bracket whose current branch will alread Line 2305  bracket whose current branch will alread
2305  Arguments:  Arguments:
2306    code        points to start of search    code        points to start of search
2307    endcode     points to where to stop    endcode     points to where to stop
2308    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2309    cd          contains pointers to tables etc.    cd          contains pointers to tables etc.
2310      recurses    chain of recurse_check to catch mutual recursion
2311    
2312  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2313  */  */
2314    
2315    typedef struct recurse_check {
2316      struct recurse_check *prev;
2317      const pcre_uchar *group;
2318    } recurse_check;
2319    
2320  static BOOL  static BOOL
2321  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2322    compile_data *cd)    BOOL utf, compile_data *cd, recurse_check *recurses)
2323  {  {
2324  register int c;  register pcre_uchar c;
2325  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  recurse_check this_recurse;
2326    
2327    for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2328       code < endcode;       code < endcode;
2329       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2330    {    {
2331    const uschar *ccode;    const pcre_uchar *ccode;
2332    
2333    c = *code;    c = *code;
2334    
# Line 1972  for (code = first_significant_code(code Line 2342  for (code = first_significant_code(code
2342      continue;      continue;
2343      }      }
2344    
2345    /* Groups with zero repeats can of course be empty; skip them. */    /* For a recursion/subroutine call, if its end has been reached, which
2346      implies a backward reference subroutine call, we can scan it. If it's a
2347      forward reference subroutine call, we can't. To detect forward reference
2348      we have to scan up the list that is kept in the workspace. This function is
2349      called only when doing the real compile, not during the pre-compile that
2350      measures the size of the compiled pattern. */
2351    
2352    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)    if (c == OP_RECURSE)
2353      {      {
2354      code += _pcre_OP_lengths[c];      const pcre_uchar *scode = cd->start_code + GET(code, 1);
2355      do code += GET(code, 1); while (*code == OP_ALT);      BOOL empty_branch;
     c = *code;  
     continue;  
     }  
2356    
2357    /* For a recursion/subroutine call, if its end has been reached, which      /* Test for forward reference or uncompleted reference. This is disabled
2358    implies a subroutine call, we can scan it. */      when called to scan a completed pattern by setting cd->start_workspace to
2359        NULL. */
2360    
2361        if (cd->start_workspace != NULL)
2362          {
2363          const pcre_uchar *tcode;
2364          for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2365            if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2366          if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2367          }
2368    
2369        /* If we are scanning a completed pattern, there are no forward references
2370        and all groups are complete. We need to detect whether this is a recursive
2371        call, as otherwise there will be an infinite loop. If it is a recursion,
2372        just skip over it. Simple recursions are easily detected. For mutual
2373        recursions we keep a chain on the stack. */
2374    
2375        else
2376          {
2377          recurse_check *r = recurses;
2378          const pcre_uchar *endgroup = scode;
2379    
2380          do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2381          if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2382    
2383          for (r = recurses; r != NULL; r = r->prev)
2384            if (r->group == scode) break;
2385          if (r != NULL) continue;   /* Mutual recursion */
2386          }
2387    
2388        /* Completed reference; scan the referenced group, remembering it on the
2389        stack chain to detect mutual recursions. */
2390    
2391        empty_branch = FALSE;
2392        this_recurse.prev = recurses;
2393        this_recurse.group = scode;
2394    
   if (c == OP_RECURSE)  
     {  
     BOOL empty_branch = FALSE;  
     const uschar *scode = cd->start_code + GET(code, 1);  
     if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */  
2395      do      do
2396        {        {
2397        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2398          {          {
2399          empty_branch = TRUE;          empty_branch = TRUE;
2400          break;          break;
# Line 2000  for (code = first_significant_code(code Line 2402  for (code = first_significant_code(code
2402        scode += GET(scode, 1);        scode += GET(scode, 1);
2403        }        }
2404      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2405    
2406      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2407      continue;      continue;
2408      }      }
2409    
2410      /* Groups with zero repeats can of course be empty; skip them. */
2411    
2412      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2413          c == OP_BRAPOSZERO)
2414        {
2415        code += PRIV(OP_lengths)[c];
2416        do code += GET(code, 1); while (*code == OP_ALT);
2417        c = *code;
2418        continue;
2419        }
2420    
2421      /* A nested group that is already marked as "could be empty" can just be
2422      skipped. */
2423    
2424      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2425          c == OP_SCBRA || c == OP_SCBRAPOS)
2426        {
2427        do code += GET(code, 1); while (*code == OP_ALT);
2428        c = *code;
2429        continue;
2430        }
2431    
2432    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2433    
2434    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2435          c == OP_CBRA || c == OP_CBRAPOS ||
2436          c == OP_ONCE || c == OP_ONCE_NC ||
2437          c == OP_COND)
2438      {      {
2439      BOOL empty_branch;      BOOL empty_branch;
2440      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 2022  for (code = first_significant_code(code Line 2450  for (code = first_significant_code(code
2450        empty_branch = FALSE;        empty_branch = FALSE;
2451        do        do
2452          {          {
2453          if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2454            empty_branch = TRUE;            empty_branch = TRUE;
2455          code += GET(code, 1);          code += GET(code, 1);
2456          }          }
# Line 2040  for (code = first_significant_code(code Line 2468  for (code = first_significant_code(code
2468      {      {
2469      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2470      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2471      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2472      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2473      here. */      here. */
2474    
2475  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2476      case OP_XCLASS:      case OP_XCLASS:
2477      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2478      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 2052  for (code = first_significant_code(code Line 2480  for (code = first_significant_code(code
2480    
2481      case OP_CLASS:      case OP_CLASS:
2482      case OP_NCLASS:      case OP_NCLASS:
2483      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2484    
2485  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2486      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2487  #endif  #endif
2488    
# Line 2064  for (code = first_significant_code(code Line 2492  for (code = first_significant_code(code
2492        case OP_CRMINSTAR:        case OP_CRMINSTAR:
2493        case OP_CRQUERY:        case OP_CRQUERY:
2494        case OP_CRMINQUERY:        case OP_CRMINQUERY:
2495          case OP_CRPOSSTAR:
2496          case OP_CRPOSQUERY:
2497        break;        break;
2498    
2499        default:                   /* Non-repeat => class must match */        default:                   /* Non-repeat => class must match */
2500        case OP_CRPLUS:            /* These repeats aren't empty */        case OP_CRPLUS:            /* These repeats aren't empty */
2501        case OP_CRMINPLUS:        case OP_CRMINPLUS:
2502          case OP_CRPOSPLUS:
2503        return FALSE;        return FALSE;
2504    
2505        case OP_CRRANGE:        case OP_CRRANGE:
2506        case OP_CRMINRANGE:        case OP_CRMINRANGE:
2507          case OP_CRPOSRANGE:
2508        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2509        break;        break;
2510        }        }
# Line 2080  for (code = first_significant_code(code Line 2512  for (code = first_significant_code(code
2512    
2513      /* Opcodes that must match a character */      /* Opcodes that must match a character */
2514    
2515        case OP_ANY:
2516        case OP_ALLANY:
2517        case OP_ANYBYTE:
2518    
2519      case OP_PROP:      case OP_PROP:
2520      case OP_NOTPROP:      case OP_NOTPROP:
2521        case OP_ANYNL:
2522    
2523        case OP_NOT_HSPACE:
2524        case OP_HSPACE:
2525        case OP_NOT_VSPACE:
2526        case OP_VSPACE:
2527      case OP_EXTUNI:      case OP_EXTUNI:
2528    
2529      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
2530      case OP_DIGIT:      case OP_DIGIT:
2531      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
2532      case OP_WHITESPACE:      case OP_WHITESPACE:
2533      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2534      case OP_WORDCHAR:      case OP_WORDCHAR:
2535      case OP_ANY:  
     case OP_ALLANY:  
     case OP_ANYBYTE:  
2536      case OP_CHAR:      case OP_CHAR:
2537      case OP_CHARI:      case OP_CHARI:
2538      case OP_NOT:      case OP_NOT:
2539      case OP_NOTI:      case OP_NOTI:
2540    
2541      case OP_PLUS:      case OP_PLUS:
2542        case OP_PLUSI:
2543      case OP_MINPLUS:      case OP_MINPLUS:
2544      case OP_POSPLUS:      case OP_MINPLUSI:
2545      case OP_EXACT:  
2546      case OP_NOTPLUS:      case OP_NOTPLUS:
2547        case OP_NOTPLUSI:
2548      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
2549        case OP_NOTMINPLUSI:
2550    
2551        case OP_POSPLUS:
2552        case OP_POSPLUSI:
2553      case OP_NOTPOSPLUS:      case OP_NOTPOSPLUS:
2554        case OP_NOTPOSPLUSI:
2555    
2556        case OP_EXACT:
2557        case OP_EXACTI:
2558      case OP_NOTEXACT:      case OP_NOTEXACT:
2559        case OP_NOTEXACTI:
2560    
2561      case OP_TYPEPLUS:      case OP_TYPEPLUS:
2562      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
2563      case OP_TYPEPOSPLUS:      case OP_TYPEPOSPLUS:
2564      case OP_TYPEEXACT:      case OP_TYPEEXACT:
2565    
2566      return FALSE;      return FALSE;
2567    
2568      /* These are going to continue, as they may be empty, but we have to      /* These are going to continue, as they may be empty, but we have to
# Line 2127  for (code = first_significant_code(code Line 2582  for (code = first_significant_code(code
2582      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2583      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2584      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2585      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2586          code += 2;
2587      break;      break;
2588    
2589      /* End of branch */      /* End of branch */
# Line 2135  for (code = first_significant_code(code Line 2591  for (code = first_significant_code(code
2591      case OP_KET:      case OP_KET:
2592      case OP_KETRMAX:      case OP_KETRMAX:
2593      case OP_KETRMIN:      case OP_KETRMIN:
2594        case OP_KETRPOS:
2595      case OP_ALT:      case OP_ALT:
2596      return TRUE;      return TRUE;
2597    
2598      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2599      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO and their caseless and negative versions may be
2600        followed by a multibyte character. */
2601    
2602  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2603      case OP_STAR:      case OP_STAR:
2604      case OP_STARI:      case OP_STARI:
2605        case OP_NOTSTAR:
2606        case OP_NOTSTARI:
2607    
2608      case OP_MINSTAR:      case OP_MINSTAR:
2609      case OP_MINSTARI:      case OP_MINSTARI:
2610        case OP_NOTMINSTAR:
2611        case OP_NOTMINSTARI:
2612    
2613      case OP_POSSTAR:      case OP_POSSTAR:
2614      case OP_POSSTARI:      case OP_POSSTARI:
2615        case OP_NOTPOSSTAR:
2616        case OP_NOTPOSSTARI:
2617    
2618      case OP_QUERY:      case OP_QUERY:
2619      case OP_QUERYI:      case OP_QUERYI:
2620        case OP_NOTQUERY:
2621        case OP_NOTQUERYI:
2622    
2623      case OP_MINQUERY:      case OP_MINQUERY:
2624      case OP_MINQUERYI:      case OP_MINQUERYI:
2625        case OP_NOTMINQUERY:
2626        case OP_NOTMINQUERYI:
2627    
2628      case OP_POSQUERY:      case OP_POSQUERY:
2629      case OP_POSQUERYI:      case OP_POSQUERYI:
2630      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      case OP_NOTPOSQUERY:
2631        case OP_NOTPOSQUERYI:
2632    
2633        if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2634      break;      break;
2635    
2636      case OP_UPTO:      case OP_UPTO:
2637      case OP_UPTOI:      case OP_UPTOI:
2638        case OP_NOTUPTO:
2639        case OP_NOTUPTOI:
2640    
2641      case OP_MINUPTO:      case OP_MINUPTO:
2642      case OP_MINUPTOI:      case OP_MINUPTOI:
2643        case OP_NOTMINUPTO:
2644        case OP_NOTMINUPTOI:
2645    
2646      case OP_POSUPTO:      case OP_POSUPTO:
2647      case OP_POSUPTOI:      case OP_POSUPTOI:
2648      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      case OP_NOTPOSUPTO:
2649        case OP_NOTPOSUPTOI:
2650    
2651        if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2652      break;      break;
2653  #endif  #endif
2654    
# Line 2173  for (code = first_significant_code(code Line 2658  for (code = first_significant_code(code
2658      case OP_MARK:      case OP_MARK:
2659      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
2660      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     code += code[1];  
     break;  
   
2661      case OP_THEN_ARG:      case OP_THEN_ARG:
2662      code += code[1+LINK_SIZE];      code += code[1];
2663      break;      break;
2664    
2665      /* None of the remaining opcodes are required to match a character. */      /* None of the remaining opcodes are required to match a character. */
# Line 2200  return TRUE; Line 2682  return TRUE;
2682  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2683  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2684  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2685    This function is called only during the real compile, not during the
2686    pre-compile.
2687    
2688  Arguments:  Arguments:
2689    code        points to start of the recursion    code        points to start of the recursion
2690    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2691    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2692    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2693    cd          pointers to tables etc    cd          pointers to tables etc
2694    
2695  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2696  */  */
2697    
2698  static BOOL  static BOOL
2699  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2700    BOOL utf8, compile_data *cd)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2701  {  {
2702  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2703    {    {
2704    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2705      return FALSE;      return FALSE;
2706    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2707    }    }
2708  return TRUE;  return TRUE;
2709    }
2710    
2711    
2712    
2713    /*************************************************
2714    *        Base opcode of repeated opcodes         *
2715    *************************************************/
2716    
2717    /* Returns the base opcode for repeated single character type opcodes. If the
2718    opcode is not a repeated character type, it returns with the original value.
2719    
2720    Arguments:  c opcode
2721    Returns:    base opcode for the type
2722    */
2723    
2724    static pcre_uchar
2725    get_repeat_base(pcre_uchar c)
2726    {
2727    return (c > OP_TYPEPOSUPTO)? c :
2728           (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2729           (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2730           (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2731           (c >= OP_STARI)?      OP_STARI :
2732                                 OP_STAR;
2733    }
2734    
2735    
2736    
#ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* This function is called by check_auto_possessive() when a property item
is adjacent to a fixed character. For each property type it works out whether
the character has the property, and compares that with the "negated" flag:
the result is TRUE when the two agree, that is, when the (possibly negated)
property item cannot match the character.

In the PT_CLIST case the list that is scanned is the one selected by the
character's own "caseset" field in its UCD record; pdata is not consulted.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
               FALSE for an unrecognized property type
*/

static BOOL
check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
{
const ucd_record *prop = GET_UCD(c);
const pcre_uint32 *setp;
BOOL has_prop;

switch(ptype)
  {
  case PT_LAMP:
  has_prop = prop->chartype == ucp_Lu ||
             prop->chartype == ucp_Ll ||
             prop->chartype == ucp_Lt;
  return has_prop == negated;

  case PT_GC:
  has_prop = pdata == PRIV(ucp_gentype)[prop->chartype];
  return has_prop == negated;

  case PT_PC:
  has_prop = pdata == prop->chartype;
  return has_prop == negated;

  case PT_SC:
  has_prop = pdata == prop->script;
  return has_prop == negated;

  /* These are specials */

  case PT_ALNUM:
  has_prop = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
             PRIV(ucp_gentype)[prop->chartype] == ucp_N;
  return has_prop == negated;

  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
  means that Perl space and POSIX space are now identical. PCRE was changed
  at release 8.34. */

  case PT_SPACE:    /* Perl space */
  case PT_PXSPACE:  /* POSIX space */
  switch(c)
    {
    /* The explicitly listed horizontal and vertical space characters always
    have the property. */

    HSPACE_CASES:
    VSPACE_CASES:
    return negated;

    /* Anything else is a space only if its general category is Z. */

    default:
    has_prop = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
    return has_prop == negated;
    }

  case PT_WORD:
  has_prop = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
             PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
             c == CHAR_UNDERSCORE;
  return has_prop == negated;

  /* Scan the caseless set; the loop ends as soon as an entry that is greater
  than or equal to the character is seen. */

  case PT_CLIST:
  for (setp = PRIV(ucd_caseless_sets) + prop->caseset; ; setp++)
    {
    if (c < *setp) return !negated;
    if (c == *setp) return negated;
    }
  }

return FALSE;
}
#endif  /* SUPPORT_UCP */
2818    
2819    
2820    
2821    /*************************************************
2822    *        Fill the character property list        *
2823    *************************************************/
2824    
2825    /* Checks whether the code points to an opcode that can take part in auto-
2826    possessification, and if so, fills a list with its properties.
2827    
2828    Arguments:
2829      code        points to start of expression
2830      utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2831      fcc         points to case-flipping table
2832      list        points to output list
2833                  list[0] will be filled with the opcode
2834                  list[1] will be non-zero if this opcode
2835                    can match an empty character string
2836                  list[2..7] depends on the opcode
2837    
2838    Returns:      points to the start of the next opcode if *code is accepted
2839                  NULL if *code is not accepted
2840    */
2841    
2842    static const pcre_uchar *
2843    get_chr_property_list(const pcre_uchar *code, BOOL utf,
2844      const pcre_uint8 *fcc, pcre_uint32 *list)
2845    {
2846    pcre_uchar c = *code;
2847    const pcre_uchar *end;
2848    const pcre_uint32 *clist_src;
2849    pcre_uint32 *clist_dest;
2850    pcre_uint32 chr;
2851    pcre_uchar base;
2852    
2853    list[0] = c;
2854    list[1] = FALSE;
2855    code++;
2856    
2857    if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2858      {
2859      base = get_repeat_base(c);
2860      c -= (base - OP_STAR);
2861    
2862      if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2863        code += IMM2_SIZE;
2864    
2865      list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2866    
2867      switch(base)
2868        {
2869        case OP_STAR:
2870        list[0] = OP_CHAR;
2871        break;
2872    
2873        case OP_STARI:
2874        list[0] = OP_CHARI;
2875        break;
2876    
2877        case OP_NOTSTAR:
2878        list[0] = OP_NOT;
2879        break;
2880    
2881        case OP_NOTSTARI:
2882        list[0] = OP_NOTI;
2883        break;
2884    
2885        case OP_TYPESTAR:
2886        list[0] = *code;
2887        code++;
2888        break;
2889        }
2890      c = list[0];
2891      }
2892    
2893    switch(c)
2894      {
2895      case OP_NOT_DIGIT:
2896      case OP_DIGIT:
2897      case OP_NOT_WHITESPACE:
2898      case OP_WHITESPACE:
2899      case OP_NOT_WORDCHAR:
2900      case OP_WORDCHAR:
2901      case OP_ANY:
2902      case OP_ALLANY:
2903      case OP_ANYNL:
2904      case OP_NOT_HSPACE:
2905      case OP_HSPACE:
2906      case OP_NOT_VSPACE:
2907      case OP_VSPACE:
2908      case OP_EXTUNI:
2909      case OP_EODN:
2910      case OP_EOD:
2911      case OP_DOLL:
2912      case OP_DOLLM:
2913      return code;
2914    
2915      case OP_CHAR:
2916      case OP_NOT:
2917      GETCHARINCTEST(chr, code);
2918      list[2] = chr;
2919      list[3] = NOTACHAR;
2920      return code;
2921    
2922      case OP_CHARI:
2923      case OP_NOTI:
2924      list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2925      GETCHARINCTEST(chr, code);
2926      list[2] = chr;
2927    
2928    #ifdef SUPPORT_UCP
2929      if (chr < 128 || (chr < 256 && !utf))
2930        list[3] = fcc[chr];
2931      else
2932        list[3] = UCD_OTHERCASE(chr);
2933    #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2934      list[3] = (chr < 256) ? fcc[chr] : chr;
2935    #else
2936      list[3] = fcc[chr];
2937    #endif
2938    
2939      /* The othercase might be the same value. */
2940    
2941      if (chr == list[3])
2942        list[3] = NOTACHAR;
2943      else
2944        list[4] = NOTACHAR;
2945      return code;
2946    
2947    #ifdef SUPPORT_UCP
2948      case OP_PROP:
2949      case OP_NOTPROP:
2950      if (code[0] != PT_CLIST)
2951        {
2952        list[2] = code[0];
2953        list[3] = code[1];
2954        return code + 2;
2955        }
2956    
2957      /* Convert only if we have enough space. */
2958    
2959      clist_src = PRIV(ucd_caseless_sets) + code[1];
2960      clist_dest = list + 2;
2961      code += 2;
2962    
2963      do {
2964         if (clist_dest >= list + 8)
2965           {
2966           /* Early return if there is not enough space. This should never
2967           happen, since all clists are shorter than 5 character now. */
2968           list[2] = code[0];
2969           list[3] = code[1];
2970           return code;
2971           }
2972         *clist_dest++ = *clist_src;
2973         }
2974      while(*clist_src++ != NOTACHAR);
2975    
2976      /* All characters are stored. The terminating NOTACHAR
2977      is copied form the clist itself. */
2978    
2979      list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2980      return code;
2981    #endif
2982    
2983      case OP_NCLASS:
2984      case OP_CLASS:
2985    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2986      case OP_XCLASS:
2987      if (c == OP_XCLASS)
2988        end = code + GET(code, 0) - 1;
2989      else
2990    #endif
2991        end = code + 32 / sizeof(pcre_uchar);
2992    
2993      switch(*end)
2994        {
2995        case OP_CRSTAR:
2996        case OP_CRMINSTAR:
2997        case OP_CRQUERY:
2998        case OP_CRMINQUERY:
2999        case OP_CRPOSSTAR:
3000        case OP_CRPOSQUERY:
3001        list[1] = TRUE;
3002        end++;
3003        break;
3004    
3005        case OP_CRPLUS:
3006        case OP_CRMINPLUS:
3007        case OP_CRPOSPLUS:
3008        end++;
3009        break;
3010    
3011        case OP_CRRANGE:
3012        case OP_CRMINRANGE:
3013        case OP_CRPOSRANGE:
3014        list[1] = (GET2(end, 1) == 0);
3015        end += 1 + 2 * IMM2_SIZE;
3016        break;
3017        }
3018      list[2] = end - code;
3019      return end;
3020      }
3021    return NULL;    /* Opcode not accepted */
3022    }
3023    
3024    
3025    
3026    /*************************************************
3027    *    Scan further character sets for match       *
3028    *************************************************/
3029    
3030    /* Checks whether the base and the current opcode have a common character, in
3031    which case the base cannot be possessified.
3032    
3033    Arguments:
3034      code        points to the byte code
3035      utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3036      cd          static compile data
3037      base_list   the data list of the base opcode
3038    
3039    Returns:      TRUE if the auto-possessification is possible
3040    */
3041    
3042    static BOOL
3043    compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3044      const pcre_uint32 *base_list, const pcre_uchar *base_end)
3045    {
3046    pcre_uchar c;
3047    pcre_uint32 list[8];
3048    const pcre_uint32 *chr_ptr;
3049    const pcre_uint32 *ochr_ptr;
3050    const pcre_uint32 *list_ptr;
3051    const pcre_uchar *next_code;
3052    const pcre_uint8 *class_bitset;
3053    const pcre_uint32 *set1, *set2, *set_end;
3054    pcre_uint32 chr;
3055    BOOL accepted, invert_bits;
3056    
3057    /* Note: the base_list[1] contains whether the current opcode has greedy
3058    (represented by a non-zero value) quantifier. This is a different from
3059    other character type lists, which stores here that the character iterator
3060    matches to an empty string (also represented by a non-zero value). */
3061    
3062    for(;;)
3063      {
3064      /* All operations move the code pointer forward.
3065      Therefore infinite recursions are not possible. */
3066    
3067      c = *code;
3068    
3069      /* Skip over callouts */
3070    
3071      if (c == OP_CALLOUT)
3072        {
3073        code += PRIV(OP_lengths)[c];
3074        continue;
3075        }
3076    
3077      if (c == OP_ALT)
3078        {
3079        do code += GET(code, 1); while (*code == OP_ALT);
3080        c = *code;
3081        }
3082    
3083      switch(c)
3084        {
3085        case OP_END:
3086        case OP_KETRPOS:
3087        /* TRUE only in greedy case. The non-greedy case could be replaced by
3088        an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3089        uses more memory, which we cannot get at this stage.) */
3090    
3091        return base_list[1] != 0;
3092    
3093        case OP_KET:
3094        /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3095        it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3096        cannot be converted to a possessive form. */
3097    
3098        if (base_list[1] == 0) return FALSE;
3099    
3100        switch(*(code - GET(code, 1)))
3101          {
3102          case OP_ASSERT:
3103          case OP_ASSERT_NOT:
3104          case OP_ASSERTBACK:
3105          case OP_ASSERTBACK_NOT:
3106          case OP_ONCE:
3107          case OP_ONCE_NC:
3108          /* Atomic sub-patterns and assertions can always auto-possessify their
3109          last iterator. */
3110          return TRUE;
3111          }
3112    
3113        code += PRIV(OP_lengths)[c];
3114        continue;
3115    
3116        case OP_ONCE:
3117        case OP_ONCE_NC:
3118        case OP_BRA:
3119        case OP_CBRA:
3120        next_code = code + GET(code, 1);
3121        code += PRIV(OP_lengths)[c];
3122    
3123        while (*next_code == OP_ALT)
3124          {
3125          if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3126          code = next_code + 1 + LINK_SIZE;
3127          next_code += GET(next_code, 1);
3128          }
3129        continue;
3130    
3131        case OP_BRAZERO:
3132        case OP_BRAMINZERO:
3133    
3134        next_code = code + 1;
3135        if (*next_code != OP_BRA && *next_code != OP_CBRA
3136            && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3137    
3138        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3139    
3140        /* The bracket content will be checked by the
3141        OP_BRA/OP_CBRA case above. */
3142        next_code += 1 + LINK_SIZE;
3143        if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3144          return FALSE;
3145    
3146        code += PRIV(OP_lengths)[c];
3147        continue;
3148        }
3149    
3150      /* Check for a supported opcode, and load its properties. */
3151    
3152      code = get_chr_property_list(code, utf, cd->fcc, list);
3153      if (code == NULL) return FALSE;    /* Unsupported */
3154    
3155      /* If either opcode is a small character list, set pointers for comparing
3156      characters from that list with another list, or with a property. */
3157    
3158      if (base_list[0] == OP_CHAR)
3159        {
3160        chr_ptr = base_list + 2;
3161        list_ptr = list;
3162        }
3163      else if (list[0] == OP_CHAR)
3164        {
3165        chr_ptr = list + 2;
3166        list_ptr = base_list;
3167        }
3168    
3169      /* Character bitsets can also be compared to certain opcodes. */
3170    
3171      else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3172    #ifdef COMPILE_PCRE8
3173          /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3174          || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3175    #endif
3176          )
3177        {
3178    #ifdef COMPILE_PCRE8
3179        if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3180    #else
3181        if (base_list[0] == OP_CLASS)
3182    #endif
3183          {
3184          set1 = (pcre_uint32 *)(base_end - base_list[2]);
3185          list_ptr = list;
3186          }
3187        else
3188          {
3189          set1 = (pcre_uint32 *)(code - list[2]);
3190          list_ptr = base_list;
3191          }
3192    
3193        invert_bits = FALSE;
3194        switch(list_ptr[0])
3195          {
3196          case OP_CLASS:
3197          case OP_NCLASS:
3198          set2 = (pcre_uint32 *)
3199            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3200          break;
3201    
3202          /* OP_XCLASS cannot be supported here, because its bitset
3203          is not necessarily complete. E.g: [a-\0x{200}] is stored
3204          as a character range, and the appropriate bits are not set. */
3205    
3206          case OP_NOT_DIGIT:
3207            invert_bits = TRUE;
3208            /* Fall through */
3209          case OP_DIGIT:
3210            set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
3211            break;
3212    
3213          case OP_NOT_WHITESPACE:
3214            invert_bits = TRUE;
3215            /* Fall through */
3216          case OP_WHITESPACE:
3217            set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
3218            break;
3219    
3220          case OP_NOT_WORDCHAR:
3221            invert_bits = TRUE;
3222            /* Fall through */
3223          case OP_WORDCHAR:
3224            set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
3225            break;
3226    
3227          default:
3228          return FALSE;
3229          }
3230    
3231        /* Compare 4 bytes to improve speed. */
3232        set_end = set1 + (32 / 4);
3233        if (invert_bits)
3234          {
3235          do
3236            {
3237            if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3238            }
3239          while (set1 < set_end);
3240          }
3241        else
3242          {
3243          do
3244            {
3245            if ((*set1++ & *set2++) != 0) return FALSE;
3246            }
3247          while (set1 < set_end);
3248          }
3249    
3250        if (list[1] == 0) return TRUE;
3251        /* Might be an empty repeat. */
3252        continue;
3253        }
3254    
3255      /* Some property combinations also acceptable. Unicode property opcodes are
3256      processed specially; the rest can be handled with a lookup table. */
3257    
3258      else
3259        {
3260        pcre_uint32 leftop, rightop;
3261    
3262        leftop = base_list[0];
3263        rightop = list[0];
3264    
3265    #ifdef SUPPORT_UCP
3266        accepted = FALSE; /* Always set in non-unicode case. */
3267        if (leftop == OP_PROP || leftop == OP_NOTPROP)
3268          {
3269          if (rightop == OP_EOD)
3270            accepted = TRUE;
3271          else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3272            {
3273            int n;
3274            const pcre_uint8 *p;
3275            BOOL same = leftop == rightop;
3276            BOOL lisprop = leftop == OP_PROP;
3277            BOOL risprop = rightop == OP_PROP;
3278            BOOL bothprop = lisprop && risprop;
3279    
3280            /* There's a table that specifies how each combination is to be
3281            processed:
3282              0   Always return FALSE (never auto-possessify)
3283              1   Character groups are distinct (possessify if both are OP_PROP)
3284              2   Check character categories in the same group (general or particular)
3285              3   Return TRUE if the two opcodes are not the same
3286              ... see comments below
3287            */
3288    
3289            n = propposstab[base_list[2]][list[2]];
3290            switch(n)
3291              {
3292              case 0: break;
3293              case 1: accepted = bothprop; break;
3294              case 2: accepted = (base_list[3] == list[3]) != same; break;
3295              case 3: accepted = !same; break;
3296    
3297              case 4:  /* Left general category, right particular category */
3298              accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3299              break;
3300    
3301              case 5:  /* Right general category, left particular category */
3302              accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3303              break;
3304    
3305              /* This code is logically tricky. Think hard before fiddling with it.
3306              The posspropstab table has four entries per row. Each row relates to
3307              one of PCRE's special properties such as ALNUM or SPACE or WORD.
3308              Only WORD actually needs all four entries, but using repeats for the
3309              others means they can all use the same code below.
3310    
3311              The first two entries in each row are Unicode general categories, and
3312              apply always, because all the characters they include are part of the
3313              PCRE character set. The third and fourth entries are a general and a
3314              particular category, respectively, that include one or more relevant
3315              characters. One or the other is used, depending on whether the check
3316              is for a general or a particular category. However, in both cases the
3317              category contains more characters than the specials that are defined
3318              for the property being tested against. Therefore, it cannot be used
3319              in a NOTPROP case.
3320    
3321              Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3322              Underscore is covered by ucp_P or ucp_Po. */
3323    
3324              case 6:  /* Left alphanum vs right general category */
3325              case 7:  /* Left space vs right general category */
3326              case 8:  /* Left word vs right general category */
3327              p = posspropstab[n-6];
3328              accepted = risprop && lisprop ==
3329                (list[3] != p[0] &&
3330                 list[3] != p[1] &&
3331                (list[3] != p[2] || !lisprop));
3332              break;
3333    
3334              case 9:   /* Right alphanum vs left general category */
3335              case 10:  /* Right space vs left general category */
3336              case 11:  /* Right word vs left general category */
3337              p = posspropstab[n-9];
3338              accepted = lisprop && risprop ==
3339                (base_list[3] != p[0] &&
3340                 base_list[3] != p[1] &&
3341                (base_list[3] != p[2] || !risprop));
3342              break;
3343    
3344              case 12:  /* Left alphanum vs right particular category */
3345              case 13:  /* Left space vs right particular category */
3346              case 14:  /* Left word vs right particular category */
3347              p = posspropstab[n-12];
3348              accepted = risprop && lisprop ==
3349                (catposstab[p[0]][list[3]] &&
3350                 catposstab[p[1]][list[3]] &&
3351                (list[3] != p[3] || !lisprop));
3352              break;
3353    
3354              case 15:  /* Right alphanum vs left particular category */
3355              case 16:  /* Right space vs left particular category */
3356              case 17:  /* Right word vs left particular category */
3357              p = posspropstab[n-15];
3358              accepted = lisprop && risprop ==
3359                (catposstab[p[0]][base_list[3]] &&
3360                 catposstab[p[1]][base_list[3]] &&
3361                (base_list[3] != p[3] || !risprop));
3362              break;
3363              }
3364            }
3365          }
3366    
3367        else
3368    #endif  /* SUPPORT_UCP */
3369    
3370        accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3371               rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3372               autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3373    
3374        if (!accepted)
3375          return FALSE;
3376    
3377        if (list[1] == 0) return TRUE;
3378        /* Might be an empty repeat. */
3379        continue;
3380        }
3381    
3382      /* Control reaches here only if one of the items is a small character list.
3383      All characters are checked against the other side. */
3384    
3385      do
3386        {
3387        chr = *chr_ptr;
3388    
3389        switch(list_ptr[0])
3390          {
3391          case OP_CHAR:
3392          ochr_ptr = list_ptr + 2;
3393          do
3394            {
3395            if (chr == *ochr_ptr) return FALSE;
3396            ochr_ptr++;
3397            }
3398          while(*ochr_ptr != NOTACHAR);
3399          break;
3400    
3401          case OP_NOT:
3402          ochr_ptr = list_ptr + 2;
3403          do
3404            {
3405            if (chr == *ochr_ptr)
3406              break;
3407            ochr_ptr++;
3408            }
3409          while(*ochr_ptr != NOTACHAR);
3410          if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3411          break;
3412    
3413          /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3414          set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3415    
3416          case OP_DIGIT:
3417          if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3418          break;
3419    
3420          case OP_NOT_DIGIT:
3421          if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3422          break;
3423    
3424          case OP_WHITESPACE:
3425          if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3426          break;
3427    
3428          case OP_NOT_WHITESPACE:
3429          if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3430          break;
3431    
3432          case OP_WORDCHAR:
3433          if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3434          break;
3435    
3436          case OP_NOT_WORDCHAR:
3437          if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3438          break;
3439    
3440          case OP_HSPACE:
3441          switch(chr)
3442            {
3443            HSPACE_CASES: return FALSE;
3444            default: break;
3445            }
3446          break;
3447    
3448          case OP_NOT_HSPACE:
3449          switch(chr)
3450            {
3451            HSPACE_CASES: break;
3452            default: return FALSE;
3453            }
3454          break;
3455    
3456          case OP_ANYNL:
3457          case OP_VSPACE:
3458          switch(chr)
3459            {
3460            VSPACE_CASES: return FALSE;
3461            default: break;
3462            }
3463          break;
3464    
3465          case OP_NOT_VSPACE:
3466          switch(chr)
3467            {
3468            VSPACE_CASES: break;
3469            default: return FALSE;
3470            }
3471          break;
3472    
3473          case OP_DOLL:
3474          case OP_EODN:
3475          switch (chr)
3476            {
3477            case CHAR_CR:
3478            case CHAR_LF:
3479            case CHAR_VT:
3480            case CHAR_FF:
3481            case CHAR_NEL:
3482    #ifndef EBCDIC
3483            case 0x2028:
3484            case 0x2029:
3485    #endif  /* Not EBCDIC */
3486            return FALSE;
3487            }
3488          break;
3489    
3490          case OP_EOD:    /* Can always possessify before \z */
3491          break;
3492    
3493    #ifdef SUPPORT_UCP
3494          case OP_PROP:
3495          case OP_NOTPROP:
3496          if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3497                list_ptr[0] == OP_NOTPROP))
3498            return FALSE;
3499          break;
3500    #endif
3501    
3502          case OP_NCLASS:
3503          if (chr > 255) return FALSE;
3504          /* Fall through */
3505    
3506          case OP_CLASS:
3507          if (chr > 255) break;
3508          class_bitset = (pcre_uint8 *)
3509            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3510          if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3511          break;
3512    
3513    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3514          case OP_XCLASS:
3515          if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3516              list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3517          break;
3518    #endif
3519    
3520          default:
3521          return FALSE;
3522          }
3523    
3524        chr_ptr++;
3525        }
3526      while(*chr_ptr != NOTACHAR);
3527    
3528      /* At least one character must be matched from this opcode. */
3529    
3530      if (list[1] == 0) return TRUE;
3531      }
3532    
3533    return FALSE;
3534    }
3535    
3536    
3537    
3538    /*************************************************
3539    *    Scan compiled regex for auto-possession     *
3540    *************************************************/
3541    
3542    /* Replaces single character iterations with their possessive alternatives
3543    if appropriate. This function modifies the compiled opcode!
3544    
3545    Arguments:
3546      code        points to start of the byte code
3547      utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3548      cd          static compile data
3549    
3550    Returns:      nothing
3551    */
3552    
3553    static void
3554    auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3555    {
3556    register pcre_uchar c;
3557    const pcre_uchar *end;
3558    pcre_uchar *repeat_opcode;
3559    pcre_uint32 list[8];
3560    
3561    for (;;)
3562      {
3563      c = *code;
3564    
3565      if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3566        {
3567        c -= get_repeat_base(c) - OP_STAR;
3568        end = (c <= OP_MINUPTO) ?
3569          get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3570        list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3571    
3572        if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3573          {
3574          switch(c)
3575            {
3576            case OP_STAR:
3577            *code += OP_POSSTAR - OP_STAR;
3578            break;
3579    
3580            case OP_MINSTAR:
3581            *code += OP_POSSTAR - OP_MINSTAR;
3582            break;
3583    
3584            case OP_PLUS:
3585            *code += OP_POSPLUS - OP_PLUS;
3586            break;
3587    
3588            case OP_MINPLUS:
3589            *code += OP_POSPLUS - OP_MINPLUS;
3590            break;
3591    
3592            case OP_QUERY:
3593            *code += OP_POSQUERY - OP_QUERY;
3594            break;
3595    
3596            case OP_MINQUERY:
3597            *code += OP_POSQUERY - OP_MINQUERY;
3598            break;
3599    
3600            case OP_UPTO:
3601            *code += OP_POSUPTO - OP_UPTO;
3602            break;
3603    
3604            case OP_MINUPTO:
3605            *code += OP_MINUPTO - OP_UPTO;
3606            break;
3607            }
3608          }
3609        c = *code;
3610        }
3611      else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3612        {
3613    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3614        if (c == OP_XCLASS)
3615          repeat_opcode = code + GET(code, 1);
3616        else
3617    #endif
3618          repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3619    
3620        c = *repeat_opcode;
3621        if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3622          {
3623          /* end must not be NULL. */
3624          end = get_chr_property_list(code, utf, cd->fcc, list);
3625    
3626          list[1] = (c & 1) == 0;
3627    
3628          if (compare_opcodes(end, utf, cd, list, end))
3629            {
3630            switch (c)
3631              {
3632              case OP_CRSTAR: