/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 274 by ph10, Tue Nov 20 10:05:23 2007 UTC revision 807 by ph10, Sun Dec 18 10:03:38 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2011 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When DEBUG is defined, we need the pcre_printint() function, which is also  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57  used by pcretest. DEBUG is not defined when building a production library. */  also used by pcretest. PCRE_DEBUG is not defined when building a production
58    library. */
59    
60  #ifdef DEBUG  #ifdef PCRE_DEBUG
61  #include "pcre_printint.src"  #include "pcre_printint.src"
62  #endif  #endif
63    
# Line 87  so this number is very generous. Line 88  so this number is very generous.
88  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
89  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
90  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
92    filled up by repetitions of forward references, for example patterns like
93    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
94    that the workspace is expanded using malloc() in this situation. The value
95    below is therefore a minimum, and we put a maximum on it for safety. The
96    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
97    kicks in at the same number of forward references in all cases. */
98    
99  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
100    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
101    
102    /* The overrun tests check for a slightly smaller size so that they detect the
103    overrun before it actually does run off the end of the data block. */
104    
105    #define WORK_SIZE_SAFETY_MARGIN (100)
106    
107    
108  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 97  are simple data values; negative values Line 110  are simple data values; negative values
110  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
111  is invalid. */  is invalid. */
112    
113  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC
114    
115    /* This is the "normal" table for ASCII systems or for EBCDIC systems running
116    in UTF-8 mode. */
117    
118  static const short int escapes[] = {  static const short int escapes[] = {
119       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,                       0,
120       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,                       0,
121     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */       0,                       0,
122  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */       0,                       0,
123  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */       0,                       0,
124  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */       CHAR_COLON,              CHAR_SEMICOLON,
125     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
126  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
127  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */       CHAR_COMMERCIAL_AT,      -ESC_A,
128       0,      0, -ESC_z                                            /* x - z */       -ESC_B,                  -ESC_C,
129         -ESC_D,                  -ESC_E,
130         0,                       -ESC_G,
131         -ESC_H,                  0,
132         0,                       -ESC_K,
133         0,                       0,
134         -ESC_N,                  0,
135         -ESC_P,                  -ESC_Q,
136         -ESC_R,                  -ESC_S,
137         0,                       0,
138         -ESC_V,                  -ESC_W,
139         -ESC_X,                  0,
140         -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
141         CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
142         CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
143         CHAR_GRAVE_ACCENT,       7,
144         -ESC_b,                  0,
145         -ESC_d,                  ESC_e,
146         ESC_f,                   0,
147         -ESC_h,                  0,
148         0,                       -ESC_k,
149         0,                       0,
150         ESC_n,                   0,
151         -ESC_p,                  0,
152         ESC_r,                   -ESC_s,
153         ESC_tee,                 0,
154         -ESC_v,                  -ESC_w,
155         0,                       0,
156         -ESC_z
157  };  };
158    
159  #else           /* This is the "abnormal" table for EBCDIC systems */  #else
160    
161    /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
162    
163  static const short int escapes[] = {  static const short int escapes[] = {
164  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
165  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 130  static const short int escapes[] = { Line 178  static const short int escapes[] = {
178  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
179  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
180  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
181  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
182  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
183  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
184  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
# Line 142  static const short int escapes[] = { Line 190  static const short int escapes[] = {
190    
191  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
192  searched linearly. Put all the names into a single string, in order to reduce  searched linearly. Put all the names into a single string, in order to reduce
193  the number of relocations when a shared library is dynamically linked. */  the number of relocations when a shared library is dynamically linked. The
194    string is built from string macros so that it works in UTF-8 mode on EBCDIC
195    platforms. */
196    
197  typedef struct verbitem {  typedef struct verbitem {
198    int   len;    int   len;                 /* Length of verb name */
199    int   op;    int   op;                  /* Op when no arg, or -1 if arg mandatory */
200      int   op_arg;              /* Op when arg present, or -1 if not allowed */
201  } verbitem;  } verbitem;
202    
203  static const char verbnames[] =  static const char verbnames[] =
204    "ACCEPT\0"    "\0"                       /* Empty name is a shorthand for MARK */
205    "COMMIT\0"    STRING_MARK0
206    "F\0"    STRING_ACCEPT0
207    "FAIL\0"    STRING_COMMIT0
208    "PRUNE\0"    STRING_F0
209    "SKIP\0"    STRING_FAIL0
210    "THEN";    STRING_PRUNE0
211      STRING_SKIP0
212  static verbitem verbs[] = {    STRING_THEN;
213    { 6, OP_ACCEPT },  
214    { 6, OP_COMMIT },  static const verbitem verbs[] = {
215    { 1, OP_FAIL },    { 0, -1,        OP_MARK },
216    { 4, OP_FAIL },    { 4, -1,        OP_MARK },
217    { 5, OP_PRUNE },    { 6, OP_ACCEPT, -1 },
218    { 4, OP_SKIP  },    { 6, OP_COMMIT, -1 },
219    { 4, OP_THEN  }    { 1, OP_FAIL,   -1 },
220      { 4, OP_FAIL,   -1 },
221      { 5, OP_PRUNE,  OP_PRUNE_ARG },
222      { 4, OP_SKIP,   OP_SKIP_ARG  },
223      { 4, OP_THEN,   OP_THEN_ARG  }
224  };  };
225    
226  static int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
227    
228    
229  /* Tables of names of POSIX character classes and their lengths. The names are  /* Tables of names of POSIX character classes and their lengths. The names are
# Line 178  length entry. The first three must be al Line 233  length entry. The first three must be al
233  for handling case independence. */  for handling case independence. */
234    
235  static const char posix_names[] =  static const char posix_names[] =
236    "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
237    "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"    STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
238    "word\0"   "xdigit";    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
239      STRING_word0  STRING_xdigit;
240    
241  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
242    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
# Line 212  static const int posix_class_maps[] = { Line 268  static const int posix_class_maps[] = {
268    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
269  };  };
270    
271    /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
272    substitutes must be in the order of the names, defined above, and there are
273    both positive and negative cases. NULL means no substitute. */
274    
#ifdef SUPPORT_UCP

/* Replacement pattern text for \D, \d, \S, \s, \W and \w, in that order. The
strings are read-only and so is the table of pointers itself, hence the const
on both sides of the '*'. */

static const uschar * const substitutes[] = {
  (uschar *)"\\P{Nd}",    /* \D */
  (uschar *)"\\p{Nd}",    /* \d */
  (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */
  (uschar *)"\\p{Xsp}",   /* \s */
  (uschar *)"\\P{Xwd}",   /* \W */
  (uschar *)"\\p{Xwd}"    /* \w */
};

/* Replacement pattern text for the POSIX classes. The first half of the table
holds the positive forms and the second half holds the negated forms in the
same class order. A NULL entry means that the class has no substitute. As
above, the table of pointers is itself read-only. */

static const uschar * const posix_substitutes[] = {
  (uschar *)"\\p{L}",     /* alpha */
  (uschar *)"\\p{Ll}",    /* lower */
  (uschar *)"\\p{Lu}",    /* upper */
  (uschar *)"\\p{Xan}",   /* alnum */
  NULL,                   /* ascii */
  (uschar *)"\\h",        /* blank */
  NULL,                   /* cntrl */
  (uschar *)"\\p{Nd}",    /* digit */
  NULL,                   /* graph */
  NULL,                   /* print */
  NULL,                   /* punct */
  (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */
  (uschar *)"\\p{Xwd}",   /* word */
  NULL,                   /* xdigit */
  /* Negated cases */
  (uschar *)"\\P{L}",     /* ^alpha */
  (uschar *)"\\P{Ll}",    /* ^lower */
  (uschar *)"\\P{Lu}",    /* ^upper */
  (uschar *)"\\P{Xan}",   /* ^alnum */
  NULL,                   /* ^ascii */
  (uschar *)"\\H",        /* ^blank */
  NULL,                   /* ^cntrl */
  (uschar *)"\\P{Nd}",    /* ^digit */
  NULL,                   /* ^graph */
  NULL,                   /* ^print */
  NULL,                   /* ^punct */
  (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */
  (uschar *)"\\P{Xwd}",   /* ^word */
  NULL                    /* ^xdigit */
};

/* Number of entries in posix_substitutes (positive plus negated forms). It is
computed from the array element so that it stays correct if the element type
is ever changed. */

#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(posix_substitutes[0]))
#endif
318    
319  #define STRING(a)  # a  #define STRING(a)  # a
320  #define XSTRING(s) STRING(s)  #define XSTRING(s) STRING(s)
# Line 224  the number of relocations needed when a Line 327  the number of relocations needed when a
327  it is now one long string. We cannot use a table of offsets, because the  it is now one long string. We cannot use a table of offsets, because the
328  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
329  simply count through to the one we want - this isn't a performance issue  simply count through to the one we want - this isn't a performance issue
330  because these strings are used only when there is a compilation error. */  because these strings are used only when there is a compilation error.
331    
332    Each substring ends with \0 to insert a null character. This includes the final
333    substring, so that the whole string ends with \0\0, which can be detected when
334    counting through. */
335    
336  static const char error_texts[] =  static const char error_texts[] =
337    "no error\0"    "no error\0"
# Line 271  static const char error_texts[] = Line 378  static const char error_texts[] =
378    /* 35 */    /* 35 */
379    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
380    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
381    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"    "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
382    "number after (?C is > 255\0"    "number after (?C is > 255\0"
383    "closing ) for (?C expected\0"    "closing ) for (?C expected\0"
384    /* 40 */    /* 40 */
# Line 293  static const char error_texts[] = Line 400  static const char error_texts[] =
400    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
401    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
402    /* 55 */    /* 55 */
403    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
404    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
405    "\\g is not followed by a braced name or an optionally braced non-zero number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
406    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"    "a numbered reference must not be zero\0"
407    "(*VERB) with an argument is not supported\0"    "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
408    /* 60 */    /* 60 */
409    "(*VERB) not recognized\0"    "(*VERB) not recognized\0"
410    "number is too big\0"    "number is too big\0"
411    "subpattern name expected\0"    "subpattern name expected\0"
412    "digit expected after (?+";    "digit expected after (?+\0"
413      "] is an invalid data character in JavaScript compatibility mode\0"
414      /* 65 */
415      "different names for subpatterns of the same number are not allowed\0"
416      "(*MARK) must have an argument\0"
417      "this version of PCRE is not compiled with PCRE_UCP support\0"
418      "\\c must be followed by an ASCII character\0"
419      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
420      /* 70 */
421      "internal error: unknown opcode in find_fixedlength()\0"
422      "\\N is not supported in a class\0"
423      "too many forward references\0"
424      ;
425    
426  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
427  patterns. Note that the tables in chartables are dependent on the locale, and  patterns. Note that the tables in chartables are dependent on the locale, and
# Line 321  For convenience, we use the same bit def Line 439  For convenience, we use the same bit def
439    
440  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
441    
442  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC
443    
444    /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
445    UTF-8 mode. */
446    
447  static const unsigned char digitab[] =  static const unsigned char digitab[] =
448    {    {
449    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 357  static const unsigned char digitab[] = Line 479  static const unsigned char digitab[] =
479    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
480    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
481    
482  #else           /* This is the "abnormal" case, for EBCDIC systems */  #else
483    
484    /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
485    
486  static const unsigned char digitab[] =  static const unsigned char digitab[] =
487    {    {
488    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 432  static const unsigned char ebcdic_charta Line 557  static const unsigned char ebcdic_charta
557  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
558    
559  static BOOL  static BOOL
560    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,
561      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
562    
563    
# Line 454  static const char * Line 579  static const char *
579  find_error_text(int n)  find_error_text(int n)
580  {  {
581  const char *s = error_texts;  const char *s = error_texts;
582  for (; n > 0; n--) while (*s++ != 0);  for (; n > 0; n--)
583      {
584      while (*s++ != 0) {};
585      if (*s == 0) return "Error text not found (please report)";
586      }
587  return s;  return s;
588  }  }
589    
590    
591  /*************************************************  /*************************************************
592    *           Expand the workspace                 *
593    *************************************************/
594    
595    /* This function is called during the second compiling phase, if the number of
596    forward references fills the existing workspace, which is originally a block on
597    the stack. A larger block is obtained from malloc() unless the ultimate limit
598    has been reached or the increase will be rather small.
599    
600    Argument: pointer to the compile data block
601    Returns:  0 if all went well, else an error number
602    */
603    
604    static int
605    expand_workspace(compile_data *cd)
606    {
607    uschar *newspace;
608    int newsize = cd->workspace_size * 2;
609    
610    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
611    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
612        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
613     return ERR72;
614    
615    newspace = (pcre_malloc)(newsize);
616    if (newspace == NULL) return ERR21;
617    
618    memcpy(newspace, cd->start_workspace, cd->workspace_size);
619    cd->hwm = (uschar *)newspace + (cd->hwm - cd->start_workspace);
620    if (cd->workspace_size > COMPILE_WORK_SIZE)
621      (pcre_free)((void *)cd->start_workspace);
622    cd->start_workspace = newspace;
623    cd->workspace_size = newsize;
624    return 0;
625    }
626    
627    
628    
629    /*************************************************
630    *            Check for counted repeat            *
631    *************************************************/
632    
633    /* This function is called when a '{' is encountered in a place where it might
634    start a quantifier. It looks ahead to see if it really is a quantifier or not.
635    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
636    where the ddds are digits.
637    
638    Arguments:
639      p         pointer to the first char after '{'
640    
641    Returns:    TRUE or FALSE
642    */
643    
644    static BOOL
645    is_counted_repeat(const uschar *p)
646    {
647    if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
648    while ((digitab[*p] & ctype_digit) != 0) p++;
649    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
650    
651    if (*p++ != CHAR_COMMA) return FALSE;
652    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
653    
654    if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
655    while ((digitab[*p] & ctype_digit) != 0) p++;
656    
657    return (*p == CHAR_RIGHT_CURLY_BRACKET);
658    }
659    
660    
661    
662    /*************************************************
663  *            Handle escapes                      *  *            Handle escapes                      *
664  *************************************************/  *************************************************/
665    
# Line 502  if (c == 0) *errorcodeptr = ERR1; Line 702  if (c == 0) *errorcodeptr = ERR1;
702  in a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
703  Otherwise further processing may be required. */  Otherwise further processing may be required. */
704    
705  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
706  else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
707  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
708    
709  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
710  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
# Line 523  else Line 723  else
723      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
724      error. */      error. */
725    
726      case 'l':      case CHAR_l:
727      case 'L':      case CHAR_L:
     case 'N':  
     case 'u':  
     case 'U':  
728      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
729      break;      break;
730    
731      /* \g must be followed by a number, either plain or braced. If positive, it      case CHAR_u:
732      is an absolute backreference. If negative, it is a relative backreference.      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
733      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a        {
734      reference to a named group. This is part of Perl's movement towards a        /* In JavaScript, \u must be followed by four hexadecimal numbers.
735      unified syntax for back references. As this is synonymous with \k{name}, we        /* In JavaScript, \u must be followed by four hexadecimal digits.
736      fudge it up by pretending it really was \k. */        Otherwise it is treated as a literal lowercase letter u. */
737               && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
738            {
739            c = 0;
740            for (i = 0; i < 4; ++i)
741              {
742              register int cc = *(++ptr);
743    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
744              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
745              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
746    #else           /* EBCDIC coding */
747              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
748              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
749    #endif
750              }
751            }
752          }
753        else
754          *errorcodeptr = ERR37;
755        break;
756    
757        case CHAR_U:
758        /* In JavaScript, \U is an uppercase U letter. */
759        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
760        break;
761    
762        /* In a character class, \g is just a literal "g". Outside a character
763        class, \g must be followed by one of a number of specific things:
764    
765      case 'g':      (1) A number, either plain or braced. If positive, it is an absolute
766      if (ptr[1] == '{')      backreference. If negative, it is a relative backreference. This is a Perl
767        5.10 feature.
768    
769        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
770        is part of Perl's movement towards a unified syntax for back references. As
771        this is synonymous with \k{name}, we fudge it up by pretending it really
772        was \k.
773    
774        (3) For Oniguruma compatibility we also support \g followed by a name or a
775        number either in angle brackets or in single quotes. However, these are
776        (possibly recursive) subroutine calls, _not_ backreferences. Just return
777        the -ESC_g code (cf \k). */
778    
779        case CHAR_g:
780        if (isclass) break;
781        if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
782          {
783          c = -ESC_g;
784          break;
785          }
786    
787        /* Handle the Perl-compatible cases */
788    
789        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
790        {        {
791        const uschar *p;        const uschar *p;
792        for (p = ptr+2; *p != 0 && *p != '}'; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
793          if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
794        if (*p != 0 && *p != '}')        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
795          {          {
796          c = -ESC_k;          c = -ESC_k;
797          break;          break;
# Line 554  else Line 801  else
801        }        }
802      else braced = FALSE;      else braced = FALSE;
803    
804      if (ptr[1] == '-')      if (ptr[1] == CHAR_MINUS)
805        {        {
806        negated = TRUE;        negated = TRUE;
807        ptr++;        ptr++;
# Line 563  else Line 810  else
810    
811      c = 0;      c = 0;
812      while ((digitab[ptr[1]] & ctype_digit) != 0)      while ((digitab[ptr[1]] & ctype_digit) != 0)
813        c = c * 10 + *(++ptr) - '0';        c = c * 10 + *(++ptr) - CHAR_0;
814    
815      if (c < 0)      if (c < 0)   /* Integer overflow */
816        {        {
817        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
818        break;        break;
819        }        }
820    
821      if (c == 0 || (braced && *(++ptr) != '}'))      if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
822        {        {
823        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
824        break;        break;
825        }        }
826    
827        if (c == 0)
828          {
829          *errorcodeptr = ERR58;
830          break;
831          }
832    
833      if (negated)      if (negated)
834        {        {
835        if (c > bracount)        if (c > bracount)
# Line 602  else Line 855  else
855      value is greater than 377, the least significant 8 bits are taken. Inside a      value is greater than 377, the least significant 8 bits are taken. Inside a
856      character class, \ followed by a digit is always an octal number. */      character class, \ followed by a digit is always an octal number. */
857    
858      case '1': case '2': case '3': case '4': case '5':      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
859      case '6': case '7': case '8': case '9':      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
860    
861      if (!isclass)      if (!isclass)
862        {        {
863        oldptr = ptr;        oldptr = ptr;
864        c -= '0';        c -= CHAR_0;
865        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
866          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - CHAR_0;
867        if (c < 0)        if (c < 0)    /* Integer overflow */
868          {          {
869          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
870          break;          break;
# Line 628  else Line 881  else
881      generates a binary zero byte and treats the digit as a following literal.      generates a binary zero byte and treats the digit as a following literal.
882      Thus we have to pull back the pointer by one. */      Thus we have to pull back the pointer by one. */
883    
884      if ((c = *ptr) >= '8')      if ((c = *ptr) >= CHAR_8)
885        {        {
886        ptr--;        ptr--;
887        c = 0;        c = 0;
# Line 641  else Line 894  else
894      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
895      than 3 octal digits. */      than 3 octal digits. */
896    
897      case '0':      case CHAR_0:
898      c -= '0';      c -= CHAR_0;
899      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
900          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - CHAR_0;
901      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf8 && c > 255) *errorcodeptr = ERR51;
902      break;      break;
903    
# Line 652  else Line 905  else
905      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
906      treated as a data character. */      treated as a data character. */
907    
908      case 'x':      case CHAR_x:
909      if (ptr[1] == '{')      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
910          {
911          /* In JavaScript, \x must be followed by two hexadecimal digits.
912          Otherwise it is treated as a literal lowercase letter x. */
913          if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
914            {
915            c = 0;
916            for (i = 0; i < 2; ++i)
917              {
918              register int cc = *(++ptr);
919    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
920              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
921              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
922    #else           /* EBCDIC coding */
923              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
924              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
925    #endif
926              }
927            }
928          break;
929          }
930    
931        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
932        {        {
933        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
934        int count = 0;        int count = 0;
# Line 662  else Line 937  else
937        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
938          {          {
939          register int cc = *pt++;          register int cc = *pt++;
940          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
941          count++;          count++;
942    
943  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
944          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
945          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
946  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
947          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
948          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
949  #endif  #endif
950          }          }
951    
952        if (*pt == '}')        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
953          {          {
954          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
955          ptr = pt;          ptr = pt;
# Line 690  else Line 965  else
965      c = 0;      c = 0;
966      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
967        {        {
968        int cc;                               /* Some compilers don't like ++ */        int cc;                                  /* Some compilers don't like */
969        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                           /* ++ in initializers */
970  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
971        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
972        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
973  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
974        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
975        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
976  #endif  #endif
977        }        }
978      break;      break;
979    
980      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
981      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
982        coding is ASCII-specific, but then the whole concept of \cx is
983      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
984    
985      case 'c':      case CHAR_c:
986      c = *(++ptr);      c = *(++ptr);
987      if (c == 0)      if (c == 0)
988        {        {
989        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
990        break;        break;
991        }        }
992    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
993  #ifndef EBCDIC  /* ASCII coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
994      if (c >= 'a' && c <= 'z') c -= 32;        {
995          *errorcodeptr = ERR68;
996          break;
997          }
998        if (c >= CHAR_a && c <= CHAR_z) c -= 32;
999      c ^= 0x40;      c ^= 0x40;
1000  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
1001      if (c >= 'a' && c <= 'z') c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1002      c ^= 0xC0;      c ^= 0xC0;
1003  #endif  #endif
1004      break;      break;
# Line 740  else Line 1020  else
1020      }      }
1021    }    }
1022    
1023    /* Perl supports \N{name} for character names, as well as plain \N for "not
1024    newline". PCRE does not support \N{name}. However, it does support
1025    quantification such as \N{2,3}. */
1026    
1027    if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1028         !is_counted_repeat(ptr+2))
1029      *errorcodeptr = ERR37;
1030    
1031    /* If PCRE_UCP is set, we change the values for \d etc. */
1032    
1033    if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1034      c -= (ESC_DU - ESC_D);
1035    
1036    /* Set the pointer to the final character before returning. */
1037    
1038  *ptrptr = ptr;  *ptrptr = ptr;
1039  return c;  return c;
1040  }  }
# Line 780  if (c == 0) goto ERROR_RETURN; Line 1075  if (c == 0) goto ERROR_RETURN;
1075  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1076  negation. */  negation. */
1077    
1078  if (c == '{')  if (c == CHAR_LEFT_CURLY_BRACKET)
1079    {    {
1080    if (ptr[1] == '^')    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1081      {      {
1082      *negptr = TRUE;      *negptr = TRUE;
1083      ptr++;      ptr++;
# Line 791  if (c == '{') Line 1086  if (c == '{')
1086      {      {
1087      c = *(++ptr);      c = *(++ptr);
1088      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
1089      if (c == '}') break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1090      name[i] = c;      name[i] = c;
1091      }      }
1092    if (c !='}') goto ERROR_RETURN;    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1093    name[i] = 0;    name[i] = 0;
1094    }    }
1095    
# Line 840  return -1; Line 1135  return -1;
1135    
1136    
1137  /*************************************************  /*************************************************
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
 if (*p == '}') return TRUE;  
   
 if (*p++ != ',') return FALSE;  
 if (*p == '}') return TRUE;  
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
   
 return (*p == '}');  
 }  
   
   
   
 /*************************************************  
1138  *         Read repeat counts                     *  *         Read repeat counts                     *
1139  *************************************************/  *************************************************/
1140    
# Line 900  int max = -1; Line 1162  int max = -1;
1162  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1163  an integer overflow. */  an integer overflow. */
1164    
1165  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1166  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1167    {    {
1168    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 910  if (min < 0 || min > 65535) Line 1172  if (min < 0 || min > 65535)
1172  /* Read the maximum value if there is one, and again do a paranoid check on its size.  /* Read the maximum value if there is one, and again do a paranoid check on its size.
1173  Also, max must not be less than min. */  Also, max must not be less than min. */
1174    
1175  if (*p == '}') max = min; else  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1176    {    {
1177    if (*(++p) != '}')    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1178      {      {
1179      max = 0;      max = 0;
1180      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1181      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1182        {        {
1183        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 940  return p; Line 1202  return p;
1202    
1203    
1204  /*************************************************  /*************************************************
1205  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1206  *************************************************/  *************************************************/
1207    
1208  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1209    top-level call starts at the beginning of the pattern. All other calls must
1210    start at a parenthesis. It scans along a pattern's text looking for capturing
1211  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1212  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1213  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. Recursion is used to keep
1214  references to subpatterns. We know that if (?P< is encountered, the name will  track of subpatterns that reset the capturing group numbers - the (?| feature.
1215  be terminated by '>' because that is checked in the first pass.  
1216    This function was originally called only from the second pass, in which we know
1217    that if (?< or (?' or (?P< is encountered, the name will be correctly
1218    terminated because that is checked in the first pass. There is now one call to
1219    this function in the first pass, to check for a recursive back reference by
1220    name (so that we can make the whole group atomic). In this case, we need check
1221    only up to the current position in the pattern, and that is still OK because
1222    any previous occurrences will have been checked. To make this work, the test
1223    for "end of pattern" is a check against cd->end_pattern in the main loop,
1224    instead of looking for a binary zero. This means that the special first-pass
1225    call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1226    processing items within the loop are OK, because afterwards the main loop will
1227    terminate.)
1228    
1229  Arguments:  Arguments:
1230    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1231    count        current count of capturing parens so far encountered    cd           compile background data
1232    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1233    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1234    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1235      utf8         TRUE if we are in UTF-8 mode
1236      count        pointer to the current capturing subpattern number (updated)
1237    
1238  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1239  */  */
1240    
1241  static int  static int
1242  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1243    BOOL xmode)    BOOL xmode, BOOL utf8, int *count)
1244  {  {
1245  const uschar *thisname;  uschar *ptr = *ptrptr;
1246    int start_count = *count;
1247    int hwm_count = start_count;
1248    BOOL dup_parens = FALSE;
1249    
1250  for (; *ptr != 0; ptr++)  /* If the first character is a parenthesis, check on the type of group we are
1251    dealing with. The very first call may not start with a parenthesis. */
1252    
1253    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1254    {    {
1255    int term;    /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1256    
1257      if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1258    
1259      /* Handle a normal, unnamed capturing parenthesis. */
1260    
1261      else if (ptr[1] != CHAR_QUESTION_MARK)
1262        {
1263        *count += 1;
1264        if (name == NULL && *count == lorn) return *count;
1265        ptr++;
1266        }
1267    
1268      /* All cases now have (? at the start. Remember when we are in a group
1269      where the parenthesis numbers are duplicated. */
1270    
1271      else if (ptr[2] == CHAR_VERTICAL_LINE)
1272        {
1273        ptr += 3;
1274        dup_parens = TRUE;
1275        }
1276    
1277      /* Handle comments; all characters are allowed until a ket is reached. */
1278    
1279      else if (ptr[2] == CHAR_NUMBER_SIGN)
1280        {
1281        for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1282        goto FAIL_EXIT;
1283        }
1284    
1285      /* Handle a condition. If it is an assertion, just carry on so that it
1286      is processed as normal. If not, skip to the closing parenthesis of the
1287      condition (there can't be any nested parens). */
1288    
1289      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1290        {
1291        ptr += 2;
1292        if (ptr[1] != CHAR_QUESTION_MARK)
1293          {
1294          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1295          if (*ptr != 0) ptr++;
1296          }
1297        }
1298    
1299      /* Start with (? but not a condition. */
1300    
1301      else
1302        {
1303        ptr += 2;
1304        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1305    
1306        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1307    
1308        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1309            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1310          {
1311          int term;
1312          const uschar *thisname;
1313          *count += 1;
1314          if (name == NULL && *count == lorn) return *count;
1315          term = *ptr++;
1316          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1317          thisname = ptr;
1318          while (*ptr != term) ptr++;
1319          if (name != NULL && lorn == ptr - thisname &&
1320              strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1321            return *count;
1322          term++;
1323          }
1324        }
1325      }
1326    
1327    /* Past any initial parenthesis handling, scan for parentheses or vertical
1328    bars. Stop if we get to cd->end_pattern. Note that this is important for the
1329    first-pass call when this value is temporarily adjusted to stop at the current
1330    position. So DO NOT change this to a test for binary zero. */
1331    
1332    for (; ptr < cd->end_pattern; ptr++)
1333      {
1334    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1335    
1336    if (*ptr == '\\')    if (*ptr == CHAR_BACKSLASH)
1337      {      {
1338      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1339      if (*ptr == 'Q') for (;;)      if (*ptr == CHAR_Q) for (;;)
1340        {        {
1341        while (*(++ptr) != 0 && *ptr != '\\');        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1342        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1343        if (*(++ptr) == 'E') break;        if (*(++ptr) == CHAR_E) break;
1344        }        }
1345      continue;      continue;
1346      }      }
1347    
1348    /* Skip over character classes */    /* Skip over character classes; this logic must be similar to the way they
1349      are handled for real. If the first character is '^', skip it. Also, if the
1350      first few characters (either before or after ^) are \Q\E or \E we skip them
1351      too. This makes for compatibility with Perl. Note the use of STR macros to
1352      encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1353    
1354    if (*ptr == '[')    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1355      {      {
1356      while (*(++ptr) != ']')      BOOL negate_class = FALSE;
1357        for (;;)
1358          {
1359          if (ptr[1] == CHAR_BACKSLASH)
1360            {
1361            if (ptr[2] == CHAR_E)
1362              ptr+= 2;
1363            else if (strncmp((const char *)ptr+2,
1364                     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1365              ptr += 4;
1366            else
1367              break;
1368            }
1369          else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1370            {
1371            negate_class = TRUE;
1372            ptr++;
1373            }
1374          else break;
1375          }
1376    
1377        /* If the next character is ']', it is a data character that must be
1378        skipped, except in JavaScript compatibility mode. */
1379    
1380        if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1381            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1382          ptr++;
1383    
1384        while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1385        {        {
1386        if (*ptr == 0) return -1;        if (*ptr == 0) return -1;
1387        if (*ptr == '\\')        if (*ptr == CHAR_BACKSLASH)
1388          {          {
1389          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1390          if (*ptr == 'Q') for (;;)          if (*ptr == CHAR_Q) for (;;)
1391            {            {
1392            while (*(++ptr) != 0 && *ptr != '\\');            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1393            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1394            if (*(++ptr) == 'E') break;            if (*(++ptr) == CHAR_E) break;
1395            }            }
1396          continue;          continue;
1397          }          }
# Line 1008  for (; *ptr != 0; ptr++) Line 1401  for (; *ptr != 0; ptr++)
1401    
1402    /* Skip comments in /x mode */    /* Skip comments in /x mode */
1403    
1404    if (xmode && *ptr == '#')    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1405      {      {
1406      while (*(++ptr) != 0 && *ptr != '\n');      ptr++;
1407      if (*ptr == 0) return -1;      while (*ptr != 0)
1408          {
1409          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1410          ptr++;
1411    #ifdef SUPPORT_UTF8
1412          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1413    #endif
1414          }
1415        if (*ptr == 0) goto FAIL_EXIT;
1416      continue;      continue;
1417      }      }
1418    
1419    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1420    
1421    if (*ptr != '(') continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
   if (ptr[1] != '?' && ptr[1] != '*')  
1422      {      {
1423      count++;      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1424      if (name == NULL && count == lorn) return count;      if (rc > 0) return rc;
1425      continue;      if (*ptr == 0) goto FAIL_EXIT;
1426      }      }
1427    
1428    ptr += 2;    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1429    if (*ptr == 'P') ptr++;                      /* Allow optional P */      {
1430        if (dup_parens && *count < hwm_count) *count = hwm_count;
1431        goto FAIL_EXIT;
1432        }
1433    
1434    /* We have to disambiguate (?<! and (?<= from (?<name> */    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1435        {
1436        if (*count > hwm_count) hwm_count = *count;
1437        *count = start_count;
1438        }
1439      }
1440    
1441    if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&  FAIL_EXIT:
1442         *ptr != '\'')  *ptrptr = ptr;
1443      continue;  return -1;
1444    }
1445    
   count++;  
1446    
1447    if (name == NULL && count == lorn) return count;  
1448    term = *ptr++;  
1449    if (term == '<') term = '>';  /*************************************************
1450    thisname = ptr;  *       Find forward referenced subpattern       *
1451    while (*ptr != term) ptr++;  *************************************************/
1452    if (name != NULL && lorn == ptr - thisname &&  
1453        strncmp((const char *)name, (const char *)thisname, lorn) == 0)  /* This function scans along a pattern's text looking for capturing
1454      return count;  subpatterns, and counting them. If it finds a named pattern that matches the
1455    name it is given, it returns its number. Alternatively, if the name is NULL, it
1456    returns when it reaches a given numbered subpattern. This is used for forward
1457    references to subpatterns. We used to be able to start this scan from the
1458    current compiling point, using the current count value from cd->bracount, and
1459    do it all in a single loop, but the addition of the possibility of duplicate
1460    subpattern numbers means that we have to scan from the very start, in order to
1461    take account of such duplicates, and to use a recursive function to keep track
1462    of the different types of group.
1463    
1464    Arguments:
1465      cd           compile background data
1466      name         name to seek, or NULL if seeking a numbered subpattern
1467      lorn         name length, or subpattern number if name is NULL
1468      xmode        TRUE if we are in /x mode
1469      utf8         TRUE if we are in UTF-8 mode
1470    
1471    Returns:       the number of the found subpattern, or -1 if not found
1472    */
1473    
1474    static int
1475    find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1476      BOOL utf8)
1477    {
1478    uschar *ptr = (uschar *)cd->start_pattern;
1479    int count = 0;
1480    int rc;
1481    
1482    /* If the pattern does not start with an opening parenthesis, the first call
1483    to find_parens_sub() will scan right to the end (if necessary). However, if it
1484    does start with a parenthesis, find_parens_sub() will return when it hits the
1485    matching closing parens. That is why we have to have a loop. */
1486    
1487    for (;;)
1488      {
1489      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1490      if (rc > 0 || *ptr++ == 0) break;
1491    }    }
1492    
1493  return -1;  return rc;
1494  }  }
1495    
1496    
1497    
1498    
1499  /*************************************************  /*************************************************
1500  *      Find first significant op code            *  *      Find first significant op code            *
1501  *************************************************/  *************************************************/
1502    
1503  /* This is called by several functions that scan a compiled expression looking  /* This is called by several functions that scan a compiled expression looking
1504  for a fixed first character, or an anchoring op code etc. It skips over things  for a fixed first character, or an anchoring op code etc. It skips over things
1505  that do not influence this. For some calls, a change of option is important.  that do not influence this. For some calls, it makes sense to skip negative
1506  For some calls, it makes sense to skip negative forward and all backward  forward and all backward assertions, and also the \b assertion; for others it
1507  assertions, and also the \b assertion; for others it does not.  does not.
1508    
1509  Arguments:  Arguments:
1510    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1511    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1512    
1513  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1514  */  */
1515    
1516  static const uschar*  static const uschar*
1517  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const uschar *code, BOOL skipassert)
   BOOL skipassert)  
1518  {  {
1519  for (;;)  for (;;)
1520    {    {
1521    switch ((int)*code)    switch ((int)*code)
1522      {      {
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
       *options = (int)code[1];  
     code += 2;  
     break;  
   
1523      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1524      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1525      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
# Line 1100  for (;;) Line 1535  for (;;)
1535    
1536      case OP_CALLOUT:      case OP_CALLOUT:
1537      case OP_CREF:      case OP_CREF:
1538        case OP_NCREF:
1539      case OP_RREF:      case OP_RREF:
1540        case OP_NRREF:
1541      case OP_DEF:      case OP_DEF:
1542      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1543      break;      break;
# Line 1116  for (;;) Line 1553  for (;;)
1553    
1554    
1555  /*************************************************  /*************************************************
1556  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1557  *************************************************/  *************************************************/
1558    
1559  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1560  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1561  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1562    temporarily terminated with OP_END when this function is called.
1563    
1564    This function is called when a backward assertion is encountered, so that if it
1565    fails, the error message can point to the correct place in the pattern.
1566    However, we cannot do this when the assertion contains subroutine calls,
1567    because they can be forward references. We solve this by remembering this case
1568    and doing the check at the end; a flag specifies which mode we are running in.
1569    
1570  Arguments:  Arguments:
1571    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1572    options  the compiling options    utf8     TRUE in UTF-8 mode
1573      atend    TRUE if called when the pattern is complete
1574  Returns:   the fixed length, or -1 if there is no fixed length,    cd       the "compile data" structure
1575               or -2 if \C was encountered  
1576    Returns:   the fixed length,
1577                 or -1 if there is no fixed length,
1578                 or -2 if \C was encountered (in UTF-8 mode only)
1579                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1580                 or -4 if an unknown opcode was encountered (internal error)
1581  */  */
1582    
1583  static int  static int
1584  find_fixedlength(uschar *code, int options)  find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1585  {  {
1586  int length = -1;  int length = -1;
1587    
# Line 1145  branch, check the length against that of Line 1594  branch, check the length against that of
1594  for (;;)  for (;;)
1595    {    {
1596    int d;    int d;
1597      uschar *ce, *cs;
1598    register int op = *cc;    register int op = *cc;
1599    switch (op)    switch (op)
1600      {      {
1601        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1602        OP_BRA (normal non-capturing bracket) because the other variants of these
1603        opcodes are all concerned with unlimited repeated groups, which of course
1604        are not of fixed length. */
1605    
1606      case OP_CBRA:      case OP_CBRA:
1607      case OP_BRA:      case OP_BRA:
1608      case OP_ONCE:      case OP_ONCE:
1609        case OP_ONCE_NC:
1610      case OP_COND:      case OP_COND:
1611      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1612      if (d < 0) return d;      if (d < 0) return d;
1613      branchlength += d;      branchlength += d;
1614      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1615      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1616      break;      break;
1617    
1618      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1619      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1620      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1621        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1622        because they all imply an unlimited repeat. */
1623    
1624      case OP_ALT:      case OP_ALT:
1625      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1626      case OP_END:      case OP_END:
1627        case OP_ACCEPT:
1628        case OP_ASSERT_ACCEPT:
1629      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1630        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1631      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1175  for (;;) Line 1633  for (;;)
1633      branchlength = 0;      branchlength = 0;
1634      break;      break;
1635    
1636        /* A true recursion implies not fixed length, but a subroutine call may
1637        be OK. If the subroutine is a forward reference, we can't deal with
1638        it until the end of the pattern, so return -3. */
1639    
1640        case OP_RECURSE:
1641        if (!atend) return -3;
1642        cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1643        do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1644        if (cc > cs && cc < ce) return -1;                /* Recursion */
1645        d = find_fixedlength(cs + 2, utf8, atend, cd);
1646        if (d < 0) return d;
1647        branchlength += d;
1648        cc += 1 + LINK_SIZE;
1649        break;
1650    
1651      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1652    
1653      case OP_ASSERT:      case OP_ASSERT:
# Line 1182  for (;;) Line 1655  for (;;)
1655      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1656      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1657      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1658      /* Fall through */      cc += _pcre_OP_lengths[*cc];
1659        break;
1660    
1661      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1662    
1663      case OP_REVERSE:      case OP_MARK:
1664        case OP_PRUNE_ARG:
1665        case OP_SKIP_ARG:
1666        case OP_THEN_ARG:
1667        cc += cc[1] + _pcre_OP_lengths[*cc];
1668        break;
1669    
1670        case OP_CALLOUT:
1671        case OP_CIRC:
1672        case OP_CIRCM:
1673        case OP_CLOSE:
1674        case OP_COMMIT:
1675      case OP_CREF:      case OP_CREF:
     case OP_RREF:  
1676      case OP_DEF:      case OP_DEF:
1677      case OP_OPT:      case OP_DOLL:
1678      case OP_CALLOUT:      case OP_DOLLM:
     case OP_SOD:  
     case OP_SOM:  
1679      case OP_EOD:      case OP_EOD:
1680      case OP_EODN:      case OP_EODN:
1681      case OP_CIRC:      case OP_FAIL:
1682      case OP_DOLL:      case OP_NCREF:
1683        case OP_NRREF:
1684      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1685        case OP_PRUNE:
1686        case OP_REVERSE:
1687        case OP_RREF:
1688        case OP_SET_SOM:
1689        case OP_SKIP:
1690        case OP_SOD:
1691        case OP_SOM:
1692        case OP_THEN:
1693      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1694      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
1695      break;      break;
# Line 1206  for (;;) Line 1697  for (;;)
1697      /* Handle literal characters */      /* Handle literal characters */
1698    
1699      case OP_CHAR:      case OP_CHAR:
1700      case OP_CHARNC:      case OP_CHARI:
1701      case OP_NOT:      case OP_NOT:
1702        case OP_NOTI:
1703      branchlength++;      branchlength++;
1704      cc += 2;      cc += 2;
1705  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1706      if ((options & PCRE_UTF8) != 0)      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       {  
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1707  #endif  #endif
1708      break;      break;
1709    
# Line 1222  for (;;) Line 1711  for (;;)
1711      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1712    
1713      case OP_EXACT:      case OP_EXACT:
1714        case OP_EXACTI:
1715        case OP_NOTEXACT:
1716        case OP_NOTEXACTI:
1717      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1718      cc += 4;      cc += 4;
1719  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1720      if ((options & PCRE_UTF8) != 0)      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       {  
       while((*cc & 0x80) == 0x80) cc++;  
       }  
1721  #endif  #endif
1722      break;      break;
1723    
# Line 1245  for (;;) Line 1734  for (;;)
1734      cc += 2;      cc += 2;
1735      /* Fall through */      /* Fall through */
1736    
1737        case OP_HSPACE:
1738        case OP_VSPACE:
1739        case OP_NOT_HSPACE:
1740        case OP_NOT_VSPACE:
1741      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1742      case OP_DIGIT:      case OP_DIGIT:
1743      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1252  for (;;) Line 1745  for (;;)
1745      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1746      case OP_WORDCHAR:      case OP_WORDCHAR:
1747      case OP_ANY:      case OP_ANY:
1748        case OP_ALLANY:
1749      branchlength++;      branchlength++;
1750      cc++;      cc++;
1751      break;      break;
1752    
1753      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1754        otherwise \C is coded as OP_ALLANY. */
1755    
1756      case OP_ANYBYTE:      case OP_ANYBYTE:
1757      return -2;      return -2;
# Line 1275  for (;;) Line 1770  for (;;)
1770    
1771      switch (*cc)      switch (*cc)
1772        {        {
1773          case OP_CRPLUS:
1774          case OP_CRMINPLUS:
1775        case OP_CRSTAR:        case OP_CRSTAR:
1776        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1777        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1295  for (;;) Line 1792  for (;;)
1792    
1793      /* Anything else is variable length */      /* Anything else is variable length */
1794    
1795      default:      case OP_ANYNL:
1796        case OP_BRAMINZERO:
1797        case OP_BRAPOS:
1798        case OP_BRAPOSZERO:
1799        case OP_BRAZERO:
1800        case OP_CBRAPOS:
1801        case OP_EXTUNI:
1802        case OP_KETRMAX:
1803        case OP_KETRMIN:
1804        case OP_KETRPOS:
1805        case OP_MINPLUS:
1806        case OP_MINPLUSI:
1807        case OP_MINQUERY:
1808        case OP_MINQUERYI:
1809        case OP_MINSTAR:
1810        case OP_MINSTARI:
1811        case OP_MINUPTO:
1812        case OP_MINUPTOI:
1813        case OP_NOTMINPLUS:
1814        case OP_NOTMINPLUSI:
1815        case OP_NOTMINQUERY:
1816        case OP_NOTMINQUERYI:
1817        case OP_NOTMINSTAR:
1818        case OP_NOTMINSTARI:
1819        case OP_NOTMINUPTO:
1820        case OP_NOTMINUPTOI:
1821        case OP_NOTPLUS:
1822        case OP_NOTPLUSI:
1823        case OP_NOTPOSPLUS:
1824        case OP_NOTPOSPLUSI:
1825        case OP_NOTPOSQUERY:
1826        case OP_NOTPOSQUERYI:
1827        case OP_NOTPOSSTAR:
1828        case OP_NOTPOSSTARI:
1829        case OP_NOTPOSUPTO:
1830        case OP_NOTPOSUPTOI:
1831        case OP_NOTQUERY:
1832        case OP_NOTQUERYI:
1833        case OP_NOTSTAR:
1834        case OP_NOTSTARI:
1835        case OP_NOTUPTO:
1836        case OP_NOTUPTOI:
1837        case OP_PLUS:
1838        case OP_PLUSI:
1839        case OP_POSPLUS:
1840        case OP_POSPLUSI:
1841        case OP_POSQUERY:
1842        case OP_POSQUERYI:
1843        case OP_POSSTAR:
1844        case OP_POSSTARI:
1845        case OP_POSUPTO:
1846        case OP_POSUPTOI:
1847        case OP_QUERY:
1848        case OP_QUERYI:
1849        case OP_REF:
1850        case OP_REFI:
1851        case OP_SBRA:
1852        case OP_SBRAPOS:
1853        case OP_SCBRA:
1854        case OP_SCBRAPOS:
1855        case OP_SCOND:
1856        case OP_SKIPZERO:
1857        case OP_STAR:
1858        case OP_STARI:
1859        case OP_TYPEMINPLUS:
1860        case OP_TYPEMINQUERY:
1861        case OP_TYPEMINSTAR:
1862        case OP_TYPEMINUPTO:
1863        case OP_TYPEPLUS:
1864        case OP_TYPEPOSPLUS:
1865        case OP_TYPEPOSQUERY:
1866        case OP_TYPEPOSSTAR:
1867        case OP_TYPEPOSUPTO:
1868        case OP_TYPEQUERY:
1869        case OP_TYPESTAR:
1870        case OP_TYPEUPTO:
1871        case OP_UPTO:
1872        case OP_UPTOI:
1873      return -1;      return -1;
1874    
1875        /* Catch unrecognized opcodes so that when new ones are added they
1876        are not forgotten, as has happened in the past. */
1877    
1878        default:
1879        return -4;
1880      }      }
1881    }    }
1882  /* Control never gets here */  /* Control never gets here */
# Line 1306  for (;;) Line 1886  for (;;)
1886    
1887    
1888  /*************************************************  /*************************************************
1889  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1890  *************************************************/  *************************************************/
1891    
1892  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1893  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1894    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1895    so that it can be called from pcre_study() when finding the minimum matching
1896    length.
1897    
1898  Arguments:  Arguments:
1899    code        points to start of expression    code        points to start of expression
1900    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1901    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1902    
1903  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1904  */  */
1905    
1906  static const uschar *  const uschar *
1907  find_bracket(const uschar *code, BOOL utf8, int number)  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1908  {  {
1909  for (;;)  for (;;)
1910    {    {
1911    register int c = *code;    register int c = *code;
1912    
1913    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1914    
1915    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1334  for (;;) Line 1918  for (;;)
1918    
1919    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1920    
1921      /* Handle lookbehind */
1922    
1923      else if (c == OP_REVERSE)
1924        {
1925        if (number < 0) return (uschar *)code;
1926        code += _pcre_OP_lengths[c];
1927        }
1928    
1929    /* Handle capturing bracket */    /* Handle capturing bracket */
1930    
1931    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
1932               c == OP_CBRAPOS || c == OP_SCBRAPOS)
1933      {      {
1934      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
1935      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
# Line 1345  for (;;) Line 1938  for (;;)
1938    
1939    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
1940    repeated character types, we have to test for \p and \P, which have an extra    repeated character types, we have to test for \p and \P, which have an extra
1941    two bytes of parameters. */    two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1942      must add in its length. */
1943    
1944    else    else
1945      {      {
# Line 1369  for (;;) Line 1963  for (;;)
1963        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1964        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1965        break;        break;
1966    
1967          case OP_MARK:
1968          case OP_PRUNE_ARG:
1969          case OP_SKIP_ARG:
1970          code += code[1];
1971          break;
1972    
1973          case OP_THEN_ARG:
1974          code += code[1];
1975          break;
1976        }        }
1977    
1978      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
# Line 1383  for (;;) Line 1987  for (;;)
1987      if (utf8) switch(c)      if (utf8) switch(c)
1988        {        {
1989        case OP_CHAR:        case OP_CHAR:
1990        case OP_CHARNC:        case OP_CHARI:
1991        case OP_EXACT:        case OP_EXACT:
1992          case OP_EXACTI:
1993        case OP_UPTO:        case OP_UPTO:
1994          case OP_UPTOI:
1995        case OP_MINUPTO:        case OP_MINUPTO:
1996          case OP_MINUPTOI:
1997        case OP_POSUPTO:        case OP_POSUPTO:
1998          case OP_POSUPTOI:
1999        case OP_STAR:        case OP_STAR:
2000          case OP_STARI:
2001        case OP_MINSTAR:        case OP_MINSTAR:
2002          case OP_MINSTARI:
2003        case OP_POSSTAR:        case OP_POSSTAR:
2004          case OP_POSSTARI:
2005        case OP_PLUS:        case OP_PLUS:
2006          case OP_PLUSI:
2007        case OP_MINPLUS:        case OP_MINPLUS:
2008          case OP_MINPLUSI:
2009        case OP_POSPLUS:        case OP_POSPLUS:
2010          case OP_POSPLUSI:
2011        case OP_QUERY:        case OP_QUERY:
2012          case OP_QUERYI:
2013        case OP_MINQUERY:        case OP_MINQUERY:
2014          case OP_MINQUERYI:
2015        case OP_POSQUERY:        case OP_POSQUERY:
2016          case OP_POSQUERYI:
2017        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2018        break;        break;
2019        }        }
2020    #else
2021        (void)(utf8);  /* Keep compiler happy by referencing function argument */
2022  #endif  #endif
2023      }      }
2024    }    }
# Line 1438  for (;;) Line 2057  for (;;)
2057    
2058    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
2059    repeated character types, we have to test for \p and \P, which have an extra    repeated character types, we have to test for \p and \P, which have an extra
2060    two bytes of parameters. */    two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2061      must add in its length. */
2062    
2063    else    else
2064      {      {
# Line 1462  for (;;) Line 2082  for (;;)
2082        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2083        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2084        break;        break;
2085    
2086          case OP_MARK:
2087          case OP_PRUNE_ARG:
2088          case OP_SKIP_ARG:
2089          code += code[1];
2090          break;
2091    
2092          case OP_THEN_ARG:
2093          code += code[1];
2094          break;
2095        }        }
2096    
2097      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
# Line 1476  for (;;) Line 2106  for (;;)
2106      if (utf8) switch(c)      if (utf8) switch(c)
2107        {        {
2108        case OP_CHAR:        case OP_CHAR:
2109        case OP_CHARNC:        case OP_CHARI:
2110        case OP_EXACT:        case OP_EXACT:
2111          case OP_EXACTI:
2112        case OP_UPTO:        case OP_UPTO:
2113          case OP_UPTOI:
2114        case OP_MINUPTO:        case OP_MINUPTO:
2115          case OP_MINUPTOI:
2116        case OP_POSUPTO:        case OP_POSUPTO:
2117          case OP_POSUPTOI:
2118        case OP_STAR:        case OP_STAR:
2119          case OP_STARI:
2120        case OP_MINSTAR:        case OP_MINSTAR:
2121          case OP_MINSTARI:
2122        case OP_POSSTAR:        case OP_POSSTAR:
2123          case OP_POSSTARI:
2124        case OP_PLUS:        case OP_PLUS:
2125          case OP_PLUSI:
2126        case OP_MINPLUS:        case OP_MINPLUS:
2127          case OP_MINPLUSI:
2128        case OP_POSPLUS:        case OP_POSPLUS:
2129          case OP_POSPLUSI:
2130        case OP_QUERY:        case OP_QUERY:
2131          case OP_QUERYI:
2132        case OP_MINQUERY:        case OP_MINQUERY:
2133          case OP_MINQUERYI:
2134        case OP_POSQUERY:        case OP_POSQUERY:
2135          case OP_POSQUERYI:
2136        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2137        break;        break;
2138        }        }
2139    #else
2140        (void)(utf8);  /* Keep compiler happy by referencing function argument */
2141  #endif  #endif
2142      }      }
2143    }    }
# Line 1508  for (;;) Line 2153  for (;;)
2153  can match the empty string or not. It is called from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
2154  below and from compile_branch() when checking for an unlimited repeat of a  below and from compile_branch() when checking for an unlimited repeat of a
2155  group that can match nothing. Note that first_significant_code() skips over  group that can match nothing. Note that first_significant_code() skips over
2156  assertions. If we hit an unclosed bracket, we return "empty" - this means we've  backward and negative forward assertions when its final argument is TRUE. If we
2157  struck an inner bracket whose current branch will already have been scanned.  hit an unclosed bracket, we return "empty" - this means we've struck an inner
2158    bracket whose current branch will already have been scanned.
2159    
2160  Arguments:  Arguments:
2161    code        points to start of search    code        points to start of search
2162    endcode     points to where to stop    endcode     points to where to stop
2163    utf8        TRUE if in UTF8 mode    utf8        TRUE if in UTF8 mode
2164      cd          contains pointers to tables etc.
2165    
2166  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2167  */  */
2168    
2169  static BOOL  static BOOL
2170  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
2171      compile_data *cd)
2172  {  {
2173  register int c;  register int c;
2174  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
2175       code < endcode;       code < endcode;
2176       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
2177    {    {
2178    const uschar *ccode;    const uschar *ccode;
2179    
2180    c = *code;    c = *code;
2181    
2182      /* Skip over forward assertions; the other assertions are skipped by
2183      first_significant_code() with a TRUE final argument. */
2184    
2185      if (c == OP_ASSERT)
2186        {
2187        do code += GET(code, 1); while (*code == OP_ALT);
2188        c = *code;
2189        continue;
2190        }
2191    
2192      /* For a recursion/subroutine call, if its end has been reached, which
2193      implies a backward reference subroutine call, we can scan it. If it's a
2194      forward reference subroutine call, we can't. To detect forward reference
2195      we have to scan up the list that is kept in the workspace. This function is
2196      called only when doing the real compile, not during the pre-compile that
2197      measures the size of the compiled pattern. */
2198    
2199      if (c == OP_RECURSE)
2200        {
2201        const uschar *scode;
2202        BOOL empty_branch;
2203    
2204        /* Test for forward reference */
2205    
2206        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2207          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2208    
2209        /* Not a forward reference, test for completed backward reference */
2210    
2211        empty_branch = FALSE;
2212        scode = cd->start_code + GET(code, 1);
2213        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2214    
2215        /* Completed backwards reference */
2216    
2217        do
2218          {
2219          if (could_be_empty_branch(scode, endcode, utf8, cd))
2220            {
2221            empty_branch = TRUE;
2222            break;
2223            }
2224          scode += GET(scode, 1);
2225          }
2226        while (*scode == OP_ALT);
2227    
2228        if (!empty_branch) return FALSE;  /* All branches are non-empty */
2229        continue;
2230        }
2231    
2232    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
2233    
2234    if (c == OP_BRAZERO || c == OP_BRAMINZERO)    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2235          c == OP_BRAPOSZERO)
2236      {      {
2237      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
2238      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
# Line 1541  for (code = first_significant_code(code Line 2240  for (code = first_significant_code(code
2240      continue;      continue;
2241      }      }
2242    
2243      /* A nested group that is already marked as "could be empty" can just be
2244      skipped. */
2245    
2246      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2247          c == OP_SCBRA || c == OP_SCBRAPOS)
2248        {
2249        do code += GET(code, 1); while (*code == OP_ALT);
2250        c = *code;
2251        continue;
2252        }
2253    
2254    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2255    
2256    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2257          c == OP_CBRA || c == OP_CBRAPOS ||
2258          c == OP_ONCE || c == OP_ONCE_NC ||
2259          c == OP_COND)
2260      {      {
2261      BOOL empty_branch;      BOOL empty_branch;
2262      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2263    
2264      /* Scan a closed bracket */      /* If a conditional group has only one branch, there is a second, implied,
2265        empty branch, so just skip over the conditional, because it could be empty.
2266        Otherwise, scan the individual branches of the group. */
2267    
2268      empty_branch = FALSE;      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
     do  
       {  
       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))  
         empty_branch = TRUE;  
2269        code += GET(code, 1);        code += GET(code, 1);
2270        else
2271          {
2272          empty_branch = FALSE;
2273          do
2274            {
2275            if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2276              empty_branch = TRUE;
2277            code += GET(code, 1);
2278            }
2279          while (*code == OP_ALT);
2280          if (!empty_branch) return FALSE;   /* All branches are non-empty */
2281        }        }
2282      while (*code == OP_ALT);  
     if (!empty_branch) return FALSE;   /* All branches are non-empty */  
2283      c = *code;      c = *code;
2284      continue;      continue;
2285      }      }
# Line 1619  for (code = first_significant_code(code Line 2340  for (code = first_significant_code(code
2340      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2341      case OP_WORDCHAR:      case OP_WORDCHAR:
2342      case OP_ANY:      case OP_ANY:
2343        case OP_ALLANY:
2344      case OP_ANYBYTE:      case OP_ANYBYTE:
2345      case OP_CHAR:      case OP_CHAR:
2346      case OP_CHARNC:      case OP_CHARI:
2347      case OP_NOT:      case OP_NOT:
2348        case OP_NOTI:
2349      case OP_PLUS:      case OP_PLUS:
2350      case OP_MINPLUS:      case OP_MINPLUS:
2351      case OP_POSPLUS:      case OP_POSPLUS:
# Line 1662  for (code = first_significant_code(code Line 2385  for (code = first_significant_code(code
2385      case OP_KET:      case OP_KET:
2386      case OP_KETRMAX:      case OP_KETRMAX:
2387      case OP_KETRMIN:      case OP_KETRMIN:
2388        case OP_KETRPOS:
2389      case OP_ALT:      case OP_ALT:
2390      return TRUE;      return TRUE;
2391    
# Line 1670  for (code = first_significant_code(code Line 2394  for (code = first_significant_code(code
2394    
2395  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2396      case OP_STAR:      case OP_STAR:
2397        case OP_STARI:
2398      case OP_MINSTAR:      case OP_MINSTAR:
2399        case OP_MINSTARI:
2400      case OP_POSSTAR:      case OP_POSSTAR:
2401        case OP_POSSTARI:
2402      case OP_QUERY:      case OP_QUERY:
2403        case OP_QUERYI:
2404      case OP_MINQUERY:      case OP_MINQUERY:
2405        case OP_MINQUERYI:
2406      case OP_POSQUERY:      case OP_POSQUERY:
2407        case OP_POSQUERYI:
2408        if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2409        break;
2410    
2411      case OP_UPTO:      case OP_UPTO:
2412        case OP_UPTOI:
2413      case OP_MINUPTO:      case OP_MINUPTO:
2414        case OP_MINUPTOI:
2415      case OP_POSUPTO:      case OP_POSUPTO:
2416      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      case OP_POSUPTOI:
2417        if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2418      break;      break;
2419  #endif  #endif
2420    
2421        /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2422        string. */
2423    
2424        case OP_MARK:
2425        case OP_PRUNE_ARG:
2426        case OP_SKIP_ARG:
2427        code += code[1];
2428        break;
2429    
2430        case OP_THEN_ARG:
2431        code += code[1];
2432        break;
2433    
2434        /* None of the remaining opcodes are required to match a character. */
2435    
2436        default:
2437        break;
2438      }      }
2439    }    }
2440    
# Line 1697  return TRUE; Line 2451  return TRUE;
2451  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2452  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2453  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2454    This function is called only during the real compile, not during the
2455    pre-compile.
2456    
2457  Arguments:  Arguments:
2458    code        points to start of the recursion    code        points to start of the recursion
2459    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2460    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2461    utf8        TRUE if in UTF-8 mode    utf8        TRUE if in UTF-8 mode
2462      cd          pointers to tables etc
2463    
2464  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2465  */  */
2466    
2467  static BOOL  static BOOL
2468  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2469    BOOL utf8)    BOOL utf8, compile_data *cd)
2470  {  {
2471  while (bcptr != NULL && bcptr->current >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2472    {    {
2473    if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2474        return FALSE;
2475    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2476    }    }
2477  return TRUE;  return TRUE;
# Line 1726  return TRUE; Line 2484  return TRUE;
2484  *************************************************/  *************************************************/
2485    
2486  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
2487  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
2488  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2489  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
2490    
2491    Originally, this function only recognized a sequence of letters between the
2492    terminators, but it seems that Perl recognizes any sequence of characters,
2493    though of course unknown POSIX names are subsequently rejected. Perl gives an
2494    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2495    didn't consider this to be a POSIX class. Likewise for [:1234:].
2496    
2497    The problem in trying to be exactly like Perl is in the handling of escapes. We
2498    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2499    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2500    below handles the special case of \], but does not try to do any other escape
2501    processing. This makes it different from Perl for cases such as [:l\ower:]
2502    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2503    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2504    I think.
2505    
2506    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2507    It seems that the appearance of a nested POSIX class supersedes an apparent
2508    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2509    a digit.
2510    
2511    In Perl, unescaped square brackets may also appear as part of class names. For
2512    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2513    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2514    seem right at all. PCRE does not allow closing square brackets in POSIX class
2515    names.
2516    
2517  Argument:  Arguments:
2518    ptr      pointer to the initial [    ptr      pointer to the initial [
2519    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
2520    
2521  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
2522  */  */
2523    
2524  static BOOL  static BOOL
2525  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
2526  {  {
2527  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2528  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2529  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
2530    {    {
2531    *endptr = ptr;    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2532    return TRUE;      ptr++;
2533      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2534      else
2535        {
2536        if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2537          {
2538          *endptr = ptr;
2539          return TRUE;
2540          }
2541        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2542             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2543              ptr[1] == CHAR_EQUALS_SIGN) &&
2544            check_posix_syntax(ptr, endptr))
2545          return FALSE;
2546        }
2547    }    }
2548  return FALSE;  return FALSE;
2549  }  }
# Line 1794  return -1; Line 2589  return -1;
2589  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
2590  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
2591  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
2592  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2593  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
2594  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
2595  offsets adjusted. That is one of the jobs of this function. Before it is called,  have their offsets adjusted. That is one of the jobs of this function. Before it
2596  the partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
2597    OP_END.
2598    
2599  This function has been extended with the possibility of forward references for  This function has been extended with the possibility of forward references for
2600  recursions and subroutine calls. It must also check the list of such references  recursions and subroutine calls. It must also check the list of such references
# Line 1875  auto_callout(uschar *code, const uschar Line 2671  auto_callout(uschar *code, const uschar
2671  {  {
2672  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2673  *code++ = 255;  *code++ = 255;
2674  PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2675  PUT(code, LINK_SIZE, 0);                /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2676  return code + 2*LINK_SIZE;  return code + 2*LINK_SIZE;
2677  }  }
2678    
# Line 1901  Returns:             nothing Line 2697  Returns:             nothing
2697  static void  static void
2698  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2699  {  {
2700  int length = ptr - cd->start_pattern - GET(previous_callout, 2);  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2701  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
2702  }  }
2703    
# Line 1933  get_othercase_range(unsigned int *cptr, Line 2729  get_othercase_range(unsigned int *cptr,
2729  unsigned int c, othercase, next;  unsigned int c, othercase, next;
2730    
2731  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2732    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2733    
2734  if (c > d) return FALSE;  if (c > d) return FALSE;
2735    
# Line 1942  next = othercase + 1; Line 2738  next = othercase + 1;
2738    
2739  for (++c; c <= d; c++)  for (++c; c <= d; c++)
2740    {    {
2741    if (_pcre_ucp_othercase(c) != next) break;    if (UCD_OTHERCASE(c) != next) break;
2742    next++;    next++;
2743    }    }
2744    
# Line 1951  for (++c; c <= d; c++) Line 2747  for (++c; c <= d; c++)
2747    
2748  return TRUE;  return TRUE;
2749  }  }
2750    
2751    
2752    
2753    /*************************************************
2754    *        Check a character and a property        *
2755    *************************************************/
2756    
2757    /* This function is called by check_auto_possessive() when a property item
2758    is adjacent to a fixed character.
2759    
2760    Arguments:
2761      c            the character
2762      ptype        the property type
2763      pdata        the data for the type
2764      negated      TRUE if it's a negated property (\P or \p{^)
2765    
2766    Returns:       TRUE if auto-possessifying is OK
2767    */
2768    
2769    static BOOL
2770    check_char_prop(int c, int ptype, int pdata, BOOL negated)
2771    {
2772    const ucd_record *prop = GET_UCD(c);
2773    switch(ptype)
2774      {
2775      case PT_LAMP:
2776      return (prop->chartype == ucp_Lu ||
2777              prop->chartype == ucp_Ll ||
2778              prop->chartype == ucp_Lt) == negated;
2779    
2780      case PT_GC:
2781      return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2782    
2783      case PT_PC:
2784      return (pdata == prop->chartype) == negated;
2785    
2786      case PT_SC:
2787      return (pdata == prop->script) == negated;
2788    
2789      /* These are specials */
2790    
2791      case PT_ALNUM:
2792      return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2793              _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2794    
2795      case PT_SPACE:    /* Perl space */
2796      return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2797              c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2798              == negated;
2799    
2800      case PT_PXSPACE:  /* POSIX space */
2801      return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2802              c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2803              c == CHAR_FF || c == CHAR_CR)
2804              == negated;
2805    
2806      case PT_WORD:
2807      return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2808              _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2809              c == CHAR_UNDERSCORE) == negated;
2810      }
2811    return FALSE;
2812    }
2813  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2814    
2815    
# Line 1964  whether the next thing could possibly ma Line 2823  whether the next thing could possibly ma
2823  sense to automatically possessify the repeated item.  sense to automatically possessify the repeated item.
2824    
2825  Arguments:  Arguments:
2826    op_code       the repeated op code    previous      pointer to the repeated opcode
   this          data for this item, depends on the opcode  
2827    utf8          TRUE in UTF-8 mode    utf8          TRUE in UTF-8 mode
   utf8_char     used for utf8 character bytes, NULL if not relevant  
2828    ptr           next character in pattern    ptr           next character in pattern
2829    options       options bits    options       options bits
2830    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 1976  Returns:        TRUE if possessifying is Line 2833  Returns:        TRUE if possessifying is
2833  */  */
2834    
2835  static BOOL  static BOOL
2836  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2837    const uschar *ptr, int options, compile_data *cd)    int options, compile_data *cd)
2838  {  {
2839  int next;  int c, next;
2840    int op_code = *previous++;
2841    
2842  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
2843    
# Line 1988  if ((options & PCRE_EXTENDED) != 0) Line 2846  if ((options & PCRE_EXTENDED) != 0)
2846    for (;;)    for (;;)
2847      {      {
2848      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2849      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
2850        {        {
2851        while (*(++ptr) != 0)        ptr++;
2852          while (*ptr != 0)
2853            {
2854          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2855            ptr++;
2856    #ifdef SUPPORT_UTF8
2857            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2858    #endif
2859            }
2860        }        }
2861      else break;      else break;
2862      }      }
# Line 2000  if ((options & PCRE_EXTENDED) != 0) Line 2865  if ((options & PCRE_EXTENDED) != 0)
2865  /* If the next item is one that we can handle, get its value. A non-negative  /* If the next item is one that we can handle, get its value. A non-negative
2866  value is a character, a negative value is an escape value. */  value is a character, a negative value is an escape value. */
2867    
2868  if (*ptr == '\\')  if (*ptr == CHAR_BACKSLASH)
2869    {    {
2870    int temperrorcode = 0;    int temperrorcode = 0;
2871    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
# Line 2025  if ((options & PCRE_EXTENDED) != 0) Line 2890  if ((options & PCRE_EXTENDED) != 0)
2890    for (;;)    for (;;)
2891      {      {
2892      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2893      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
2894        {        {
2895        while (*(++ptr) != 0)        ptr++;
2896          while (*ptr != 0)
2897            {
2898          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2899            ptr++;
2900    #ifdef SUPPORT_UTF8
2901            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2902    #endif
2903            }
2904        }        }
2905      else break;      else break;
2906      }      }
# Line 2036  if ((options & PCRE_EXTENDED) != 0) Line 2908  if ((options & PCRE_EXTENDED) != 0)
2908    
2909  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
2910    
2911  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2912    return FALSE;    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2913        return FALSE;
 /* Now compare the next item with the previous opcode. If the previous is a  
 positive single character match, "item" either contains the character or, if  
 "item" is greater than 127 in utf8 mode, the character's bytes are in  
 utf8_char. */  
   
2914    
2915  /* Handle cases when the next item is a character. */  /* Now compare the next item with the previous opcode. First, handle cases when
2916    the next item is a character. */
2917    
2918  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
2919    {    {
2920    case OP_CHAR:    case OP_CHAR:
2921  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2922    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
2923    #else
2924      c = *previous;
2925  #endif  #endif
2926    return item != next;    return c != next;
2927    
2928    /* For CHARNC (caseless character) we must check the other case. If we have    /* For CHARI (caseless character) we must check the other case. If we have
2929    Unicode property support, we can use it to test the other case of    Unicode property support, we can use it to test the other case of
2930    high-valued characters. */    high-valued characters. */
2931    
2932    case OP_CHARNC:    case OP_CHARI:
2933  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2934    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
2935    #else
2936      c = *previous;
2937  #endif  #endif
2938    if (item == next) return FALSE;    if (c == next) return FALSE;
2939  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2940    if (utf8)    if (utf8)
2941      {      {
2942      unsigned int othercase;      unsigned int othercase;
2943      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
2944  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2945      othercase = _pcre_ucp_othercase((unsigned int)next);      othercase = UCD_OTHERCASE((unsigned int)next);
2946  #else  #else
2947      othercase = NOTACHAR;      othercase = NOTACHAR;
2948  #endif  #endif
2949      return (unsigned int)item != othercase;      return (unsigned int)c != othercase;
2950      }      }
2951    else    else
2952  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
2953    return (item != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
2954    
2955    /* For OP_NOT, "item" must be a single-byte character. */    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2956      opcodes are not used for multi-byte characters, because they are coded using
2957      an XCLASS instead. */
2958    
2959    case OP_NOT:    case OP_NOT:
2960    if (next < 0) return FALSE;  /* Not a character */    return (c = *previous) == next;
2961    if (item == next) return TRUE;  
2962    if ((options & PCRE_CASELESS) == 0) return FALSE;    case OP_NOTI:
2963      if ((c = *previous) == next) return TRUE;
2964  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2965    if (utf8)    if (utf8)
2966      {      {
2967      unsigned int othercase;      unsigned int othercase;
2968      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
2969  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2970      othercase = _pcre_ucp_othercase(next);      othercase = UCD_OTHERCASE(next);
2971  #else  #else
2972      othercase = NOTACHAR;      othercase = NOTACHAR;
2973  #endif  #endif
2974      return (unsigned int)item == othercase;      return (unsigned int)c == othercase;
2975      }      }
2976    else    else
2977  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
2978    return (item == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
2979    
2980      /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2981      When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2982    
2983    case OP_DIGIT:    case OP_DIGIT:
2984    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
# Line 2143  if (next >= 0) switch(op_code) Line 3021  if (next >= 0) switch(op_code)
3021      case 0x202f:      case 0x202f:
3022      case 0x205f:      case 0x205f:
3023      case 0x3000:      case 0x3000:
3024      return op_code != OP_HSPACE;      return op_code == OP_NOT_HSPACE;
3025      default:      default:
3026      return op_code == OP_HSPACE;      return op_code != OP_NOT_HSPACE;
3027      }      }
3028    
3029      case OP_ANYNL:
3030    case OP_VSPACE:    case OP_VSPACE:
3031    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3032    switch(next)    switch(next)
# Line 2159  if (next >= 0) switch(op_code) Line 3038  if (next >= 0) switch(op_code)
3038      case 0x85:      case 0x85:
3039      case 0x2028:      case 0x2028:
3040      case 0x2029:      case 0x2029:
3041      return op_code != OP_VSPACE;      return op_code == OP_NOT_VSPACE;
3042      default:      default:
3043      return op_code == OP_VSPACE;      return op_code != OP_NOT_VSPACE;
3044      }      }
3045    
3046    #ifdef SUPPORT_UCP
3047      case OP_PROP:
3048      return check_char_prop(next, previous[0], previous[1], FALSE);
3049    
3050      case OP_NOTPROP:
3051      return check_char_prop(next, previous[0], previous[1], TRUE);
3052    #endif
3053    
3054    default:    default:
3055    return FALSE;    return FALSE;
3056    }    }
3057    
3058    
3059  /* Handle the case when the next item is \d, \s, etc. */  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3060    is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3061    generated only when PCRE_UCP is *not* set, that is, when only ASCII
3062    characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3063    replaced by OP_PROP codes when PCRE_UCP is set. */
3064    
3065  switch(op_code)  switch(op_code)
3066    {    {
3067    case OP_CHAR:    case OP_CHAR:
3068    case OP_CHARNC:    case OP_CHARI:
3069  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3070    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
3071    #else
3072      c = *previous;
3073  #endif  #endif
3074    switch(-next)    switch(-next)
3075      {      {
3076      case ESC_d:      case ESC_d:
3077      return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;      return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3078    
3079      case ESC_D:      case ESC_D:
3080      return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3081    
3082      case ESC_s:      case ESC_s:
3083      return item > 127 || (cd->ctypes[item] & ctype_space) == 0;      return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3084    
3085      case ESC_S:      case ESC_S:
3086      return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3087    
3088      case ESC_w:      case ESC_w:
3089      return item > 127 || (cd->ctypes[item] & ctype_word) == 0;      return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3090    
3091      case ESC_W:      case ESC_W:
3092      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3093    
3094      case ESC_h:      case ESC_h:
3095      case ESC_H:      case ESC_H:
3096      switch(item)      switch(c)
3097        {        {
3098        case 0x09:        case 0x09:
3099        case 0x20:        case 0x20:
# Line 2228  switch(op_code) Line 3121  switch(op_code)
3121    
3122      case ESC_v:      case ESC_v:
3123      case ESC_V:      case ESC_V:
3124      switch(item)      switch(c)
3125        {        {
3126        case 0x0a:        case 0x0a:
3127        case 0x0b:        case 0x0b:
# Line 2242  switch(op_code) Line 3135  switch(op_code)
3135        return -next == ESC_v;        return -next == ESC_v;
3136        }        }
3137    
3138        /* When PCRE_UCP is set, these values get generated for \d etc. Find
3139        their substitutions and process them. The result will always be either
3140        -ESC_p or -ESC_P. Then fall through to process those values. */
3141    
3142    #ifdef SUPPORT_UCP
3143        case ESC_du:
3144        case ESC_DU:
3145        case ESC_wu:
3146        case ESC_WU:
3147        case ESC_su:
3148        case ESC_SU:
3149          {
3150          int temperrorcode = 0;
3151          ptr = substitutes[-next - ESC_DU];
3152          next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3153          if (temperrorcode != 0) return FALSE;
3154          ptr++;    /* For compatibility */
3155          }
3156        /* Fall through */
3157    
3158        case ESC_p:
3159        case ESC_P:
3160          {
3161          int ptype, pdata, errorcodeptr;
3162          BOOL negated;
3163    
3164          ptr--;      /* Make ptr point at the p or P */
3165          ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3166          if (ptype < 0) return FALSE;
3167          ptr++;      /* Point past the final curly ket */
3168    
3169          /* If the property item is optional, we have to give up. (When generated
3170          from \d etc by PCRE_UCP, this test will have been applied much earlier,
3171          to the original \d etc. At this point, ptr will point to a zero byte.) */
3172    
3173          if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3174            strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3175              return FALSE;
3176    
3177          /* Do the property check. */
3178    
3179          return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3180          }
3181    #endif
3182    
3183      default:      default:
3184      return FALSE;      return FALSE;
3185      }      }
3186    
3187      /* In principle, support for Unicode properties should be integrated here as
3188      well. It means re-organizing the above code so as to get hold of the property
3189      values before switching on the op-code. However, I wonder how many patterns
3190      combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3191      these op-codes are never generated.) */
3192    
3193    case OP_DIGIT:    case OP_DIGIT:
3194    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3195           next == -ESC_h || next == -ESC_v;           next == -ESC_h || next == -ESC_v || next == -ESC_R;
3196    
3197    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3198    return next == -ESC_d;    return next == -ESC_d;
3199    
3200    case OP_WHITESPACE:    case OP_WHITESPACE:
3201    return next == -ESC_S || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3202    
3203    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3204    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3205    
3206    case OP_HSPACE:    case OP_HSPACE:
3207    return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3208             next == -ESC_w || next == -ESC_v || next == -ESC_R;
3209    
3210    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
3211    return next == -ESC_h;    return next == -ESC_h;
3212    
3213    /* Can't have \S in here because VT matches \S (Perl anomaly) */    /* Can't have \S in here because VT matches \S (Perl anomaly) */
3214      case OP_ANYNL:
3215    case OP_VSPACE:    case OP_VSPACE:
3216    return next == -ESC_V || next == -ESC_d || next == -ESC_w;    return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3217    
3218    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3219    return next == -ESC_v;    return next == -ESC_v || next == -ESC_R;
3220    
3221    case OP_WORDCHAR:    case OP_WORDCHAR:
3222    return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3223             next == -ESC_v || next == -ESC_R;
3224    
3225    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3226    return next == -ESC_w || next == -ESC_d;    return next == -ESC_w || next == -ESC_d;
# Line 2305  Arguments: Line 3252  Arguments:
3252    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3253    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
3254    bcptr          points to current branch chain    bcptr          points to current branch chain
3255      cond_depth     conditional nesting depth
3256    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3257    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3258                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 2316  Returns:         TRUE on success Line 3264  Returns:         TRUE on success
3264  static BOOL  static BOOL
3265  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
3266    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
3267    compile_data *cd, int *lengthptr)    int cond_depth, compile_data *cd, int *lengthptr)
3268  {  {
3269  int repeat_type, op_type;  int repeat_type, op_type;
3270  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 2325  int greedy_default, greedy_non_default; Line 3273  int greedy_default, greedy_non_default;
3273  int firstbyte, reqbyte;  int firstbyte, reqbyte;
3274  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
3275  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
3276  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3277  int after_manual_callout = 0;  int after_manual_callout = 0;
3278  int length_prevgroup = 0;  int length_prevgroup = 0;
3279  register int c;  register int c;
# Line 2337  BOOL inescq = FALSE; Line 3285  BOOL inescq = FALSE;
3285  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
3286  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
3287  const uschar *tempptr;  const uschar *tempptr;
3288    const uschar *nestptr = NULL;
3289  uschar *previous = NULL;  uschar *previous = NULL;
3290  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
3291  uschar *save_hwm = NULL;  uschar *save_hwm = NULL;
3292  uschar classbits[32];  uschar classbits[32];
3293    
3294    /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3295    must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3296    dynamically as we process the pattern. */
3297    
3298  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3299  BOOL class_utf8;  BOOL class_utf8;
3300  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
3301  uschar *class_utf8data;  uschar *class_utf8data;
3302    uschar *class_utf8data_base;
3303  uschar utf8_char[6];  uschar utf8_char[6];
3304  #else  #else
3305  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
 uschar *utf8_char = NULL;  
3306  #endif  #endif
3307    
3308  #ifdef DEBUG  #ifdef PCRE_DEBUG
3309  if (lengthptr != NULL) DPRINTF((">> start branch\n"));  if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3310  #endif  #endif
3311    
# Line 2385  req_caseopt = ((options & PCRE_CASELESS) Line 3338  req_caseopt = ((options & PCRE_CASELESS)
3338  for (;; ptr++)  for (;; ptr++)
3339    {    {
3340    BOOL negate_class;    BOOL negate_class;
3341    BOOL should_flip_negation;    BOOL should_flip_negation;
3342    BOOL possessive_quantifier;    BOOL possessive_quantifier;
3343    BOOL is_quantifier;    BOOL is_quantifier;
3344    BOOL is_recurse;    BOOL is_recurse;
# Line 2400  for (;; ptr++) Line 3353  for (;; ptr++)
3353    int subfirstbyte;    int subfirstbyte;
3354    int terminator;    int terminator;
3355    int mclength;    int mclength;
3356      int tempbracount;
3357    uschar mcbuffer[8];    uschar mcbuffer[8];
3358    
3359    /* Get next byte in the pattern */    /* Get next byte in the pattern */
3360    
3361    c = *ptr;    c = *ptr;
3362    
3363      /* If we are at the end of a nested substitution, revert to the outer level
3364      string. Nesting only happens one level deep. */
3365    
3366      if (c == 0 && nestptr != NULL)
3367        {
3368        ptr = nestptr;
3369        nestptr = NULL;
3370        c = *ptr;
3371        }
3372    
3373    /* If we are in the pre-compile phase, accumulate the length used for the    /* If we are in the pre-compile phase, accumulate the length used for the
3374    previous cycle of this loop. */    previous cycle of this loop. */
3375    
3376    if (lengthptr != NULL)    if (lengthptr != NULL)
3377      {      {
3378  #ifdef DEBUG  #ifdef PCRE_DEBUG
3379      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3380  #endif  #endif
3381      if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3382            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3383        {        {
3384        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3385        goto FAILED;        goto FAILED;
# Line 2436  for (;; ptr++) Line 3401  for (;; ptr++)
3401        goto FAILED;        goto FAILED;
3402        }        }
3403    
3404      *lengthptr += code - last_code;      *lengthptr += (int)(code - last_code);
3405      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3406          c));
3407    
3408      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3409      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 2463  for (;; ptr++) Line 3429  for (;; ptr++)
3429    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3430    reference list. */    reference list. */
3431    
3432    else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3433               WORK_SIZE_SAFETY_MARGIN)
3434      {      {
3435      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3436      goto FAILED;      goto FAILED;
# Line 2473  for (;; ptr++) Line 3440  for (;; ptr++)
3440    
3441    if (inescq && c != 0)    if (inescq && c != 0)
3442      {      {
3443      if (c == '\\' && ptr[1] == 'E')      if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3444        {        {
3445        inescq = FALSE;        inescq = FALSE;
3446        ptr++;        ptr++;
# Line 2499  for (;; ptr++) Line 3466  for (;; ptr++)
3466    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
3467    a quantifier. */    a quantifier. */
3468    
3469    is_quantifier = c == '*' || c == '+' || c == '?' ||    is_quantifier =
3470      (c == '{' && is_counted_repeat(ptr+1));      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3471        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3472    
3473    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
3474         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
# Line 2510  for (;; ptr++) Line 3478  for (;; ptr++)
3478      previous_callout = NULL;      previous_callout = NULL;
3479      }      }
3480    
3481    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3482    
3483    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3484      {      {
3485      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
3486      if (c == '#')      if (c == CHAR_NUMBER_SIGN)
3487        {        {
3488        while (*(++ptr) != 0)        ptr++;
3489          while (*ptr != 0)
3490          {          {
3491          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3492            ptr++;
3493    #ifdef SUPPORT_UTF8
3494            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3495    #endif
3496          }          }
3497        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3498    
# Line 2540  for (;; ptr++) Line 3513  for (;; ptr++)
3513      {      {
3514      /* ===================================================================*/      /* ===================================================================*/
3515      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3516      case '|':                      /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3517      case ')':      case CHAR_RIGHT_PARENTHESIS:
3518      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
3519      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
3520      *codeptr = code;      *codeptr = code;
# Line 2553  for (;; ptr++) Line 3526  for (;; ptr++)
3526          *errorcodeptr = ERR20;          *errorcodeptr = ERR20;
3527          goto FAILED;          goto FAILED;
3528          }          }
3529        *lengthptr += code - last_code;   /* To include callout length */        *lengthptr += (int)(code - last_code);   /* To include callout length */
3530        DPRINTF((">> end branch\n"));        DPRINTF((">> end branch\n"));
3531        }        }
3532      return TRUE;      return TRUE;
# Line 2563  for (;; ptr++) Line 3536  for (;; ptr++)
3536      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
3537      the setting of any following char as a first character. */      the setting of any following char as a first character. */
3538    
3539      case '^':      case CHAR_CIRCUMFLEX_ACCENT:
3540        previous = NULL;
3541      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3542        {        {
3543        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3544          *code++ = OP_CIRCM;
3545        }        }
3546      previous = NULL;      else *code++ = OP_CIRC;
     *code++ = OP_CIRC;  
3547      break;      break;
3548    
3549      case '$':      case CHAR_DOLLAR_SIGN:
3550      previous = NULL;      previous = NULL;
3551      *code++ = OP_DOLL;      *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3552      break;      break;
3553    
3554      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3555      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqbyte doesn't change either. */
3556    
3557      case '.':      case CHAR_DOT:
3558      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3559      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
3560      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3561      previous = code;      previous = code;
3562      *code++ = OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3563      break;      break;
3564    
3565    
# Line 2600  for (;; ptr++) Line 3574  for (;; ptr++)
3574      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
3575      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
3576      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
     */  
3577    
3578      case '[':      In JavaScript compatibility mode, an isolated ']' causes an error. In
3579        default (Perl) mode, it is treated as a data character. */
3580    
3581        case CHAR_RIGHT_SQUARE_BRACKET:
3582        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3583          {
3584          *errorcodeptr = ERR64;
3585          goto FAILED;
3586          }
3587        goto NORMAL_CHAR;
3588    
3589        case CHAR_LEFT_SQUARE_BRACKET:
3590      previous = code;      previous = code;
3591    
3592      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3593      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
3594    
3595      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3596          check_posix_syntax(ptr, &tempptr, cd))           ptr[1] == CHAR_EQUALS_SIGN) &&
3597            check_posix_syntax(ptr, &tempptr))
3598        {        {
3599        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3600        goto FAILED;        goto FAILED;
3601        }        }
3602    
# Line 2623  for (;; ptr++) Line 3608  for (;; ptr++)
3608      for (;;)      for (;;)
3609        {        {
3610        c = *(++ptr);        c = *(++ptr);
3611        if (c == '\\')        if (c == CHAR_BACKSLASH)
3612          {          {
3613          if (ptr[1] == 'E') ptr++;          if (ptr[1] == CHAR_E)
3614            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;            ptr++;
3615              else break;          else if (strncmp((const char *)ptr+1,
3616                              STR_Q STR_BACKSLASH STR_E, 3) == 0)
3617              ptr += 3;
3618            else
3619              break;
3620          }          }
3621        else if (!negate_class && c == '^')        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3622          negate_class = TRUE;          negate_class = TRUE;
3623        else break;        else break;
3624        }        }
3625    
3626      /* If a class contains a negative special such as \S, we need to flip the      /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3627      negation flag at the end, so that support for characters > 255 works      an initial ']' is taken as a data character -- the code below handles
3628        that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3629        [^] must match any character, so generate OP_ALLANY. */
3630    
3631        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3632            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3633          {
3634          *code++ = negate_class? OP_ALLANY : OP_FAIL;
3635          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3636          zerofirstbyte = firstbyte;
3637          break;
3638          }
3639    
3640        /* If a class contains a negative special such as \S, we need to flip the
3641        negation flag at the end, so that support for characters > 255 works
3642      correctly (they are all included in the class). */      correctly (they are all included in the class). */
3643    
3644      should_flip_negation = FALSE;      should_flip_negation = FALSE;
# Line 2657  for (;; ptr++) Line 3660  for (;; ptr++)
3660  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3661      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
3662      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
3663        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
3664  #endif  #endif
3665    
3666      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 2672  for (;; ptr++) Line 3676  for (;; ptr++)
3676          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3677          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3678          }          }
3679    
3680          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3681          data and reset the pointer. This is so that very large classes that
3682          contain a zillion UTF-8 characters no longer overwrite the work space
3683          (which is on the stack). */
3684    
3685          if (lengthptr != NULL)
3686            {
3687            *lengthptr += (int)(class_utf8data - class_utf8data_base);
3688            class_utf8data = class_utf8data_base;
3689            }
3690    
3691  #endif  #endif
3692    
3693        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
3694    
3695        if (inescq)        if (inescq)
3696          {          {
3697          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */          if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3698            {            {
3699            inescq = FALSE;                   /* Reset literal state */            inescq = FALSE;                   /* Reset literal state */
3700            ptr++;                            /* Skip the 'E' */            ptr++;                            /* Skip the 'E' */
# Line 2693  for (;; ptr++) Line 3709  for (;; ptr++)
3709        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3710        5.6 and 5.8 do. */        5.6 and 5.8 do. */
3711    
3712        if (c == '[' &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
3713            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3714            check_posix_syntax(ptr, &tempptr, cd))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3715          {          {
3716          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3717          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3718          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
3719          uschar pbits[32];          uschar pbits[32];
3720    
3721          if (ptr[1] != ':')          if (ptr[1] != CHAR_COLON)
3722            {            {
3723            *errorcodeptr = ERR31;            *errorcodeptr = ERR31;
3724            goto FAILED;            goto FAILED;
3725            }            }
3726    
3727          ptr += 2;          ptr += 2;
3728          if (*ptr == '^')          if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3729            {            {
3730            local_negate = TRUE;            local_negate = TRUE;
3731            should_flip_negation = TRUE;  /* Note negative special */            should_flip_negation = TRUE;  /* Note negative special */
3732            ptr++;            ptr++;
3733            }            }
3734    
3735          posix_class = check_posix_name(ptr, tempptr - ptr);          posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3736          if (posix_class < 0)          if (posix_class < 0)
3737            {            {
3738            *errorcodeptr = ERR30;            *errorcodeptr = ERR30;
# Line 2730  for (;; ptr++) Line 3746  for (;; ptr++)
3746          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3747            posix_class = 0;            posix_class = 0;
3748    
3749          /* We build the bit map for the POSIX class in a chunk of local store          /* When PCRE_UCP is set, some of the POSIX classes are converted to
3750          because we may be adding and subtracting from it, and we don't want to          different escape sequences that use Unicode properties. */
3751          subtract bits that may be in the main map already. At the end we or the  
3752          result into the bit map that is being built. */  #ifdef SUPPORT_UCP
3753            if ((options & PCRE_UCP) != 0)
3754              {
3755              int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3756              if (posix_substitutes[pc] != NULL)
3757                {
3758                nestptr = tempptr + 1;
3759                ptr = posix_substitutes[pc] - 1;
3760                continue;
3761                }
3762              }
3763    #endif
3764            /* In the non-UCP case, we build the bit map for the POSIX class in a
3765            chunk of local store because we may be adding and subtracting from it,
3766            and we don't want to subtract bits that may be in the main map already.
3767            At the end we or the result into the bit map that is being built. */
3768    
3769          posix_class *= 3;          posix_class *= 3;
3770    
# Line 2777  for (;; ptr++) Line 3808  for (;; ptr++)
3808    
3809        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3810        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3811        case. Inside a class (and only there) it is treated as backspace.        case. Inside a class (and only there) it is treated as backspace. We
3812        Elsewhere it marks a word boundary. Other escapes have preset maps ready        assume that other escapes have more than one character in them, so set
3813        to 'or' into the one we are building. We assume they have more than one        class_charcount bigger than one. Unrecognized escapes fall through and
3814        character in them, so set class_charcount bigger than one. */        are either treated as literal characters (by default), or are faulted if
3815          PCRE_EXTRA is set. */
3816    
3817        if (c == '\\')        if (c == CHAR_BACKSLASH)
3818          {          {
3819          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3820          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3821    
3822          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3823          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_N)            /* \N is not supported in a class */
3824          else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */            {
3825              *errorcodeptr = ERR71;
3826              goto FAILED;
3827              }
3828          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3829            {            {
3830            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3831              {              {
3832              ptr += 2; /* avoid empty string */              ptr += 2; /* avoid empty string */
3833              }              }
# Line 2806  for (;; ptr++) Line 3841  for (;; ptr++)
3841            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
3842            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
3843    
3844            /* Save time by not doing this in the pre-compile phase. */            switch (-c)
   
           if (lengthptr == NULL) switch (-c)  
3845              {              {
3846    #ifdef SUPPORT_UCP
3847                case ESC_du:     /* These are the values given for \d etc */
3848                case ESC_DU:     /* when PCRE_UCP is set. We replace the */
3849                case ESC_wu:     /* escape sequence with an appropriate \p */
3850                case ESC_WU:     /* or \P to test Unicode properties instead */
3851                case ESC_su:     /* of the default ASCII testing. */
3852                case ESC_SU:
3853                nestptr = ptr;
3854                ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3855                class_charcount -= 2;                /* Undo! */
3856                continue;
3857    #endif
3858              case ESC_d:              case ESC_d:
3859              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3860              continue;              continue;
3861    
3862              case ESC_D:              case ESC_D:
3863              should_flip_negation = TRUE;              should_flip_negation = TRUE;
3864              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3865              continue;              continue;
3866    
# Line 2824  for (;; ptr++) Line 3869  for (;; ptr++)
3869              continue;              continue;
3870    
3871              case ESC_W:              case ESC_W:
3872              should_flip_negation = TRUE;              should_flip_negation = TRUE;
3873              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3874              continue;              continue;
3875    
3876                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3877                if it was previously set by something earlier in the character
3878                class. */
3879    
3880              case ESC_s:              case ESC_s:
3881              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
3882              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
3883                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3884              continue;              continue;
3885    
3886              case ESC_S:              case ESC_S:
3887              should_flip_negation = TRUE;              should_flip_negation = TRUE;
3888              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3889              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3890              continue;              continue;
3891    
3892              case ESC_E: /* Perl ignores an orphan \E */              case ESC_h:
             continue;  
   
             default:    /* Not recognized; fall through */  
             break;      /* Need "default" setting to stop compiler warning. */  
             }  
   
           /* In the pre-compile phase, just do the recognition. */  
   
           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||  
                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;  
   
           /* We need to deal with \H, \h, \V, and \v in both phases because  
           they use extra memory. */  
   
           if (-c == ESC_h)  
             {  
3893              SETBIT(classbits, 0x09); /* HT */              SETBIT(classbits, 0x09); /* HT */
3894              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
3895              SETBIT(classbits, 0xa0); /* NBSP */              SETBIT(classbits, 0xa0); /* NBSP */
# Line 2879  for (;; ptr++) Line 3913  for (;; ptr++)
3913                }                }
3914  #endif  #endif
3915              continue;              continue;
             }  
3916    
3917            if (-c == ESC_H)              case ESC_H:
             {  
3918              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
3919                {                {
3920                int x = 0xff;                int x = 0xff;
# Line 2924  for (;; ptr++) Line 3956  for (;; ptr++)
3956                }                }
3957  #endif  #endif
3958              continue;              continue;
             }  
3959    
3960            if (-c == ESC_v)              case ESC_v:
             {  
3961              SETBIT(classbits, 0x0a); /* LF */              SETBIT(classbits, 0x0a); /* LF */
3962              SETBIT(classbits, 0x0b); /* VT */              SETBIT(classbits, 0x0b); /* VT */
3963              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, 0x0c); /* FF */
# Line 2943  for (;; ptr++) Line 3973  for (;; ptr++)
3973                }                }
3974  #endif  #endif
3975              continue;              continue;
             }  
3976    
3977            if (-c == ESC_V)              case ESC_V:
             {  
3978              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
3979                {                {
3980                int x = 0xff;                int x = 0xff;
# Line 2976  for (;; ptr++) Line 4004  for (;; ptr++)
4004                }                }
4005  #endif  #endif
4006              continue;              continue;
             }  
   
           /* We need to deal with \P and \p in both phases. */  
4007    
4008  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4009            if (-c == ESC_p || -c == ESC_P)              case ESC_p:
4010              {              case ESC_P:
4011              BOOL negated;                {
4012              int pdata;                BOOL negated;
4013              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int pdata;
4014              if (ptype < 0) goto FAILED;                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4015              class_utf8 = TRUE;                if (ptype < 0) goto FAILED;
4016              *class_utf8data++ = ((-c == ESC_p) != negated)?                class_utf8 = TRUE;
4017                XCL_PROP : XCL_NOTPROP;                *class_utf8data++ = ((-c == ESC_p) != negated)?
4018              *class_utf8data++ = ptype;                  XCL_PROP : XCL_NOTPROP;
4019              *class_utf8data++ = pdata;                *class_utf8data++ = ptype;
4020              class_charcount -= 2;   /* Not a < 256 character */                *class_utf8data++ = pdata;
4021              continue;                class_charcount -= 2;   /* Not a < 256 character */
4022              }                continue;
4023                  }
4024  #endif  #endif
4025            /* Unrecognized escapes are faulted if PCRE is running in its              /* Unrecognized escapes are faulted if PCRE is running in its
4026            strict mode. By default, for compatibility with Perl, they are              strict mode. By default, for compatibility with Perl, they are
4027            treated as literals. */              treated as literals. */
4028    
4029            if ((options & PCRE_EXTRA) != 0)              default:
4030              {              if ((options & PCRE_EXTRA) != 0)
4031              *errorcodeptr = ERR7;                {
4032              goto FAILED;                *errorcodeptr = ERR7;
4033                  goto FAILED;
4034                  }
4035                class_charcount -= 2;  /* Undo the default count from above */
4036                c = *ptr;              /* Get the final character and fall through */
4037                break;
4038              }              }
   
           class_charcount -= 2;  /* Undo the default count from above */  
           c = *ptr;              /* Get the final character and fall through */  
4039            }            }
4040    
4041          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
# Line 3021  for (;; ptr++) Line 4049  for (;; ptr++)
4049        entirely. The code for handling \Q and \E is messy. */        entirely. The code for handling \Q and \E is messy. */
4050    
4051        CHECK_RANGE:        CHECK_RANGE:
4052        while (ptr[1] == '\\' && ptr[2] == 'E')        while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4053          {          {
4054          inescq = FALSE;          inescq = FALSE;
4055          ptr += 2;          ptr += 2;
# Line 3031  for (;; ptr++) Line 4059  for (;; ptr++)
4059    
4060        /* Remember \r or \n */        /* Remember \r or \n */
4061    
4062        if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;        if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4063    
4064        /* Check for range */        /* Check for range */
4065    
4066        if (!inescq && ptr[1] == '-')        if (!inescq && ptr[1] == CHAR_MINUS)
4067          {          {
4068          int d;          int d;
4069          ptr += 2;          ptr += 2;
4070          while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4071    
4072          /* If we hit \Q (not followed by \E) at this point, go into escaped          /* If we hit \Q (not followed by \E) at this point, go into escaped
4073          mode. */          mode. */
4074    
4075          while (*ptr == '\\' && ptr[1] == 'Q')          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4076            {            {
4077            ptr += 2;            ptr += 2;
4078            if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }            if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4079                { ptr += 2; continue; }
4080            inescq = TRUE;            inescq = TRUE;
4081            break;            break;
4082            }            }
4083    
4084          if (*ptr == 0 || (!inescq && *ptr == ']'))          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4085            {            {
4086            ptr = oldptr;            ptr = oldptr;
4087            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
# Line 3071  for (;; ptr++) Line 4100  for (;; ptr++)
4100          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4101          in such circumstances. */          in such circumstances. */
4102    
4103          if (!inescq && d == '\\')          if (!inescq && d == CHAR_BACKSLASH)
4104            {            {
4105            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4106            if (*errorcodeptr != 0) goto FAILED;            if (*errorcodeptr != 0) goto FAILED;
4107    
4108            /* \b is backslash; \X is literal X; \R is literal R; any other            /* \b is backspace; any other special means the '-' was literal */
           special means the '-' was literal */  
4109    
4110            if (d < 0)            if (d < 0)
4111              {              {
4112              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = CHAR_BS; else
             else if (d == -ESC_X) d = 'X';  
             else if (d == -ESC_R) d = 'R'; else  
4113                {                {
4114                ptr = oldptr;                ptr = oldptr;
4115                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
# Line 3104  for (;; ptr++) Line 4130  for (;; ptr++)
4130    
4131          /* Remember \r or \n */          /* Remember \r or \n */
4132    
4133          if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4134    
4135          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4136          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
# Line 3224  for (;; ptr++) Line 4250  for (;; ptr++)
4250          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4251            {            {
4252            unsigned int othercase;            unsigned int othercase;
4253            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)            if ((othercase = UCD_OTHERCASE(c)) != c)
4254              {              {
4255              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
4256              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 3249  for (;; ptr++) Line 4275  for (;; ptr++)
4275          }          }
4276        }        }
4277    
4278      /* Loop until ']' reached. This "while" is the end of the "do" above. */      /* Loop until ']' reached. This "while" is the end of the "do" far above.
4279        If we are at the end of an internal nested string, revert to the outer
4280        string. */
4281    
4282        while (((c = *(++ptr)) != 0 ||
4283               (nestptr != NULL &&
4284                 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
4285               (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4286    
4287      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));      /* Check for missing terminating ']' */
4288    
4289      if (c == 0)                          /* Missing terminating ']' */      if (c == 0)
4290        {        {
4291        *errorcodeptr = ERR6;        *errorcodeptr = ERR6;
4292        goto FAILED;        goto FAILED;
4293        }        }
4294    
   
 /* This code has been disabled because it would mean that \s counts as  
 an explicit \r or \n reference, and that's not really what is wanted. Now  
 we set the flag only if there is a literal "\r" or "\n" in the class. */  
   
 #if 0  
     /* Remember whether \r or \n are in this class */  
   
     if (negate_class)  
       {  
       if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;  
       }  
     else  
       {  
       if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;  
       }  
 #endif  
   
   
4295      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
4296      less than 256. As long as there were no characters >= 128 and there was no      less than 256. As long as there were no characters >= 128 and there was no
4297      use of \p or \P, in other words, no use of any XCLASS features, we can      use of \p or \P, in other words, no use of any XCLASS features, we can
# Line 3285  we set the flag only if there is a liter Line 4299  we set the flag only if there is a liter
4299    
4300      In UTF-8 mode, we can optimize the negative case only if there were no      In UTF-8 mode, we can optimize the negative case only if there were no
4301      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4302      operate on single-bytes only. This is an historical hangover. Maybe one day      operate on single-byte characters only. This is an historical hangover.
4303      we can tidy these opcodes to handle multi-byte characters.      Maybe one day we can tidy these opcodes to handle multi-byte characters.
4304    
4305      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
4306      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4307      that OP_NOT does not support multibyte characters. In the positive case, it      Note that OP_NOT[I] does not support multibyte characters. In the positive
4308      can cause firstbyte to be set. Otherwise, there can be no first char if      case, it can cause firstbyte to be set. Otherwise, there can be no first
4309      this item is first, whatever repeat count may follow. In the case of      char if this item is first, whatever repeat count may follow. In the case
4310      reqbyte, save the previous value for reinstating. */      of reqbyte, save the previous value for reinstating. */
4311    
4312  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4313      if (class_charcount == 1 && !class_utf8 &&      if (class_charcount == 1 && !class_utf8 &&
# Line 3304  we set the flag only if there is a liter Line 4318  we set the flag only if there is a liter
4318        {        {
4319        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4320    
4321        /* The OP_NOT opcode works on one-byte characters only. */        /* The OP_NOT[I] opcodes work on one-byte characters only. */
4322    
4323        if (negate_class)        if (negate_class)
4324          {          {
4325          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4326          zerofirstbyte = firstbyte;          zerofirstbyte = firstbyte;
4327          *code++ = OP_NOT;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4328          *code++ = class_lastchar;          *code++ = class_lastchar;
4329          break;          break;
4330          }          }
# Line 3340  we set the flag only if there is a liter Line 4354  we set the flag only if there is a liter
4354      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4355    
4356      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4357      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
4358      such as \S in the class, because in that case all characters > 255 are in      such as \S in the class, and PCRE_UCP is not set, because in that case all
4359      the class, so any that were explicitly given as well can be ignored. If      characters > 255 are in the class, so any that were explicitly given as
4360      (when there are explicit characters > 255 that must be listed) there are no      well can be ignored. If (when there are explicit characters > 255 that must
4361      characters < 256, we can omit the bitmap in the actual compiled code. */      be listed) there are no characters < 256, we can omit the bitmap in the
4362        actual compiled code. */
4363    
4364  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4365      if (class_utf8 && !should_flip_negation)      if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
4366        {        {
4367        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
4368        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
# Line 3368  we set the flag only if there is a liter Line 4383  we set the flag only if there is a liter
4383    
4384        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4385    
4386        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4387        break;   /* End of class handling */        break;   /* End of class handling */
4388        }        }
4389  #endif  #endif
4390    
4391      /* If there are no characters > 255, set the opcode to OP_CLASS or      /* If there are no characters > 255, or they are all to be included or
4392      OP_NCLASS, depending on whether the whole class was negated and whether      excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4393      there were negative specials such as \S in the class. Then copy the 32-byte      whole class was negated and whether there were negative specials such as \S
4394      map into the code vector, negating it if necessary. */      (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4395        negating it if necessary. */
4396    
4397      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4398      if (negate_class)      if (negate_class)
4399        {        {
# Line 3396  we set the flag only if there is a liter Line 4412  we set the flag only if there is a liter
4412      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4413      has been tested above. */      has been tested above. */
4414    
4415      case '{':      case CHAR_LEFT_CURLY_BRACKET:
4416      if (!is_quantifier) goto NORMAL_CHAR;      if (!is_quantifier) goto NORMAL_CHAR;
4417      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4418      if (*errorcodeptr != 0) goto FAILED;      if (*errorcodeptr != 0) goto FAILED;
4419      goto REPEAT;      goto REPEAT;
4420    
4421      case '*':      case CHAR_ASTERISK:
4422      repeat_min = 0;      repeat_min = 0;
4423      repeat_max = -1;      repeat_max = -1;
4424      goto REPEAT;      goto REPEAT;
4425    
4426      case '+':      case CHAR_PLUS:
4427      repeat_min = 1;      repeat_min = 1;
4428      repeat_max = -1;      repeat_max = -1;
4429      goto REPEAT;      goto REPEAT;
4430    
4431      case '?':      case CHAR_QUESTION_MARK:
4432      repeat_min = 0;      repeat_min = 0;
4433      repeat_max = 1;      repeat_max = 1;
4434    
# Line 3436  we set the flag only if there is a liter Line 4452  we set the flag only if there is a liter
4452      op_type = 0;                    /* Default single-char op codes */      op_type = 0;                    /* Default single-char op codes */
4453      possessive_quantifier = FALSE;  /* Default not possessive quantifier */      possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4454    
4455      /* Save start of previous item, in case we have to move it up to make space      /* Save start of previous item, in case we have to move it up in order to
4456      for an inserted OP_ONCE for the additional '+' extension. */      insert something before it. */
4457    
4458      tempcode = previous;      tempcode = previous;
4459    
# Line 3447  we set the flag only if there is a liter Line 4463  we set the flag only if there is a liter
4463      but if PCRE_UNGREEDY is set, it works the other way round. We change the      but if PCRE_UNGREEDY is set, it works the other way round. We change the
4464      repeat type to the non-default. */      repeat type to the non-default. */
4465    
4466      if (ptr[1] == '+')      if (ptr[1] == CHAR_PLUS)
4467        {        {
4468        repeat_type = 0;                  /* Force greedy */        repeat_type = 0;                  /* Force greedy */
4469        possessive_quantifier = TRUE;        possessive_quantifier = TRUE;
4470        ptr++;        ptr++;
4471        }        }
4472      else if (ptr[1] == '?')      else if (ptr[1] == CHAR_QUESTION_MARK)
4473        {        {
4474        repeat_type = greedy_non_default;        repeat_type = greedy_non_default;
4475        ptr++;        ptr++;
4476        }        }
4477      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4478    
4479        /* If previous was a recursion call, wrap it in atomic brackets so that
4480        previous becomes the atomic group. All recursions were so wrapped in the
4481        past, but it no longer happens for non-repeated recursions. In fact, the
4482        repeated ones could be re-implemented independently so as not to need this,
4483        but for the moment we rely on the code for repeating groups. */
4484    
4485        if (*previous == OP_RECURSE)
4486          {
4487          memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
4488          *previous = OP_ONCE;
4489          PUT(previous, 1, 2 + 2*LINK_SIZE);
4490          previous[2 + 2*LINK_SIZE] = OP_KET;
4491          PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4492          code += 2 + 2 * LINK_SIZE;
4493          length_prevgroup = 3 + 3*LINK_SIZE;
4494    
4495          /* When actually compiling, we need to check whether this was a forward
4496          reference, and if so, adjust the offset. */
4497    
4498          if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4499            {
4500            int offset = GET(cd->hwm, -LINK_SIZE);
4501            if (offset == previous + 1 - cd->start_code)
4502              PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4503            }
4504          }
4505    
4506        /* Now handle repetition for the different types of item. */
4507    
4508      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4509      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
4510      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
4511      the first thing in a branch because the x will have gone into firstbyte      the first thing in a branch because the x will have gone into firstbyte
4512      instead.  */      instead.  */
4513    
4514      if (*previous == OP_CHAR || *previous == OP_CHARNC)      if (*previous == OP_CHAR || *previous == OP_CHARI)
4515        {        {
4516          op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4517    
4518        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF-8 characters that take up more than one byte. It's
4519        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4520        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus 0x80 to flag that it's a
# Line 3478  we set the flag only if there is a liter Line 4525  we set the flag only if there is a liter
4525          {          {
4526          uschar *lastchar = code - 1;          uschar *lastchar = code - 1;
4527          while((*lastchar & 0xc0) == 0x80) lastchar--;          while((*lastchar & 0xc0) == 0x80) lastchar--;
4528          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4529          memcpy(utf8_char, lastchar, c); /* Save the char */          memcpy(utf8_char, lastchar, c); /* Save the char */
4530          c |= 0x80;                      /* Flag c as a length */          c |= 0x80;                      /* Flag c as a length */
4531          }          }
# Line 3500  we set the flag only if there is a liter Line 4547  we set the flag only if there is a liter
4547    
4548        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4549            repeat_max < 0 &&            repeat_max < 0 &&
4550            check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,            check_auto_possessive(previous, utf8, ptr + 1, options, cd))
             options, cd))  
4551          {          {
4552          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4553          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 3513  we set the flag only if there is a liter Line 4559  we set the flag only if there is a liter
4559      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
4560      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
4561      character repeats by setting op_type to add a suitable offset into      character repeats by setting op_type to add a suitable offset into
4562      repeat_type. We can also test for auto-possessification. OP_NOT is      repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
4563      currently used only for single-byte chars. */      are currently used only for single-byte chars. */
4564    
4565      else if (*previous == OP_NOT)      else if (*previous == OP_NOT || *previous == OP_NOTI)
4566        {        {
4567        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
4568        c = previous[1];        c = previous[1];
4569        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4570            repeat_max < 0 &&            repeat_max < 0 &&
4571            check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))            check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4572          {          {
4573          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4574          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 3546  we set the flag only if there is a liter Line 4592  we set the flag only if there is a liter
4592    
4593        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4594            repeat_max < 0 &&            repeat_max < 0 &&
4595            check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))            check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4596          {          {
4597          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4598          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 3568  we set the flag only if there is a liter Line 4614  we set the flag only if there is a liter
4614    
4615        if (repeat_max == 0) goto END_REPEAT;        if (repeat_max == 0) goto END_REPEAT;
4616    
4617          /*--------------------------------------------------------------------*/
4618          /* This code is obsolete from release 8.00; the restriction was finally
4619          removed: */
4620    
4621        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
4622        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
4623    
4624        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4625          /*--------------------------------------------------------------------*/
4626    
4627        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
4628    
# Line 3710  we set the flag only if there is a liter Line 4761  we set the flag only if there is a liter
4761  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4762               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
4763  #endif  #endif
4764               *previous == OP_REF)               *previous == OP_REF ||
4765                 *previous == OP_REFI)
4766        {        {
4767        if (repeat_max == 0)        if (repeat_max == 0)
4768          {          {
# Line 3718  we set the flag only if there is a liter Line 4770  we set the flag only if there is a liter
4770          goto END_REPEAT;          goto END_REPEAT;
4771          }          }
4772    
4773          /*--------------------------------------------------------------------*/
4774          /* This code is obsolete from release 8.00; the restriction was finally
4775          removed: */
4776    
4777        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
4778        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
4779    
4780        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4781          /*--------------------------------------------------------------------*/
4782    
4783        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
4784          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 3739  we set the flag only if there is a liter Line 4796  we set the flag only if there is a liter
4796        }        }
4797    
4798      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
4799      cases. */      cases. Note that at this point we can encounter only the "basic" bracket
4800        opcodes such as BRA and CBRA, as this is the place where they get converted
4801        into the more special varieties such as BRAPOS and SBRA. A test for >=
4802        OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
4803        ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
4804        repetition of assertions, but now it does, for Perl compatibility. */
4805    
4806      else if (*previous == OP_BRA  || *previous == OP_CBRA ||      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
              *previous == OP_ONCE || *previous == OP_COND)  
4807        {        {
4808        register int i;        register int i;
4809        int ketoffset = 0;        int len = (int)(code - previous);
       int len = code - previous;  
4810        uschar *bralink = NULL;        uschar *bralink = NULL;
4811          uschar *brazeroptr = NULL;
4812    
4813        /* Repeating a DEFINE group is pointless */        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
4814          we just ignore the repeat. */
4815    
4816        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4817          {          goto END_REPEAT;
         *errorcodeptr = ERR55;  
         goto FAILED;  
         }  
4818    
4819        /* If the maximum repeat count is unlimited, find the end of the bracket        /* There is no sense in actually repeating assertions. The only potential
4820        by scanning through from the start, and compute the offset back to it        use of repetition is in cases when the assertion is optional. Therefore,
4821        from the current code pointer. There may be an OP_OPT setting following        if the minimum is greater than zero, just ignore the repeat. If the
4822        the final KET, so we can't find the end just by going back from the code        maximum is not zero or one, set it to 1. */
4823        pointer. */  
4824          if (*previous < OP_ONCE)    /* Assertion */
4825        if (repeat_max == -1)          {
4826          {          if (repeat_min > 0) goto END_REPEAT;
4827          register uschar *ket = previous;          if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
         do ket += GET(ket, 1); while (*ket != OP_KET);  
         ketoffset = code - ket;  
4828          }          }
4829    
4830        /* The case of a zero minimum is special because of the need to stick        /* The case of a zero minimum is special because of the need to stick
# Line 3779  we set the flag only if there is a liter Line 4836  we set the flag only if there is a liter
4836    
4837        if (repeat_min == 0)        if (repeat_min == 0)
4838          {          {
4839          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
4840          altogether. */          output altogether, like this:
4841    
4842          if (repeat_max == 0)          ** if (repeat_max == 0)
4843            {          **   {
4844            code = previous;          **   code = previous;
4845            goto END_REPEAT;          **   goto END_REPEAT;
4846            }          **   }
4847    
4848            However, that fails when a group or a subgroup within it is referenced
4849            as a subroutine from elsewhere in the pattern, so now we stick in
4850            OP_SKIPZERO in front of it so that it is skipped on execution. As we
4851            don't have a list of which groups are referenced, we cannot do this
4852            selectively.
4853    
4854            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4855            and do no more at this point. However, we do need to adjust any
4856            OP_RECURSE calls inside the group that refer to the group itself or any
4857            internal or forward referenced group, because the offset is from the
4858            start of the whole regex. Temporarily terminate the pattern while doing
4859            this. */
4860    
4861          /* If the maximum is 1 or unlimited, we just have to stick in the          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
         BRAZERO and do no more at this point. However, we do need to adjust  
         any OP_RECURSE calls inside the group that refer to the group itself or  
         any internal or forward referenced group, because the offset is from  
         the start of the whole regex. Temporarily terminate the pattern while  
         doing this. */  
   
         if (repeat_max <= 1)  
4862            {            {
4863            *code = OP_END;            *code = OP_END;
4864            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
4865            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
4866            code++;            code++;
4867              if (repeat_max == 0)
4868                {
4869                *previous++ = OP_SKIPZERO;
4870                goto END_REPEAT;
4871                }
4872              brazeroptr = previous;    /* Save for possessive optimizing */
4873            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
4874            }            }
4875    
# Line 3825  we set the flag only if there is a liter Line 4894  we set the flag only if there is a liter
4894            /* We chain together the bracket offset fields that have to be            /* We chain together the bracket offset fields that have to be