/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 835 by ph10, Wed Dec 28 16:10:09 2011 UTC revision 836 by ph10, Wed Dec 28 17:16:11 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2011 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
# Line 88  so this number is very generous. Line 92  so this number is very generous.
92  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
93  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
94  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
96    filled up by repetitions of forward references, for example patterns like
97    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98    that the workspace is expanded using malloc() in this situation. The value
99    below is therefore a minimum, and we put a maximum on it for safety. The
100    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101    kicks in at the same number of forward references in all cases. */
102    
103  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
104    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
105    
106  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
107  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
108    
109  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
110    
111    /* Private flags added to firstchar and reqchar. */
112    
113    #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
114    #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
115    
116    /* Repeated character flags. */
117    
118    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
119    
120  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
121  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 231  static const char posix_names[] = Line 250  static const char posix_names[] =
250    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
251    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
252    
253  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
254    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
255    
256  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 266  substitutes must be in the order of the Line 285  substitutes must be in the order of the
285  both positive and negative cases. NULL means no substitute. */  both positive and negative cases. NULL means no substitute. */
286    
287  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
288  static const uschar *substitutes[] = {  static const pcre_uchar string_PNd[]  = {
289    (uschar *)"\\P{Nd}",    /* \D */    CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
290    (uschar *)"\\p{Nd}",    /* \d */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
291    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */  static const pcre_uchar string_pNd[]  = {
292    (uschar *)"\\p{Xsp}",   /* \s */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
293    (uschar *)"\\P{Xwd}",   /* \W */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294    (uschar *)"\\p{Xwd}"    /* \w */  static const pcre_uchar string_PXsp[] = {
295      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
296      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297    static const pcre_uchar string_pXsp[] = {
298      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300    static const pcre_uchar string_PXwd[] = {
301      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
302      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303    static const pcre_uchar string_pXwd[] = {
304      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    
307    static const pcre_uchar *substitutes[] = {
308      string_PNd,           /* \D */
309      string_pNd,           /* \d */
310      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
311      string_pXsp,          /* \s */
312      string_PXwd,          /* \W */
313      string_pXwd           /* \w */
314  };  };
315    
316  static const uschar *posix_substitutes[] = {  static const pcre_uchar string_pL[] =   {
317    (uschar *)"\\p{L}",     /* alpha */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
318    (uschar *)"\\p{Ll}",    /* lower */    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319    (uschar *)"\\p{Lu}",    /* upper */  static const pcre_uchar string_pLl[] =  {
320    (uschar *)"\\p{Xan}",   /* alnum */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321    NULL,                   /* ascii */    CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322    (uschar *)"\\h",        /* blank */  static const pcre_uchar string_pLu[] =  {
323    NULL,                   /* cntrl */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324    (uschar *)"\\p{Nd}",    /* digit */    CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325    NULL,                   /* graph */  static const pcre_uchar string_pXan[] = {
326    NULL,                   /* print */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327    NULL,                   /* punct */    CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */  static const pcre_uchar string_h[] =    {
329    (uschar *)"\\p{Xwd}",   /* word */    CHAR_BACKSLASH, CHAR_h, '\0' };
330    NULL,                   /* xdigit */  static const pcre_uchar string_pXps[] = {
331      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333    static const pcre_uchar string_PL[] =   {
334      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
335      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336    static const pcre_uchar string_PLl[] =  {
337      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339    static const pcre_uchar string_PLu[] =  {
340      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342    static const pcre_uchar string_PXan[] = {
343      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345    static const pcre_uchar string_H[] =    {
346      CHAR_BACKSLASH, CHAR_H, '\0' };
347    static const pcre_uchar string_PXps[] = {
348      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
349      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350    
351    static const pcre_uchar *posix_substitutes[] = {
352      string_pL,            /* alpha */
353      string_pLl,           /* lower */
354      string_pLu,           /* upper */
355      string_pXan,          /* alnum */
356      NULL,                 /* ascii */
357      string_h,             /* blank */
358      NULL,                 /* cntrl */
359      string_pNd,           /* digit */
360      NULL,                 /* graph */
361      NULL,                 /* print */
362      NULL,                 /* punct */
363      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
364      string_pXwd,          /* word */
365      NULL,                 /* xdigit */
366    /* Negated cases */    /* Negated cases */
367    (uschar *)"\\P{L}",     /* ^alpha */    string_PL,            /* ^alpha */
368    (uschar *)"\\P{Ll}",    /* ^lower */    string_PLl,           /* ^lower */
369    (uschar *)"\\P{Lu}",    /* ^upper */    string_PLu,           /* ^upper */
370    (uschar *)"\\P{Xan}",   /* ^alnum */    string_PXan,          /* ^alnum */
371    NULL,                   /* ^ascii */    NULL,                 /* ^ascii */
372    (uschar *)"\\H",        /* ^blank */    string_H,             /* ^blank */
373    NULL,                   /* ^cntrl */    NULL,                 /* ^cntrl */
374    (uschar *)"\\P{Nd}",    /* ^digit */    string_PNd,           /* ^digit */
375    NULL,                   /* ^graph */    NULL,                 /* ^graph */
376    NULL,                   /* ^print */    NULL,                 /* ^print */
377    NULL,                   /* ^punct */    NULL,                 /* ^punct */
378    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
379    (uschar *)"\\P{Xwd}",   /* ^word */    string_PXwd,          /* ^word */
380    NULL                    /* ^xdigit */    NULL                  /* ^xdigit */
381  };  };
382  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
383  #endif  #endif
384    
385  #define STRING(a)  # a  #define STRING(a)  # a
# Line 412  static const char error_texts[] = Line 485  static const char error_texts[] =
485    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486    /* 70 */    /* 70 */
487    "internal error: unknown opcode in find_fixedlength()\0"    "internal error: unknown opcode in find_fixedlength()\0"
488      "\\N is not supported in a class\0"
489      "too many forward references\0"
490      "disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff)\0"
491    ;    ;
492    
493  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 430  For convenience, we use the same bit def Line 506  For convenience, we use the same bit def
506    
507  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
508    
509    /* Using a simple comparison for decimal numbers rather than a memory read
510    is much faster, and the resulting code is simpler (the compiler turns it
511    into a subtraction and unsigned comparison). */
512    
513    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
514    
515  #ifndef EBCDIC  #ifndef EBCDIC
516    
517  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
518  UTF-8 mode. */  UTF-8 mode. */
519    
520  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
521    {    {
522    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
523    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 474  static const unsigned char digitab[] = Line 556  static const unsigned char digitab[] =
556    
557  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
558    
559  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
560    {    {
561    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
562    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 509  static const unsigned char digitab[] = Line 591  static const unsigned char digitab[] =
591    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
592    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
593    
594  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
595    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
596    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
597    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 548  static const unsigned char ebcdic_charta Line 630  static const unsigned char ebcdic_charta
630  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
631    
632  static BOOL  static BOOL
633    compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
634      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
635    
636    
# Line 580  return s; Line 662  return s;
662    
663    
664  /*************************************************  /*************************************************
665    *           Expand the workspace                 *
666    *************************************************/
667    
668    /* This function is called during the second compiling phase, if the number of
669    forward references fills the existing workspace, which is originally a block on
670    the stack. A larger block is obtained from malloc() unless the ultimate limit
671    has been reached or the increase will be rather small.
672    
673    Argument: pointer to the compile data block
674    Returns:  0 if all went well, else an error number
675    */
676    
677    static int
678    expand_workspace(compile_data *cd)
679    {
680    pcre_uchar *newspace;
681    int newsize = cd->workspace_size * 2;
682    
683    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
684    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
685        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
686     return ERR72;
687    
688    newspace = (PUBL(malloc))(IN_UCHARS(newsize));
689    if (newspace == NULL) return ERR21;
690    memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
691    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
692    if (cd->workspace_size > COMPILE_WORK_SIZE)
693      (PUBL(free))((void *)cd->start_workspace);
694    cd->start_workspace = newspace;
695    cd->workspace_size = newsize;
696    return 0;
697    }
698    
699    
700    
701    /*************************************************
702  *            Check for counted repeat            *  *            Check for counted repeat            *
703  *************************************************/  *************************************************/
704    
# Line 595  Returns:    TRUE or FALSE Line 714  Returns:    TRUE or FALSE
714  */  */
715    
716  static BOOL  static BOOL
717  is_counted_repeat(const uschar *p)  is_counted_repeat(const pcre_uchar *p)
718  {  {
719  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
720  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
721    while (IS_DIGIT(*p)) p++;
722  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
723    
724  if (*p++ != CHAR_COMMA) return FALSE;  if (*p++ != CHAR_COMMA) return FALSE;
725  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
726    
727  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
728  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
729    while (IS_DIGIT(*p)) p++;
730    
731  return (*p == CHAR_RIGHT_CURLY_BRACKET);  return (*p == CHAR_RIGHT_CURLY_BRACKET);
732  }  }
# Line 637  Returns:         zero or positive => a d Line 758  Returns:         zero or positive => a d
758  */  */
759    
760  static int  static int
761  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
762    int options, BOOL isclass)    int options, BOOL isclass)
763  {  {
764  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
765  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
766  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
767    pcre_int32 c;
768    int i;
769    
770  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
771  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
# Line 656  in a table. A non-zero result is somethi Line 779  in a table. A non-zero result is somethi
779  Otherwise further processing may be required. */  Otherwise further processing may be required. */
780    
781  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
782  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
783    else if (c < CHAR_0 || c > CHAR_z) {}
784  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
785    
786  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
787  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
788    else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
789  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
790  #endif  #endif
791    
# Line 668  else if ((i = escapes[c - 0x48]) != 0) Line 793  else if ((i = escapes[c - 0x48]) != 0)
793    
794  else  else
795    {    {
796    const uschar *oldptr;    const pcre_uchar *oldptr;
797    BOOL braced, negated;    BOOL braced, negated;
798    
799    switch (c)    switch (c)
# Line 686  else Line 811  else
811        {        {
812        /* In JavaScript, \u must be followed by four hexadecimal numbers.        /* In JavaScript, \u must be followed by four hexadecimal numbers.
813        Otherwise it is a lowercase u letter. */        Otherwise it is a lowercase u letter. */
814        if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
815             && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
816            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
817            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
818          {          {
819          c = 0;          c = 0;
820          for (i = 0; i < 4; ++i)          for (i = 0; i < 4; ++i)
# Line 741  else Line 868  else
868    
869      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
870        {        {
871        const uschar *p;        const pcre_uchar *p;
872        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
873          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
874        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
875          {          {
876          c = -ESC_k;          c = -ESC_k;
# Line 761  else Line 888  else
888        }        }
889      else negated = FALSE;      else negated = FALSE;
890    
891        /* The integer range is limited by the machine's int representation. */
892      c = 0;      c = 0;
893      while ((digitab[ptr[1]] & ctype_digit) != 0)      while (IS_DIGIT(ptr[1]))
894          {
895          if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
896            {
897            c = -1;
898            break;
899            }
900        c = c * 10 + *(++ptr) - CHAR_0;        c = c * 10 + *(++ptr) - CHAR_0;
901          }
902      if (c < 0)   /* Integer overflow */      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
903        {        {
904          while (IS_DIGIT(ptr[1]))
905            ptr++;
906        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
907        break;        break;
908        }        }
# Line 814  else Line 950  else
950      if (!isclass)      if (!isclass)
951        {        {
952        oldptr = ptr;        oldptr = ptr;
953          /* The integer range is limited by the machine's int representation. */
954        c -= CHAR_0;        c -= CHAR_0;
955        while ((digitab[ptr[1]] & ctype_digit) != 0)        while (IS_DIGIT(ptr[1]))
956            {
957            if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
958              {
959              c = -1;
960              break;
961              }
962          c = c * 10 + *(++ptr) - CHAR_0;          c = c * 10 + *(++ptr) - CHAR_0;
963        if (c < 0)    /* Integer overflow */          }
964          if (((unsigned int)c) > INT_MAX) /* Integer overflow */
965          {          {
966            while (IS_DIGIT(ptr[1]))
967              ptr++;
968          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
969          break;          break;
970          }          }
# Line 851  else Line 997  else
997      c -= CHAR_0;      c -= CHAR_0;
998      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
999          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
1000      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf && c > 0xff) *errorcodeptr = ERR51;
1001      break;      break;
1002    
1003      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
1004      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1005      treated as a data character. */      If not, { is treated as a data character. */
1006    
1007      case CHAR_x:      case CHAR_x:
1008      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1009        {        {
1010        /* In JavaScript, \x must be followed by two hexadecimal numbers.        /* In JavaScript, \x must be followed by two hexadecimal numbers.
1011        Otherwise it is a lowercase x letter. */        Otherwise it is a lowercase x letter. */
1012        if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1013            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1014          {          {
1015          c = 0;          c = 0;
1016          for (i = 0; i < 2; ++i)          for (i = 0; i < 2; ++i)
# Line 883  else Line 1030  else
1030    
1031      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1032        {        {
1033        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
       int count = 0;  
1034    
1035        c = 0;        c = 0;
1036        while ((digitab[*pt] & ctype_xdigit) != 0)        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1037          {          {
1038          register int cc = *pt++;          register int cc = *pt++;
1039          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
         count++;  
1040    
1041  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1042          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
# Line 900  else Line 1045  else
1045          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1046          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1047  #endif  #endif
1048    
1049    #ifdef COMPILE_PCRE8
1050            if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1051    #else
1052    #ifdef COMPILE_PCRE16
1053            if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1054    #endif
1055    #endif
1056            }
1057    
1058          if (c < 0)
1059            {
1060            while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1061            *errorcodeptr = ERR34;
1062          }          }
1063    
1064        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1065          {          {
1066          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1067          ptr = pt;          ptr = pt;
1068          break;          break;
1069          }          }
# Line 916  else Line 1075  else
1075      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1076    
1077      c = 0;      c = 0;
1078      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1079        {        {
1080        int cc;                                  /* Some compilers don't like */        int cc;                                  /* Some compilers don't like */
1081        cc = *(++ptr);                           /* ++ in initializers */        cc = *(++ptr);                           /* ++ in initializers */
# Line 1014  Returns:         type value from ucp_typ Line 1173  Returns:         type value from ucp_typ
1173  */  */
1174    
1175  static int  static int
1176  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1177  {  {
1178  int c, i, bot, top;  int c, i, bot, top;
1179  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1180  char name[32];  pcre_uchar name[32];
1181    
1182  c = *(++ptr);  c = *(++ptr);
1183  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 1035  if (c == CHAR_LEFT_CURLY_BRACKET) Line 1194  if (c == CHAR_LEFT_CURLY_BRACKET)
1194      *negptr = TRUE;      *negptr = TRUE;
1195      ptr++;      ptr++;
1196      }      }
1197    for (i = 0; i < (int)sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1198      {      {
1199      c = *(++ptr);      c = *(++ptr);
1200      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 1059  else Line 1218  else
1218  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1219    
1220  bot = 0;  bot = 0;
1221  top = _pcre_utt_size;  top = PRIV(utt_size);
1222    
1223  while (bot < top)  while (bot < top)
1224    {    {
1225    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1226    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1227    if (c == 0)    if (c == 0)
1228      {      {
1229      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1230      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1231      }      }
1232    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
1233    }    }
# Line 1106  Returns:         pointer to '}' on succe Line 1265  Returns:         pointer to '}' on succe
1265                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1266  */  */
1267    
1268  static const uschar *  static const pcre_uchar *
1269  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1270  {  {
1271  int min = 0;  int min = 0;
1272  int max = -1;  int max = -1;
# Line 1115  int max = -1; Line 1274  int max = -1;
1274  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1275  an integer overflow. */  an integer overflow. */
1276    
1277  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1278  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1279    {    {
1280    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 1130  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1289  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1289    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1290      {      {
1291      max = 0;      max = 0;
1292      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1293      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1294        {        {
1295        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 1185  Arguments: Line 1344  Arguments:
1344    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1345    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1346    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1347    utf8         TRUE if we are in UTF-8 mode    utf          TRUE if we are in UTF-8 / UTF-16 mode
1348    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1349    
1350  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1351  */  */
1352    
1353  static int  static int
1354  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1355    BOOL xmode, BOOL utf8, int *count)    BOOL xmode, BOOL utf, int *count)
1356  {  {
1357  uschar *ptr = *ptrptr;  pcre_uchar *ptr = *ptrptr;
1358  int start_count = *count;  int start_count = *count;
1359  int hwm_count = start_count;  int hwm_count = start_count;
1360  BOOL dup_parens = FALSE;  BOOL dup_parens = FALSE;
# Line 1262  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1421  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1421          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1422        {        {
1423        int term;        int term;
1424        const uschar *thisname;        const pcre_uchar *thisname;
1425        *count += 1;        *count += 1;
1426        if (name == NULL && *count == lorn) return *count;        if (name == NULL && *count == lorn) return *count;
1427        term = *ptr++;        term = *ptr++;
# Line 1270  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1429  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1429        thisname = ptr;        thisname = ptr;
1430        while (*ptr != term) ptr++;        while (*ptr != term) ptr++;
1431        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == ptr - thisname &&
1432            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            STRNCMP_UC_UC(name, thisname, lorn) == 0)
1433          return *count;          return *count;
1434        term++;        term++;
1435        }        }
# Line 1313  for (; ptr < cd->end_pattern; ptr++) Line 1472  for (; ptr < cd->end_pattern; ptr++)
1472          {          {
1473          if (ptr[2] == CHAR_E)          if (ptr[2] == CHAR_E)
1474            ptr+= 2;            ptr+= 2;
1475          else if (strncmp((const char *)ptr+2,          else if (STRNCMP_UC_C8(ptr + 2,
1476                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1477            ptr += 4;            ptr += 4;
1478          else          else
# Line 1361  for (; ptr < cd->end_pattern; ptr++) Line 1520  for (; ptr < cd->end_pattern; ptr++)
1520        {        {
1521        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1522        ptr++;        ptr++;
1523  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1524        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;        if (utf) FORWARDCHAR(ptr);
1525  #endif  #endif
1526        }        }
1527      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
# Line 1373  for (; ptr < cd->end_pattern; ptr++) Line 1532  for (; ptr < cd->end_pattern; ptr++)
1532    
1533    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1534      {      {
1535      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1536      if (rc > 0) return rc;      if (rc > 0) return rc;
1537      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1538      }      }
# Line 1419  Arguments: Line 1578  Arguments:
1578    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1579    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1580    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1581    utf8         TRUE if we are in UTF-8 mode    utf          TRUE if we are in UTF-8 / UTF-16 mode
1582    
1583  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1584  */  */
1585    
1586  static int  static int
1587  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,  find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1588    BOOL utf8)    BOOL utf)
1589  {  {
1590  uschar *ptr = (uschar *)cd->start_pattern;  pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1591  int count = 0;  int count = 0;
1592  int rc;  int rc;
1593    
# Line 1439  matching closing parens. That is why we Line 1598  matching closing parens. That is why we
1598    
1599  for (;;)  for (;;)
1600    {    {
1601    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1602    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1603    }    }
1604    
# Line 1466  Arguments: Line 1625  Arguments:
1625  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1626  */  */
1627    
1628  static const uschar*  static const pcre_uchar*
1629  first_significant_code(const uschar *code, BOOL skipassert)  first_significant_code(const pcre_uchar *code, BOOL skipassert)
1630  {  {
1631  for (;;)  for (;;)
1632    {    {
# Line 1478  for (;;) Line 1637  for (;;)
1637      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1638      if (!skipassert) return code;      if (!skipassert) return code;
1639      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1640      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1641      break;      break;
1642    
1643      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1492  for (;;) Line 1651  for (;;)
1651      case OP_RREF:      case OP_RREF:
1652      case OP_NRREF:      case OP_NRREF:
1653      case OP_DEF:      case OP_DEF:
1654      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1655      break;      break;
1656    
1657      default:      default:
# Line 1522  and doing the check at the end; a flag s Line 1681  and doing the check at the end; a flag s
1681    
1682  Arguments:  Arguments:
1683    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1684    utf8     TRUE in UTF-8 mode    utf      TRUE in UTF-8 / UTF-16 mode
1685    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1686    cd       the "compile data" structure    cd       the "compile data" structure
1687    
# Line 1534  Returns:   the fixed length, Line 1693  Returns:   the fixed length,
1693  */  */
1694    
1695  static int  static int
1696  find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1697  {  {
1698  int length = -1;  int length = -1;
1699    
1700  register int branchlength = 0;  register int branchlength = 0;
1701  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1702    
1703  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1704  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1547  branch, check the length against that of Line 1706  branch, check the length against that of
1706  for (;;)  for (;;)
1707    {    {
1708    int d;    int d;
1709    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1710    register int op = *cc;    register int op = *cc;
1711    
1712    switch (op)    switch (op)
1713      {      {
1714      /* We only need to continue for OP_CBRA (normal capturing bracket) and      /* We only need to continue for OP_CBRA (normal capturing bracket) and
# Line 1561  for (;;) Line 1721  for (;;)
1721      case OP_ONCE:      case OP_ONCE:
1722      case OP_ONCE_NC:      case OP_ONCE_NC:
1723      case OP_COND:      case OP_COND:
1724      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1725      if (d < 0) return d;      if (d < 0) return d;
1726      branchlength += d;      branchlength += d;
1727      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1592  for (;;) Line 1752  for (;;)
1752    
1753      case OP_RECURSE:      case OP_RECURSE:
1754      if (!atend) return -3;      if (!atend) return -3;
1755      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1756      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1757      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1758      d = find_fixedlength(cs + 2, utf8, atend, cd);      d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1759      if (d < 0) return d;      if (d < 0) return d;
1760      branchlength += d;      branchlength += d;
1761      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1608  for (;;) Line 1768  for (;;)
1768      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1769      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1770      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1771      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1772        break;
1773    
1774      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1775    
# Line 1616  for (;;) Line 1777  for (;;)
1777      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
1778      case OP_SKIP_ARG:      case OP_SKIP_ARG:
1779      case OP_THEN_ARG:      case OP_THEN_ARG:
1780      cc += cc[1] + _pcre_OP_lengths[*cc];      cc += cc[1] + PRIV(OP_lengths)[*cc];
1781      break;      break;
1782    
1783      case OP_CALLOUT:      case OP_CALLOUT:
# Line 1643  for (;;) Line 1804  for (;;)
1804      case OP_SOM:      case OP_SOM:
1805      case OP_THEN:      case OP_THEN:
1806      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1807      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1808      break;      break;
1809    
1810      /* Handle literal characters */      /* Handle literal characters */
# Line 1654  for (;;) Line 1815  for (;;)
1815      case OP_NOTI:      case OP_NOTI:
1816      branchlength++;      branchlength++;
1817      cc += 2;      cc += 2;
1818  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1819      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1820  #endif  #endif
1821      break;      break;
1822    
# Line 1667  for (;;) Line 1828  for (;;)
1828      case OP_NOTEXACT:      case OP_NOTEXACT:
1829      case OP_NOTEXACTI:      case OP_NOTEXACTI:
1830      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1831      cc += 4;      cc += 2 + IMM2_SIZE;
1832  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1833      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1834  #endif  #endif
1835      break;      break;
1836    
1837      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1838      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1839      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1840      cc += 4;      cc += 1 + IMM2_SIZE + 1;
1841      break;      break;
1842    
1843      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1702  for (;;) Line 1863  for (;;)
1863      cc++;      cc++;
1864      break;      break;
1865    
1866      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1867      otherwise \C is coded as OP_ALLANY. */      otherwise \C is coded as OP_ALLANY. */
1868    
1869      case OP_ANYBYTE:      case OP_ANYBYTE:
# Line 1710  for (;;) Line 1871  for (;;)
1871    
1872      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1873    
1874  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1875      case OP_XCLASS:      case OP_XCLASS:
1876      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1877      /* Fall through */      /* Fall through */
1878  #endif  #endif
1879    
1880      case OP_CLASS:      case OP_CLASS:
1881      case OP_NCLASS:      case OP_NCLASS:
1882      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1883    
1884      switch (*cc)      switch (*cc)
1885        {        {
# Line 1732  for (;;) Line 1893  for (;;)
1893    
1894        case OP_CRRANGE:        case OP_CRRANGE:
1895        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1896        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1897        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
1898        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1899        break;        break;
1900    
1901        default:        default:
# Line 1849  length. Line 2010  length.
2010    
2011  Arguments:  Arguments:
2012    code        points to start of expression    code        points to start of expression
2013    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2014    number      the required bracket number or negative to find a lookbehind    number      the required bracket number or negative to find a lookbehind
2015    
2016  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2017  */  */
2018    
2019  const uschar *  const pcre_uchar *
2020  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2021  {  {
2022  for (;;)  for (;;)
2023    {    {
# Line 1874  for (;;) Line 2035  for (;;)
2035    
2036    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
2037      {      {
2038      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
2039      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2040      }      }
2041    
2042    /* Handle capturing bracket */    /* Handle capturing bracket */
# Line 1884  for (;;) Line 2045  for (;;)
2045             c == OP_CBRAPOS || c == OP_SCBRAPOS)             c == OP_CBRAPOS || c == OP_SCBRAPOS)
2046      {      {
2047      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
2048      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2049      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2050      }      }
2051    
2052    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
# Line 1913  for (;;) Line 2074  for (;;)
2074        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2075        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2076        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2077        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2078            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2079        break;        break;
2080    
2081        case OP_MARK:        case OP_MARK:
# Line 1929  for (;;) Line 2091  for (;;)
2091    
2092      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2093    
2094      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2095    
2096    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2097    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2098    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2099    
2100  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2101      if (utf8) switch(c)      if (utf) switch(c)
2102        {        {
2103        case OP_CHAR:        case OP_CHAR:
2104        case OP_CHARI:        case OP_CHARI:
# Line 1966  for (;;) Line 2128  for (;;)
2128        case OP_MINQUERYI:        case OP_MINQUERYI:
2129        case OP_POSQUERY:        case OP_POSQUERY:
2130        case OP_POSQUERYI:        case OP_POSQUERYI:
2131        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2132        break;        break;
2133        }        }
2134  #else  #else
2135      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2136  #endif  #endif
2137      }      }
2138    }    }
# Line 1987  instance of OP_RECURSE. Line 2149  instance of OP_RECURSE.
2149    
2150  Arguments:  Arguments:
2151    code        points to start of expression    code        points to start of expression
2152    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2153    
2154  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2155  */  */
2156    
2157  static const uschar *  static const pcre_uchar *
2158  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2159  {  {
2160  for (;;)  for (;;)
2161    {    {
# Line 2032  for (;;) Line 2194  for (;;)
2194        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2195        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2196        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2197        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2198            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2199        break;        break;
2200    
2201        case OP_MARK:        case OP_MARK:
# Line 2048  for (;;) Line 2211  for (;;)
2211    
2212      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2213    
2214      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2215    
2216      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2217      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
2218      to arrange to skip the extra bytes. */      to arrange to skip the extra bytes. */
2219    
2220  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2221      if (utf8) switch(c)      if (utf) switch(c)
2222        {        {
2223        case OP_CHAR:        case OP_CHAR:
2224        case OP_CHARI:        case OP_CHARI:
# Line 2085  for (;;) Line 2248  for (;;)
2248        case OP_MINQUERYI:        case OP_MINQUERYI:
2249        case OP_POSQUERY:        case OP_POSQUERY:
2250        case OP_POSQUERYI:        case OP_POSQUERYI:
2251        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2252        break;        break;
2253        }        }
2254  #else  #else
2255      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2256  #endif  #endif
2257      }      }
2258    }    }
# Line 2112  bracket whose current branch will alread Line 2275  bracket whose current branch will alread
2275  Arguments:  Arguments:
2276    code        points to start of search    code        points to start of search
2277    endcode     points to where to stop    endcode     points to where to stop
2278    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2279    cd          contains pointers to tables etc.    cd          contains pointers to tables etc.
2280    
2281  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2282  */  */
2283    
2284  static BOOL  static BOOL
2285  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2286    compile_data *cd)    BOOL utf, compile_data *cd)
2287  {  {
2288  register int c;  register int c;
2289  for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2290       code < endcode;       code < endcode;
2291       code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2292    {    {
2293    const uschar *ccode;    const pcre_uchar *ccode;
2294    
2295    c = *code;    c = *code;
2296    
# Line 2150  for (code = first_significant_code(code Line 2313  for (code = first_significant_code(code
2313    
2314    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2315      {      {
2316      const uschar *scode;      const pcre_uchar *scode;
2317      BOOL empty_branch;      BOOL empty_branch;
2318    
2319      /* Test for forward reference */      /* Test for forward reference */
# Line 2168  for (code = first_significant_code(code Line 2331  for (code = first_significant_code(code
2331    
2332      do      do
2333        {        {
2334        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf, cd))
2335          {          {
2336          empty_branch = TRUE;          empty_branch = TRUE;
2337          break;          break;
# Line 2186  for (code = first_significant_code(code Line 2349  for (code = first_significant_code(code
2349    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2350        c == OP_BRAPOSZERO)        c == OP_BRAPOSZERO)
2351      {      {
2352      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2353      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
2354      c = *code;      c = *code;
2355      continue;      continue;
# Line 2224  for (code = first_significant_code(code Line 2387  for (code = first_significant_code(code
2387        empty_branch = FALSE;        empty_branch = FALSE;
2388        do        do
2389          {          {
2390          if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2391            empty_branch = TRUE;            empty_branch = TRUE;
2392          code += GET(code, 1);          code += GET(code, 1);
2393          }          }
# Line 2242  for (code = first_significant_code(code Line 2405  for (code = first_significant_code(code
2405      {      {
2406      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2407      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2408      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2409      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2410      here. */      here. */
2411    
2412  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2413      case OP_XCLASS:      case OP_XCLASS:
2414      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2415      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 2254  for (code = first_significant_code(code Line 2417  for (code = first_significant_code(code
2417    
2418      case OP_CLASS:      case OP_CLASS:
2419      case OP_NCLASS:      case OP_NCLASS:
2420      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2421    
2422  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2423      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2424  #endif  #endif
2425    
# Line 2329  for (code = first_significant_code(code Line 2492  for (code = first_significant_code(code
2492      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2493      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2494      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2495      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP
2496          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2497      break;      break;
2498    
2499      /* End of branch */      /* End of branch */
# Line 2344  for (code = first_significant_code(code Line 2508  for (code = first_significant_code(code
2508      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2509      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
2510    
2511  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2512      case OP_STAR:      case OP_STAR:
2513      case OP_STARI:      case OP_STARI:
2514      case OP_MINSTAR:      case OP_MINSTAR:
# Line 2357  for (code = first_significant_code(code Line 2521  for (code = first_significant_code(code
2521      case OP_MINQUERYI:      case OP_MINQUERYI:
2522      case OP_POSQUERY:      case OP_POSQUERY:
2523      case OP_POSQUERYI:      case OP_POSQUERYI:
2524      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2525      break;      break;
2526    
2527      case OP_UPTO:      case OP_UPTO:
# Line 2366  for (code = first_significant_code(code Line 2530  for (code = first_significant_code(code
2530      case OP_MINUPTOI:      case OP_MINUPTOI:
2531      case OP_POSUPTO:      case OP_POSUPTO:
2532      case OP_POSUPTOI:      case OP_POSUPTOI:
2533      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2534      break;      break;
2535  #endif  #endif
2536    
# Line 2410  Arguments: Line 2574  Arguments:
2574    code        points to start of the recursion    code        points to start of the recursion
2575    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2576    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2577    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2578    cd          pointers to tables etc    cd          pointers to tables etc
2579    
2580  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2581  */  */
2582    
2583  static BOOL  static BOOL
2584  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2585    BOOL utf8, compile_data *cd)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2586  {  {
2587  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2588    {    {
2589    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2590      return FALSE;      return FALSE;
2591    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2592    }    }
# Line 2474  Returns:   TRUE or FALSE Line 2638  Returns:   TRUE or FALSE
2638  */  */
2639    
2640  static BOOL  static BOOL
2641  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2642  {  {
2643  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2644  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
# Line 2518  Returns:     a value representing the na Line 2682  Returns:     a value representing the na
2682  */  */
2683    
2684  static int  static int
2685  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2686  {  {
2687  const char *pn = posix_names;  const char *pn = posix_names;
2688  register int yield = 0;  register int yield = 0;
2689  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2690    {    {
2691    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2692      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2693    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2694    yield++;    yield++;
2695    }    }
# Line 2557  value in the reference (which is a group Line 2721  value in the reference (which is a group
2721  Arguments:  Arguments:
2722    group      points to the start of the group    group      points to the start of the group
2723    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
2724    utf8       TRUE in UTF-8 mode    utf        TRUE in UTF-8 / UTF-16 mode
2725    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
2726    save_hwm   the hwm forward reference pointer at the start of the group    save_hwm   the hwm forward reference pointer at the start of the group
2727    
# Line 2565  Returns:     nothing Line 2729  Returns:     nothing
2729  */  */
2730    
2731  static void  static void
2732  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2733    uschar *save_hwm)    pcre_uchar *save_hwm)
2734  {  {
2735  uschar *ptr = group;  pcre_uchar *ptr = group;
2736    
2737  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2738    {    {
2739    int offset;    int offset;
2740    uschar *hc;    pcre_uchar *hc;
2741    
2742    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2743    reference. */    reference. */
# Line 2618  Arguments: Line 2782  Arguments:
2782  Returns:         new code pointer  Returns:         new code pointer
2783  */  */
2784    
2785  static uschar *  static pcre_uchar *
2786  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2787  {  {
2788  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2789  *code++ = 255;  *code++ = 255;
2790  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2791  PUT(code, LINK_SIZE, 0);                       /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2792  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2793  }  }
2794    
2795    
# Line 2647  Returns:             nothing Line 2811  Returns:             nothing
2811  */  */
2812    
2813  static void  static void
2814  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2815  {  {
2816  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2817  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
# Line 2730  switch(ptype) Line 2894  switch(ptype)
2894            prop->chartype == ucp_Lt) == negated;            prop->chartype == ucp_Lt) == negated;
2895    
2896    case PT_GC:    case PT_GC:
2897    return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2898    
2899    case PT_PC:    case PT_PC:
2900    return (pdata == prop->chartype) == negated;    return (pdata == prop->chartype) == negated;
# Line 2741  switch(ptype) Line 2905  switch(ptype)
2905    /* These are specials */    /* These are specials */
2906    
2907    case PT_ALNUM:    case PT_ALNUM:
2908    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2909            _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2910    
2911    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2912    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2913            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2914            == negated;            == negated;
2915    
2916    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2917    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2918            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2919            c == CHAR_FF || c == CHAR_CR)            c == CHAR_FF || c == CHAR_CR)
2920            == negated;            == negated;
2921    
2922    case PT_WORD:    case PT_WORD:
2923    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2924            _pcre_ucp_gentype[prop->chartype] == ucp_N ||            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2925            c == CHAR_UNDERSCORE) == negated;            c == CHAR_UNDERSCORE) == negated;
2926    }    }
2927  return FALSE;  return FALSE;
# Line 2776  sense to automatically possessify the re Line 2940  sense to automatically possessify the re
2940    
2941  Arguments:  Arguments:
2942    previous      pointer to the repeated opcode    previous      pointer to the repeated opcode
2943    utf8          TRUE in UTF-8 mode    utf           TRUE in UTF-8 / UTF-16 mode
2944    ptr           next character in pattern    ptr           next character in pattern
2945    options       options bits    options       options bits
2946    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 2785  Returns:        TRUE if possessifying is Line 2949  Returns:        TRUE if possessifying is
2949  */  */
2950    
2951  static BOOL  static BOOL
2952  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2953    int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2954  {  {
2955  int c, next;  pcre_int32 c, next;
2956  int op_code = *previous++;  int op_code = *previous++;
2957    
2958  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2797  if ((options & PCRE_EXTENDED) != 0) Line 2961  if ((options & PCRE_EXTENDED) != 0)
2961    {    {
2962    for (;;)    for (;;)
2963      {      {
2964      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2965      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2966        {        {
2967        ptr++;        ptr++;
# Line 2805  if ((options & PCRE_EXTENDED) != 0) Line 2969  if ((options & PCRE_EXTENDED) != 0)
2969          {          {
2970          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2971          ptr++;          ptr++;
2972  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2973          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
2974  #endif  #endif
2975          }          }
2976        }        }
# Line 2824  if (*ptr == CHAR_BACKSLASH) Line 2988  if (*ptr == CHAR_BACKSLASH)
2988    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
2989    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
2990    }    }
2991    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
2992    {    {
2993  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2994    if (utf8) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
2995  #endif  #endif
2996    next = *ptr++;    next = *ptr++;
2997    }    }
   
2998  else return FALSE;  else return FALSE;
2999    
3000  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2841  if ((options & PCRE_EXTENDED) != 0) Line 3003  if ((options & PCRE_EXTENDED) != 0)
3003    {    {
3004    for (;;)    for (;;)
3005      {      {
3006      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3007      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3008        {        {
3009        ptr++;        ptr++;
# Line 2849  if ((options & PCRE_EXTENDED) != 0) Line 3011  if ((options & PCRE_EXTENDED) != 0)
3011          {          {
3012          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3013          ptr++;          ptr++;
3014  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3015          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3016  #endif  #endif
3017          }          }
3018        }        }
# Line 2861  if ((options & PCRE_EXTENDED) != 0) Line 3023  if ((options & PCRE_EXTENDED) != 0)
3023  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
3024    
3025  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3026    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3027      return FALSE;      return FALSE;
3028    
3029  /* Now compare the next item with the previous opcode. First, handle cases when  /* Now compare the next item with the previous opcode. First, handle cases when
# Line 2870  the next item is a character. */ Line 3032  the next item is a character. */
3032  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3033    {    {
3034    case OP_CHAR:    case OP_CHAR:
3035  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3036    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3037  #else  #else
3038    c = *previous;    c = *previous;
# Line 2882  if (next >= 0) switch(op_code) Line 3044  if (next >= 0) switch(op_code)
3044    high-valued characters. */    high-valued characters. */
3045    
3046    case OP_CHARI:    case OP_CHARI:
3047  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3048    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3049  #else  #else
3050    c = *previous;    c = *previous;
3051  #endif  #endif
3052    if (c == next) return FALSE;    if (c == next) return FALSE;
3053  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3054    if (utf8)    if (utf)
3055      {      {
3056      unsigned int othercase;      unsigned int othercase;
3057      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2901  if (next >= 0) switch(op_code) Line 3063  if (next >= 0) switch(op_code)
3063      return (unsigned int)c != othercase;      return (unsigned int)c != othercase;
3064      }      }
3065    else    else
3066  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3067    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */
3068    
3069    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
3070    opcodes are not used for multi-byte characters, because they are coded using    opcodes are not used for multi-byte characters, because they are coded using
# Line 2913  if (next >= 0) switch(op_code) Line 3075  if (next >= 0) switch(op_code)
3075    
3076    case OP_NOTI:    case OP_NOTI:
3077    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
3078  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3079    if (utf8)    if (utf)
3080      {      {
3081      unsigned int othercase;      unsigned int othercase;
3082      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2926  if (next >= 0) switch(op_code) Line 3088  if (next >= 0) switch(op_code)
3088      return (unsigned int)c == othercase;      return (unsigned int)c == othercase;
3089      }      }
3090    else    else
3091  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3092    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */
3093    
3094    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3095    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
# Line 3018  switch(op_code) Line 3180  switch(op_code)
3180    {    {
3181    case OP_CHAR:    case OP_CHAR:
3182    case OP_CHARI:    case OP_CHARI:
3183  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3184    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3185  #else  #else
3186    c = *previous;    c = *previous;
# Line 3123  switch(op_code) Line 3285  switch(op_code)
3285        to the original \d etc. At this point, ptr will point to a zero byte. */        to the original \d etc. At this point, ptr will point to a zero byte. */
3286    
3287        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3288          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3289            return FALSE;            return FALSE;
3290    
3291        /* Do the property check. */        /* Do the property check. */
# Line 3201  Arguments: Line 3363  Arguments:
3363    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3364    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3365    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3366    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3367    reqbyteptr     set to the last literal character required, else < 0    reqcharptr     set to the last literal character required, else < 0
3368    bcptr          points to current branch chain    bcptr          points to current branch chain
3369    cond_depth     conditional nesting depth    cond_depth     conditional nesting depth
3370    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
# Line 3214  Returns:         TRUE on success Line 3376  Returns:         TRUE on success
3376  */  */
3377    
3378  static BOOL  static BOOL
3379  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3380    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3381    int cond_depth, compile_data *cd, int *lengthptr)    pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3382      compile_data *cd, int *lengthptr)
3383  {  {
3384  int repeat_type, op_type;  int repeat_type, op_type;
3385  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3386  int bravalue = 0;  int bravalue = 0;
3387  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3388  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
3389  int zeroreqbyte, zerofirstbyte;  pcre_int32 zeroreqchar, zerofirstchar;
3390  int req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
3391  int options = *optionsptr;               /* May change dynamically */  int options = *optionsptr;               /* May change dynamically */
3392  int after_manual_callout = 0;  int after_manual_callout = 0;
3393  int length_prevgroup = 0;  int length_prevgroup = 0;
3394  register int c;  register int c;
3395  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3396  uschar *last_code = code;  pcre_uchar *last_code = code;
3397  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3398  uschar *tempcode;  pcre_uchar *tempcode;
3399  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3400  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3401  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3402  const uschar *tempptr;  const pcre_uchar *tempptr;
3403  const uschar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
3404  uschar *previous = NULL;  pcre_uchar *previous = NULL;
3405  uschar *previous_callout = NULL;  pcre_uchar *previous_callout = NULL;
3406  uschar *save_hwm = NULL;  pcre_uchar *save_hwm = NULL;
3407  uschar classbits[32];  pcre_uint8 classbits[32];
3408    
3409  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3410  must not do this for other options (e.g. PCRE_EXTENDED) because they may change  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3411  dynamically as we process the pattern. */  dynamically as we process the pattern. */
3412    
3413  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3414  BOOL class_utf8;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3415  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
3416  uschar *class_utf8data;  pcre_uchar utf_chars[6];
 uschar *class_utf8data_base;  
 uschar utf8_char[6];  
3417  #else  #else
3418  BOOL utf8 = FALSE;  BOOL utf = FALSE;
3419    #endif
3420    
3421    /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3422    
3423    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3424    BOOL xclass;
3425    pcre_uchar *class_uchardata;
3426    pcre_uchar *class_uchardata_base;
3427  #endif  #endif
3428    
3429  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 3268  greedy_non_default = greedy_default ^ 1; Line 3437  greedy_non_default = greedy_default ^ 1;
3437    
3438  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3439  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3440  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3441  find one.  find one.
3442    
3443  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3444  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3445  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3446  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3447    
3448  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3449    
3450  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3451  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3452  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS leaves the lower 28 bits empty. It is added into the
3453  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3454    value. This is used only for ASCII characters. */
3455    
3456  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3457    
3458  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3459    
# Line 3295  for (;; ptr++) Line 3465  for (;; ptr++)
3465    BOOL is_quantifier;    BOOL is_quantifier;
3466    BOOL is_recurse;    BOOL is_recurse;
3467    BOOL reset_bracount;    BOOL reset_bracount;
3468    int class_charcount;    int class_has_8bitchar;
3469    int class_lastchar;    int class_single_char;
3470    int newoptions;    int newoptions;
3471    int recno;    int recno;
3472    int refsign;    int refsign;
3473    int skipbytes;    int skipbytes;
3474    int subreqbyte;    int subreqchar;
3475    int subfirstbyte;    int subfirstchar;
3476    int terminator;    int terminator;
3477    int mclength;    int mclength;
3478    int tempbracount;    int tempbracount;
3479    uschar mcbuffer[8];    pcre_uchar mcbuffer[8];
3480    
3481    /* Get next byte in the pattern */    /* Get next character in the pattern */
3482    
3483    c = *ptr;    c = *ptr;
3484    
# Line 3330  for (;; ptr++) Line 3500  for (;; ptr++)
3500  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3501      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3502  #endif  #endif
3503      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3504            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3505        {        {
3506        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3507        goto FAILED;        goto FAILED;
# Line 3353  for (;; ptr++) Line 3524  for (;; ptr++)
3524        }        }
3525    
3526      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3527      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3528        c));        (int)(code - last_code), c, c));
3529    
3530      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3531      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
3532      if "previous" is NULL, reset the current code pointer to the start. */      if "previous" is NULL, reset the current code pointer to the start. */
# Line 3364  for (;; ptr++) Line 3535  for (;; ptr++)
3535        {        {
3536        if (previous > orig_code)        if (previous > orig_code)
3537          {          {
3538          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3539          code -= previous - orig_code;          code -= previous - orig_code;
3540          previous = orig_code;          previous = orig_code;
3541          }          }
# Line 3380  for (;; ptr++) Line 3551  for (;; ptr++)
3551    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3552    reference list. */    reference list. */
3553    
3554    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3555               WORK_SIZE_SAFETY_MARGIN)
3556      {      {
3557      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3558      goto FAILED;      goto FAILED;
# Line 3432  for (;; ptr++) Line 3604  for (;; ptr++)
3604    
3605    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3606      {      {
3607      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3608      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3609        {        {
3610        ptr++;        ptr++;
# Line 3440  for (;; ptr++) Line 3612  for (;; ptr++)
3612          {          {
3613          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3614          ptr++;          ptr++;
3615  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3616          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3617  #endif  #endif
3618          }          }
3619        if (*ptr != 0) continue;        if (*ptr != 0) continue;
# Line 3465  for (;; ptr++) Line 3637  for (;; ptr++)
3637      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3638      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3639      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
3640      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3641      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
3642      *codeptr = code;      *codeptr = code;
3643      *ptrptr = ptr;      *ptrptr = ptr;
3644      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 3490  for (;; ptr++) Line 3662  for (;; ptr++)
3662      previous = NULL;      previous = NULL;
3663      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3664        {        {
3665        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3666        *code++ = OP_CIRCM;        *code++ = OP_CIRCM;
3667        }        }
3668      else *code++ = OP_CIRC;      else *code++ = OP_CIRC;
# Line 3502  for (;; ptr++) Line 3674  for (;; ptr++)
3674      break;      break;
3675    
3676      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3677      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3678    
3679      case CHAR_DOT:      case CHAR_DOT:
3680      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3681      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
3682      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
3683      previous = code;      previous = code;
3684      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3685      break;      break;
# Line 3562  for (;; ptr++) Line 3734  for (;; ptr++)
3734          {          {
3735          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
3736            ptr++;            ptr++;
3737          else if (strncmp((const char *)ptr+1,          else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
                           STR_Q STR_BACKSLASH STR_E, 3) == 0)  
3738            ptr += 3;            ptr += 3;
3739          else          else
3740            break;            break;
# Line 3582  for (;; ptr++) Line 3753  for (;; ptr++)
3753          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3754        {        {
3755        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3756        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3757        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
3758        break;        break;
3759        }        }
3760    
# Line 3593  for (;; ptr++) Line 3764  for (;; ptr++)
3764    
3765      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3766    
3767      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3768      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3769      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3770        a single character. */
3771    
3772      class_charcount = 0;      class_has_8bitchar = 0;
3773      class_lastchar = -1;      class_single_char = 0;
3774    
3775      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3776      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
3777      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3778      */      */
3779    
3780      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3781    
3782  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3783      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;                           /* No chars >= 256 */
3784      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3785      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3786  #endif  #endif
3787    
3788      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3619  for (;; ptr++) Line 3791  for (;; ptr++)
3791    
3792      if (c != 0) do      if (c != 0) do
3793        {        {
3794        const uschar *oldptr;        const pcre_uchar *oldptr;
3795    
3796  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3797        if (utf8 && c > 127)        if (utf && HAS_EXTRALEN(c))
3798          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3799          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3800          }          }
3801    #endif
3802    
3803        /* In the pre-compile phase, accumulate the length of any UTF-8 extra  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3804          /* In the pre-compile phase, accumulate the length of any extra
3805        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
3806        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
3807        (which is on the stack). */        (which is on the stack). */
3808    
3809        if (lengthptr != NULL)        if (lengthptr != NULL)
3810          {          {
3811          *lengthptr += class_utf8data - class_utf8data_base;          *lengthptr += class_uchardata - class_uchardata_base;
3812          class_utf8data = class_utf8data_base;          class_uchardata = class_uchardata_base;
3813          }          }
   
3814  #endif  #endif
3815    
3816        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 3665  for (;; ptr++) Line 3838  for (;; ptr++)
3838          {          {
3839          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3840          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3841          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3842          uschar pbits[32];          pcre_uint8 pbits[32];
3843    
3844          if (ptr[1] != CHAR_COLON)          if (ptr[1] != CHAR_COLON)
3845            {            {
# Line 3721  for (;; ptr++) Line 3894  for (;; ptr++)
3894          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3895    
3896          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3897            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3898    
3899          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3900    
# Line 3752  for (;; ptr++) Line 3925  for (;; ptr++)
3925            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3926    
3927          ptr = tempptr + 1;          ptr = tempptr + 1;
3928          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3929            class_has_8bitchar = 1;
3930            /* Every class contains at least two characters. */
3931            class_single_char = 2;
3932          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3933          }          }
3934    
3935        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3936        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3937        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3938        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3939        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3940        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3941          as literal characters (by default), or are faulted if
3942        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3943    
3944        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3770  for (;; ptr++) Line 3947  for (;; ptr++)
3947          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3948    
3949          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3950            else if (-c == ESC_N)            /* \N is not supported in a class */
3951              {
3952              *errorcodeptr = ERR71;
3953              goto FAILED;
3954              }
3955          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3956            {            {
3957            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3783  for (;; ptr++) Line 3965  for (;; ptr++)
3965    
3966          if (c < 0)          if (c < 0)
3967            {            {
3968            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3969            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
3970              class_has_8bitchar++;
3971              /* Every class contains at least two characters. */
3972              class_single_char += 2;
3973    
3974            switch (-c)            switch (-c)
3975              {              {
# Line 3797  for (;; ptr++) Line 3982  for (;; ptr++)
3982              case ESC_SU:              case ESC_SU:
3983              nestptr = ptr;              nestptr = ptr;
3984              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3985              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
3986              continue;              continue;
3987  #endif  #endif
3988              case ESC_d:              case ESC_d:
# Line 3838  for (;; ptr++) Line 4023  for (;; ptr++)
4023              SETBIT(classbits, 0x09); /* VT */              SETBIT(classbits, 0x09); /* VT */
4024              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
4025              SETBIT(classbits, 0xa0); /* NSBP */              SETBIT(classbits, 0xa0); /* NSBP */
4026  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4027              if (utf8)              xclass = TRUE;
4028                *class_uchardata++ = XCL_SINGLE;
4029                *class_uchardata++ = 0x1680;
4030                *class_uchardata++ = XCL_SINGLE;
4031                *class_uchardata++ = 0x180e;
4032                *class_uchardata++ = XCL_RANGE;
4033                *class_uchardata++ = 0x2000;
4034                *class_uchardata++ = 0x200a;
4035                *class_uchardata++ = XCL_SINGLE;
4036                *class_uchardata++ = 0x202f;
4037                *class_uchardata++ = XCL_SINGLE;
4038                *class_uchardata++ = 0x205f;
4039                *class_uchardata++ = XCL_SINGLE;
4040                *class_uchardata++ = 0x3000;
4041    #elif defined SUPPORT_UTF
4042                if (utf)
4043                {                {
4044                class_utf8 = TRUE;                xclass = TRUE;
4045                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4046                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4047                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4048                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4049                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4050                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4051                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4052                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4053                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4054                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4055                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4056                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4057                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4058                }                }
4059  #endif  #endif
4060              continue;              continue;
# Line 3872  for (;; ptr++) Line 4072  for (;; ptr++)
4072                  }                  }
4073                classbits[c] |= x;                classbits[c] |= x;
4074                }                }
4075    #ifndef COMPILE_PCRE8
4076  #ifdef SUPPORT_UTF8              xclass = TRUE;
4077              if (utf8)              *class_uchardata++ = XCL_RANGE;
4078                *class_uchardata++ = 0x0100;
4079                *class_uchardata++ = 0x167f;
4080                *class_uchardata++ = XCL_RANGE;
4081                *class_uchardata++ = 0x1681;
4082                *class_uchardata++ = 0x180d;
4083                *class_uchardata++ = XCL_RANGE;
4084                *class_uchardata++ = 0x180f;
4085                *class_uchardata++ = 0x1fff;
4086                *class_uchardata++ = XCL_RANGE;
4087                *class_uchardata++ = 0x200b;
4088                *class_uchardata++ = 0x202e;
4089                *class_uchardata++ = XCL_RANGE;
4090                *class_uchardata++ = 0x2030;
4091                *class_uchardata++ = 0x205e;
4092                *class_uchardata++ = XCL_RANGE;
4093                *class_uchardata++ = 0x2060;
4094                *class_uchardata++ = 0x2fff;
4095                *class_uchardata++ = XCL_RANGE;
4096                *class_uchardata++ = 0x3001;
4097    #ifdef SUPPORT_UTF
4098                if (utf)
4099                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4100                else
4101    #endif
4102                  *class_uchardata++ = 0xffff;
4103    #elif defined SUPPORT_UTF
4104                if (utf)
4105                {                {
4106                class_utf8 = TRUE;                xclass = TRUE;
4107                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4108                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4109                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4110                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4111                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4112                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4113                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4114                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4115                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4116                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4117                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4118                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4119                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4120                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4121                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4122                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4123                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4124                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4125                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4126                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4127                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4128                }                }
4129  #endif  #endif
4130              continue;              continue;
# Line 3908  for (;; ptr++) Line 4135  for (;; ptr++)
4135              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, 0x0c); /* FF */
4136              SETBIT(classbits, 0x0d); /* CR */              SETBIT(classbits, 0x0d); /* CR */
4137              SETBIT(classbits, 0x85); /* NEL */              SETBIT(classbits, 0x85); /* NEL */
4138  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4139              if (utf8)              xclass = TRUE;
4140                *class_uchardata++ = XCL_RANGE;
4141                *class_uchardata++ = 0x2028;
4142                *class_uchardata++ = 0x2029;
4143    #elif defined SUPPORT_UTF
4144                if (utf)
4145                {                {
4146                class_utf8 = TRUE;                xclass = TRUE;
4147                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4148                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4149                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4150                }                }
4151  #endif  #endif
4152              continue;              continue;
# Line 3936  for (;; ptr++) Line 4168  for (;; ptr++)
4168                classbits[c] |= x;                classbits[c] |= x;
4169                }                }
4170    
4171  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4172              if (utf8)              xclass = TRUE;
4173                *class_uchardata++ = XCL_RANGE;
4174                *class_uchardata++ = 0x0100;
4175                *class_uchardata++ = 0x2027;
4176                *class_uchardata++ = XCL_RANGE;
4177                *class_uchardata++ = 0x202a;
4178    #ifdef SUPPORT_UTF
4179                if (utf)
4180                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4181                else
4182    #endif
4183                  *class_uchardata++ = 0xffff;
4184    #elif defined SUPPORT_UTF
4185                if (utf)
4186                {                {
4187                class_utf8 = TRUE;                xclass = TRUE;
4188                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4189                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4190                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4191                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4192                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4193                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4194                }                }
4195  #endif  #endif
4196              continue;              continue;
# Line 3958  for (;; ptr++) Line 4203  for (;; ptr++)
4203                int pdata;                int pdata;
4204                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4205                if (ptype < 0) goto FAILED;                if (ptype < 0) goto FAILED;
4206                class_utf8 = TRUE;                xclass = TRUE;
4207                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_uchardata++ = ((-c == ESC_p) != negated)?
4208                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4209                *class_utf8data++ = ptype;                *class_uchardata++ = ptype;
4210                *class_utf8data++ = pdata;                *class_uchardata++ = pdata;
4211                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4212                continue;                continue;
4213                }                }
4214  #endif  #endif
# Line 3977  for (;; ptr++) Line 4222  for (;; ptr++)
4222                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4223                goto FAILED;                goto FAILED;
4224                }                }
4225              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4226              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4227                c = *ptr;                /* Get the final character and fall through */
4228              break;              break;
4229              }              }
4230            }            }
4231    
4232          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4233          greater than 256 in UTF-8 mode. */          greater than 256. */
4234    
4235          }   /* End of backslash handling */          }   /* End of backslash handling */
4236    
# Line 4032  for (;; ptr++) Line 4278  for (;; ptr++)
4278            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4279            }            }
4280    
4281  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4282          if (utf8)          if (utf)
4283            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4284            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4285            }            }
# Line 4077  for (;; ptr++) Line 4323  for (;; ptr++)
4323    
4324          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4325    
4326            /* Since we found a character range, single character optimizations
4327            cannot be done anymore. */
4328            class_single_char = 2;
4329    
4330          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4331          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4332          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4333          available. */          available. */
4334    
4335  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4336          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4337    #elif defined  SUPPORT_UTF
4338            if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4339    #elif !(defined COMPILE_PCRE8)
4340            if (d > 255)
4341    #endif
4342    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4343            {            {
4344            class_utf8 = TRUE;            xclass = TRUE;
4345    
4346            /* With UCP support, we can find the other case equivalents of            /* With UCP support, we can find the other case equivalents of
4347            the relevant characters. There may be several ranges. Optimize how            the relevant characters. There may be several ranges. Optimize how
4348            they fit with the basic range. */            they fit with the basic range. */
4349    
4350  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4351    #ifndef COMPILE_PCRE8
4352              if (utf && (options & PCRE_CASELESS) != 0)
4353    #else
4354            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4355    #endif
4356              {              {
4357              unsigned int occ, ocd;              unsigned int occ, ocd;
4358              unsigned int cc = c;              unsigned int cc = c;
# Line 4118  for (;; ptr++) Line 4378  for (;; ptr++)
4378    
4379                if (occ == ocd)                if (occ == ocd)
4380                  {                  {
4381                  *class_utf8data++ = XCL_SINGLE;                  *class_uchardata++ = XCL_SINGLE;
4382                  }                  }
4383                else                else
4384                  {                  {
4385                  *class_utf8data++ = XCL_RANGE;                  *class_uchardata++ = XCL_RANGE;
4386                  class_utf8data += _pcre_ord2utf8(occ, class_utf8data);                  class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4387                  }                  }
4388                class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);                class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4389                }                }
4390              }              }
4391  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 4133  for (;; ptr++) Line 4393  for (;; ptr++)
4393            /* Now record the original range, possibly modified for UCP caseless            /* Now record the original range, possibly modified for UCP caseless
4394            overlapping ranges. */            overlapping ranges. */
4395    
4396            *class_utf8data++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4397            class_utf8data += _pcre_ord2utf8(c, class_utf8data);  #ifdef SUPPORT_UTF
4398            class_utf8data += _pcre_ord2utf8(d, class_utf8data);  #ifndef COMPILE_PCRE8
4399              if (utf)
4400                {
4401                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4402                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4403                }
4404              else
4405                {
4406                *class_uchardata++ = c;
4407                *class_uchardata++ = d;
4408                }
4409    #else
4410              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4411              class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4412    #endif
4413    #else /* SUPPORT_UTF */
4414              *class_uchardata++ = c;
4415              *class_uchardata++ = d;
4416    #endif /* SUPPORT_UTF */
4417    
4418            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4419            caseless matching for UTF-8 characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
4420            for the smaller ones. */            for the smaller ones. As for 16 bit characters without UTF, we
4421              can still use the bit map for the characters up to 255. */
4422    
4423  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4424            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4425  #else            if (utf)
4426            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  #endif
4427                continue;    /* With next character in the class */
4428    #endif  /* SUPPORT_UCP */
4429    
4430    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4431              if (utf)
4432                {
4433                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4434                /* Adjust upper limit and fall through to set up the map */
4435                d = 127;
4436                }
4437              else
4438                {
4439                if (c > 255) continue;
4440                /* Adjust upper limit and fall through to set up the map */
4441                d = 255;
4442                }
4443    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4444              if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4445            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
   
4446            d = 127;            d = 127;
4447    #else
4448  #endif  /* SUPPORT_UCP */            if (c > 255) continue;
4449              /* Adjust upper limit and fall through to set up the map */
4450              d = 255;
4451    #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4452            }            }
4453  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4454    
4455          /* We use the bit map for all cases when not in UTF-8 mode; else          /* We use the bit map for 8 bit mode, or when the characters fall
4456          ranges that lie entirely within 0-127 when there is UCP support; else          partially or entirely to [0-255] ([0-127] for UCP) ranges. */
         for partial ranges without UCP support. */  
4457    
4458          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4459    
4460          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4461    
# Line 4168  for (;; ptr++) Line 4464  for (;; ptr++)
4464            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4465            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4466              {              {
4467              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c]; /* flip case */
4468              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
4469              }              }
4470            }            }
# Line 4182  for (;; ptr++) Line 4478  for (;; ptr++)
4478    
4479        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4480    
4481        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4482          if (class_single_char < 2) class_single_char++;
4483    
4484  #ifdef SUPPORT_UTF8        /* If class_single_char is 1, we saw precisely one character. As long as
4485        if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))        there were no negated characters >= 128 and there was no use of \p or \P,
4486          in other words, no use of any XCLASS features, we can optimize.
4487    
4488          In UTF-8 mode, we can optimize the negative case only if there were no
4489          characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4490          operate on single-bytes characters only. This is an historical hangover.
4491          Maybe one day we can tidy these opcodes to handle multi-byte characters.
4492    
4493          The optimization throws away the bit map. We turn the item into a
4494          1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4495          Note that OP_NOT[I] does not support multibyte characters. In the positive
4496          case, it can cause firstchar to be set. Otherwise, there can be no first
4497          char if this item is first, whatever repeat count may follow. In the case
4498          of reqchar, save the previous value for reinstating. */
4499    
4500    #ifdef SUPPORT_UTF
4501          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
4502            && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4503    #else
4504          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4505    #endif
4506          {          {
4507          class_utf8 = TRUE;          ptr++;
4508          *class_utf8data++ = XCL_SINGLE;          zeroreqchar = reqchar;
4509          class_utf8data += _pcre_ord2utf8(c, class_utf8data);  
4510            /* The OP_NOT[I] opcodes work on single characters only. */
4511    
4512            if (negate_class)
4513              {
4514              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4515              zerofirstchar = firstchar;
4516              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4517              *code++ = c;
4518              goto NOT_CHAR;
4519              }
4520    
4521            /* For a single, positive character, get the value into mcbuffer, and
4522            then we can handle this with the normal one-character code. */
4523    
4524    #ifdef SUPPORT_UTF
4525            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4526              mclength = PRIV(ord2utf)(c, mcbuffer);
4527            else
4528    #endif
4529              {
4530              mcbuffer[0] = c;
4531              mclength = 1;
4532              }
4533            goto ONE_CHAR;
4534            }       /* End of 1-char optimization */
4535    
4536          /* Handle a character that cannot go in the bit map. */
4537    
4538    #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4539          if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4540    #elif defined SUPPORT_UTF
4541          if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4542    #elif !(defined COMPILE_PCRE8)
4543          if (c > 255)
4544    #endif
4545    
4546    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4547            {
4548            xclass = TRUE;
4549            *class_uchardata++ = XCL_SINGLE;
4550    #ifdef SUPPORT_UTF
4551    #ifndef COMPILE_PCRE8
4552            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4553            if (!utf)
4554              *class_uchardata++ = c;
4555            else
4556    #endif
4557              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4558    #else /* SUPPORT_UTF */
4559            *class_uchardata++ = c;
4560    #endif /* SUPPORT_UTF */
4561    
4562  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4563    #ifdef COMPILE_PCRE8
4564          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4565    #else
4566            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4567            if (utf && (options & PCRE_CASELESS) != 0)
4568    #endif
4569            {            {
4570            unsigned int othercase;            unsigned int othercase;
4571            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((othercase = UCD_OTHERCASE(c)) != c)
4572              {              {
4573              *class_utf8data++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4574              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4575              }              }
4576            }            }
4577  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
4578    
4579          }          }
4580        else        else
4581  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4582    
4583        /* Handle a single-byte character */        /* Handle a single-byte character */
4584          {          {
4585            class_has_8bitchar = 1;
4586          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4587          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4588            {            {
4589            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c]; /* flip case */
4590            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4591            }            }
         class_charcount++;  
         class_lastchar = c;  
4592          }          }
4593        }        }
4594    
# Line 4237  for (;; ptr++) Line 4609  for (;; ptr++)
4609        goto FAILED;        goto FAILED;
4610        }        }
4611    
4612      /* If class_charcount is 1, we saw precisely one character whose value is      /* If this is the first thing in the branch, there can be no first char
4613      less than 256. As long as there were no characters >= 128 and there was no      setting, whatever the repeat count. Any reqchar setting must remain
4614      use of \p or \P, in other words, no use of any XCLASS features, we can      unchanged after any kind of repeat. */
4615      optimize.  
4616        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4617      In UTF-8 mode, we can optimize the negative case only if there were no      zerofirstchar = firstchar;
4618      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR      zeroreqchar = reqchar;
     operate on single-bytes characters only. This is an historical hangover.  
     Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
     Note that OP_NOT[I] does not support multibyte characters. In the positive  
     case, it can cause firstbyte to be set. Otherwise, there can be no first  
     char if this item is first, whatever repeat count may follow. In the case  
     of reqbyte, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF8  
     if (class_charcount == 1 && !class_utf8 &&  
       (!utf8 || !negate_class || class_lastchar < 128))  
 #else  
     if (class_charcount == 1)  
 #endif  
       {  
       zeroreqbyte = reqbyte;  
   
       /* The OP_NOT[I] opcodes work on one-byte characters only. */  
   
       if (negate_class)  
         {  
         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
         zerofirstbyte = firstbyte;  
         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF8  
       if (utf8 && class_lastchar > 127)  
         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqbyte setting must remain unchanged after any kind of  
     repeat. */  
   
     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
     zerofirstbyte = firstbyte;  
     zeroreqbyte = reqbyte;  
4619    
4620      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4621      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
# Line 4306  for (;; ptr++) Line 4625  for (;; ptr++)
4625      be listed) there are no characters < 256, we can omit the bitmap in the      be listed) there are no characters < 256, we can omit the bitmap in the
4626      actual compiled code. */      actual compiled code. */
4627    
4628  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4629      if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))      if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4630    #elif !defined COMPILE_PCRE8
4631        if (xclass && !should_flip_negation)
4632    #endif
4633    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4634        {        {
4635        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4636        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
4637        code += LINK_SIZE;        code += LINK_SIZE;
4638        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT:0;
4639    
4640        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4641        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4642    
4643        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4644          {          {
4645          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4646          memmove(code + 32, code, class_utf8data - code);          memmove(code + (32 / sizeof(pcre_uchar)), code,
4647              IN_UCHARS(class_uchardata - code));
4648          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
4649          code = class_utf8data + 32;          code = class_uchardata + (32 / sizeof(pcre_uchar));
4650          }          }
4651        else code = class_utf8data;        else code = class_uchardata;
4652    
4653        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4654    
4655        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4656        break;   /* End of class handling */        break;   /* End of class handling */
4657        }        }
4658  #endif  #endif
# Line 4340  for (;; ptr++) Line 4664  for (;; ptr++)
4664      negating it if necessary. */      negating it if necessary. */
4665    
4666      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4667      if (negate_class)      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
       {  
       if (lengthptr == NULL)    /* Save time in the pre-compile phase */  
         for (c = 0; c < 32; c++) code[c] = ~classbits[c];  
       }  
     else  
4668        {        {
4669          if (negate_class)
4670            for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4671        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4672        }        }
4673      code += 32;      code += 32 / sizeof(pcre_uchar);
4674        NOT_CHAR:
4675      break;      break;
4676    
4677    
# Line 4386  for (;; ptr++) Line 4708  for (;; ptr++)
4708    
4709      if (repeat_min == 0)      if (repeat_min == 0)
4710        {        {
4711        firstbyte = zerofirstbyte;    /* Adjust for zero repeat */        firstchar = zerofirstchar;    /* Adjust for zero repeat */
4712        reqbyte = zeroreqbyte;        /* Ditto */        reqchar = zeroreqchar;        /* Ditto */
4713        }        }
4714    
4715      /* Remember whether this is a variable length repeat */      /* Remember whether this is a variable length repeat */
# Line 4426  for (;; ptr++) Line 4748  for (;; ptr++)
4748      past, but it no longer happens for non-repeated recursions. In fact, the      past, but it no longer happens for non-repeated recursions. In fact, the
4749      repeated ones could be re-implemented independently so as not to need this,      repeated ones could be re-implemented independently so as not to need this,
4750      but for the moment we rely on the code for repeating groups. */      but for the moment we rely on the code for repeating groups. */
4751    
4752      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4753        {        {
4754        memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4755        *previous = OP_ONCE;        *previous = OP_ONCE;
4756        PUT(previous, 1, 2 + 2*LINK_SIZE);        PUT(previous, 1, 2 + 2*LINK_SIZE);
4757        previous[2 + 2*LINK_SIZE] = OP_KET;        previous[2 + 2*LINK_SIZE] = OP_KET;
# Line 4452  for (;; ptr++) Line 4774  for (;; ptr++)
4774    
4775      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4776      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
4777      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqchar - it might not be if a sequence such as x{3} is
4778      the first thing in a branch because the x will have gone into firstbyte      the first thing in a branch because the x will have gone into firstchar
4779      instead.  */      instead.  */
4780    
4781      if (*previous == OP_CHAR || *previous == OP_CHARI)      if (*previous == OP_CHAR || *previous == OP_CHARI)
4782        {        {
4783        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4784    
4785        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF characters that take up more than one character. It's
4786        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4787        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus UTF_LENGTH to flag that
4788        length rather than a small character. */        it's a length rather than a small character. */
4789    
4790  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4791        if (utf8 && (code[-1] & 0x80) != 0)        if (utf && NOT_FIRSTCHAR(code[-1]))
4792          {          {
4793          uschar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4794          while((*lastchar & 0xc0) == 0x80) lastchar--;          BACKCHAR(lastchar);
4795          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4796          memcpy(utf8_char, lastchar, c); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4797          c |= 0x80;                      /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4798          }          }
4799        else        else
4800  #endif  #endif /* SUPPORT_UTF */
   
       /* Handle the case of a single byte - either with no UTF8 support, or  
       with UTF-8 disabled, or for a UTF-8 character < 128. */  
4801    
4802          /* Handle the case of a single character - either with no UTF support, or
4803          with UTF disabled, or for a single character UTF character. */
4804          {          {
4805          c = code[-1];          c = code[-1];
4806          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
4807          }          }
4808    
4809        /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
# Line 4492  for (;; ptr++) Line 4813  for (;; ptr++)
4813    
4814        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4815            repeat_max < 0 &&            repeat_max < 0 &&
4816            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4817          {          {
4818          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4819          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4513  for (;; ptr++) Line 4834  for (;; ptr++)
4834        c = previous[1];        c = previous[1];
4835        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4836            repeat_max < 0 &&            repeat_max < 0 &&
4837            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4838          {          {
4839          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4840          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4530  for (;; ptr++) Line 4851  for (;; ptr++)
4851    
4852      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
4853        {        {
4854        uschar *oldcode;        pcre_uchar *oldcode;
4855        int prop_type, prop_value;        int prop_type, prop_value;
4856        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4857        c = *previous;        c = *previous;
4858    
4859        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4860            repeat_max < 0 &&            repeat_max < 0 &&
4861            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4862          {          {
4863          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4864          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4617  for (;; ptr++) Line 4938  for (;; ptr++)
4938          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
4939          Unicode property match, there are two extra bytes that define the          Unicode property match, there are two extra bytes that define the
4940          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
4941          c, with the 0x80 bit as a flag. */          c, with the UTF_LENGTH bit as a flag. */
4942    
4943          if (repeat_max < 0)          if (repeat_max < 0)
4944            {            {
4945  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4946            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4947              {              {
4948              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4949              code += c & 7;              code += c & 7;
4950              }              }
4951            else            else
# Line 4646  for (;; ptr++) Line 4967  for (;; ptr++)
4967    
4968          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
4969            {            {
4970  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4971            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4972              {              {
4973              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4974              code += c & 7;              code += c & 7;
4975              }              }
4976            else            else
# Line 4676  for (;; ptr++) Line 4997  for (;; ptr++)
4997    
4998        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
4999    
5000  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5001        if (utf8 && c >= 128)        if (utf && (c & UTF_LENGTH) != 0)
5002          {          {
5003          memcpy(code, utf8_char, c & 7);          memcpy(code, utf_chars, IN_UCHARS(c & 7));
5004          code += c & 7;          code += c & 7;
5005          }          }
5006        else        else
# Line 4703  for (;; ptr++) Line 5024  for (;; ptr++)
5024    
5025      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
5026               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
5027  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5028               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
5029  #endif  #endif
5030               *previous == OP_REF ||               *previous == OP_REF ||
# Line 4752  for (;; ptr++) Line 5073  for (;; ptr++)
5073        {        {
5074        register int i;        register int i;
5075        int len = (int)(code - previous);        int len = (int)(code - previous);
5076        uschar *bralink = NULL;        pcre_uchar *bralink = NULL;
5077        uschar *brazeroptr = NULL;        pcre_uchar *brazeroptr = NULL;
5078    
5079        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5080        we just ignore the repeat. */        we just ignore the repeat. */
# Line 4806  for (;; ptr++) Line 5127  for (;; ptr++)
5127          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5128            {            {
5129            *code = OP_END;            *code = OP_END;
5130            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf, cd, save_hwm);
5131            memmove(previous+1, previous, len);            memmove(previous + 1, previous, IN_UCHARS(len));
5132            code++;            code++;
5133            if (repeat_max == 0)            if (repeat_max == 0)
5134              {              {
# Line 4830  for (;; ptr++) Line 5151  for (;; ptr++)
5151            {            {
5152            int offset;            int offset;
5153            *code = OP_END;            *code = OP_END;
5154            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);            adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5155            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5156            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
5157            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
5158            *previous++ = OP_BRA;            *previous++ = OP_BRA;
# Line 4877  for (;; ptr++) Line 5198  for (;; ptr++)
5198              *lengthptr += delta;              *lengthptr += delta;
5199              }              }
5200    
5201            /* This is compiling for real */            /* This is compiling for real. If there is a set first byte for
5202              the group, and we have not yet set a "required byte", set it. Make
5203              sure there is enough workspace for copying forward references before
5204              doing the copy. */
5205    
5206            else            else
5207              {              {
5208              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5209    
5210              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5211                {                {
5212                uschar *hc;                pcre_uchar *hc;
5213                uschar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5214                memcpy(code, previous, len);                memcpy(code, previous, IN_UCHARS(len));
5215    
5216                  while (cd->hwm > cd->start_workspace + cd->workspace_size -
5217                         WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5218                    {
5219                    int save_offset = save_hwm - cd->start_workspace;
5220                    int this_offset = this_hwm - cd->start_workspace;
5221                    *errorcodeptr = expand_workspace(cd);
5222                    if (*errorcodeptr != 0) goto FAILED;
5223                    save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5224                    this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5225                    }
5226    
5227                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5228                  {                  {
5229                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 4936  for (;; ptr++) Line 5273  for (;; ptr++)
5273    
5274          else for (i = repeat_max - 1; i >= 0; i--)          else for (i = repeat_max - 1; i >= 0; i--)
5275            {            {
5276            uschar *hc;            pcre_uchar *hc;
5277            uschar *this_hwm = cd->hwm;            pcre_uchar *this_hwm = cd->hwm;
5278    
5279            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
5280    
# Line 4953  for (;; ptr++) Line 5290  for (;; ptr++)
5290              PUTINC(code, 0, offset);              PUTINC(code, 0, offset);
5291              }              }
5292    
5293            memcpy(code, previous, len);            memcpy(code, previous, IN_UCHARS(len));
5294    
5295              /* Ensure there is enough workspace for forward references before
5296              copying them. */
5297    
5298              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5299                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5300                {
5301                int save_offset = save_hwm - cd->start_workspace;
5302                int this_offset = this_hwm - cd->start_workspace;
5303                *errorcodeptr = expand_workspace(cd);
5304                if (*errorcodeptr != 0) goto FAILED;
5305                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5306                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5307                }
5308    
5309            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5310              {              {
5311              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 4970  for (;; ptr++) Line 5322  for (;; ptr++)
5322            {            {
5323            int oldlinkoffset;            int oldlinkoffset;
5324            int offset = (int)(code - bralink + 1);            int offset = (int)(code - bralink + 1);
5325            uschar *bra = code - offset;            pcre_uchar *bra = code - offset;
5326            oldlinkoffset = GET(bra, 1);            oldlinkoffset = GET(bra, 1);
5327            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5328            *code++ = OP_KET;            *code++ = OP_KET;
# Line 4984  for (;; ptr++) Line 5336  for (;; ptr++)
5336        ONCE brackets can be converted into non-capturing brackets, as the        ONCE brackets can be converted into non-capturing brackets, as the
5337        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5338        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
5339    
5340        Otherwise, when we are doing the actual compile phase, check to see        Otherwise, when we are doing the actual compile phase, check to see
5341        whether this group is one that could match an empty string. If so,        whether this group is one that could match an empty string. If so,
5342        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5343        that runtime checking can be done. [This check is also applied to ONCE        that runtime checking can be done. [This check is also applied to ONCE
5344        groups at runtime, but in a different way.]        groups at runtime, but in a different way.]
5345    
5346        Then, if the quantifier was possessive and the bracket is not a        Then, if the quantifier was possessive and the bracket is not a
5347        conditional, we convert the BRA code to the POS form, and the KET code to        conditional, we convert the BRA code to the POS form, and the KET code to
5348        KETRPOS. (It turns out to be convenient at runtime to detect this kind of        KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5349        subpattern at both the start and at the end.) The use of special opcodes        subpattern at both the start and at the end.) The use of special opcodes
5350        makes it possible to reduce greatly the stack usage in pcre_exec(). If        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5351        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5352    
5353        Then, if the minimum number of matches is 1 or 0, cancel the possessive        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5354        flag so that the default action below, of wrapping everything inside        flag so that the default action below, of wrapping everything inside
5355        atomic brackets, does not happen. When the minimum is greater than 1,        atomic brackets, does not happen. When the minimum is greater than 1,
5356        there will be earlier copies of the group, and so we still have to wrap        there will be earlier copies of the group, and so we still have to wrap
5357        the whole thing. */        the whole thing. */
5358    
5359        else        else
5360          {          {
5361          uschar *ketcode = code - 1 - LINK_SIZE;          pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5362          uschar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5363    
5364          /* Convert possessive ONCE brackets to non-capturing */          /* Convert possessive ONCE brackets to non-capturing */
5365    
5366          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5367              possessive_quantifier) *bracode = OP_BRA;              possessive_quantifier) *bracode = OP_BRA;
5368    
5369          /* For non-possessive ONCE brackets, all we need to do is to          /* For non-possessive ONCE brackets, all we need to do is to
5370          set the KET. */          set the KET. */
5371    
5372          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5373            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5374    
5375          /* Handle non-ONCE brackets and possessive ONCEs (which have been          /* Handle non-ONCE brackets and possessive ONCEs (which have been
5376          converted to non-capturing above). */          converted to non-capturing above). */
5377    
5378          else          else
5379            {            {
5380            /* In the compile phase, check for empty string matching. */            /* In the compile phase, check for empty string matching. */
5381    
5382            if (lengthptr == NULL)            if (lengthptr == NULL)
5383              {              {
5384              uschar *scode = bracode;              pcre_uchar *scode = bracode;
5385              do              do
5386                {                {
5387                if (could_be_empty_branch(scode, ketcode, utf8, cd))                if (could_be_empty_branch(scode, ketcode, utf, cd))
5388                  {                  {
5389                  *bracode += OP_SBRA - OP_BRA;                  *bracode += OP_SBRA - OP_BRA;
5390                  break;                  break;
# Line 5041  for (;; ptr++) Line 5393  for (;; ptr++)
5393                }                }
5394              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5395              }              }
5396    
5397            /* Handle possessive quantifiers. */            /* Handle possessive quantifiers. */
5398    
5399            if (possessive_quantifier)            if (possessive_quantifier)
# Line 5050  for (;; ptr++) Line 5402  for (;; ptr++)
5402              repeated non-capturing bracket, because we have not invented POS              repeated non-capturing bracket, because we have not invented POS
5403              versions of the COND opcodes. Because we are moving code along, we              versions of the COND opcodes. Because we are moving code along, we
5404              must ensure that any pending recursive references are updated. */              must ensure that any pending recursive references are updated. */
5405    
5406              if (*bracode == OP_COND || *bracode == OP_SCOND)              if (*bracode == OP_COND || *bracode == OP_SCOND)
5407                {                {
5408                int nlen = (int)(code - bracode);                int nlen = (int)(code - bracode);
5409                *code = OP_END;                *code = OP_END;
5410                adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm);                adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5411                memmove(bracode + 1+LINK_SIZE, bracode, nlen);                memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5412                code += 1 + LINK_SIZE;                code += 1 + LINK_SIZE;
5413                nlen += 1 + LINK_SIZE;                nlen += 1 + LINK_SIZE;
5414                *bracode = OP_BRAPOS;                *bracode = OP_BRAPOS;
5415                *code++ = OP_KETRPOS;                *code++ = OP_KETRPOS;
5416                PUTINC(code, 0, nlen);                PUTINC(code, 0, nlen);
5417                PUT(bracode, 1, nlen);                PUT(bracode, 1, nlen);
5418                }                }
5419    
5420              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5421    
5422              else              else
5423                {                {
5424                *bracode += 1;              /* Switch to xxxPOS opcodes */                *bracode += 1;              /* Switch to xxxPOS opcodes */
5425                *ketcode = OP_KETRPOS;                *ketcode = OP_KETRPOS;
5426                }                }
5427    
5428              /* If the minimum is zero, mark it as possessive, then unset the              /* If the minimum is zero, mark it as possessive, then unset the
5429              possessive flag when the minimum is 0 or 1. */              possessive flag when the minimum is 0 or 1. */
5430    
5431              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5432              if (repeat_min < 2) possessive_quantifier = FALSE;              if (repeat_min < 2) possessive_quantifier = FALSE;
5433              }              }
5434    
5435            /* Non-possessive quantifier */            /* Non-possessive quantifier */
5436    
5437            else *ketcode = OP_KETRMAX + repeat_type;            else *ketcode = OP_KETRMAX + repeat_type;
5438            }            }
5439          }          }
# Line 5125  for (;; ptr++) Line 5477  for (;; ptr++)
5477        int len;        int len;
5478    
5479        if (*tempcode == OP_TYPEEXACT)        if (*tempcode == OP_TYPEEXACT)
5480          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
5481            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);            ((tempcode[1 + IMM2_SIZE] == OP_PROP
5482              || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5483    
5484        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5485          {          {
5486          tempcode += _pcre_OP_lengths[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
5487  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5488          if (utf8 && tempcode[-1] >= 0xc0)          if (utf && HAS_EXTRALEN(tempcode[-1]))
5489            tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];            tempcode += GET_EXTRALEN(tempcode[-1]);
5490  #endif  #endif
5491          }          }
5492    
# Line 5170  for (;; ptr++) Line 5523  for (;; ptr++)
5523    
5524          default:          default:
5525          *code = OP_END;          *code = OP_END;
5526          adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5527          memmove(tempcode + 1+LINK_SIZE, tempcode, len);          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5528          code += 1 + LINK_SIZE;          code += 1 + LINK_SIZE;
5529          len += 1 + LINK_SIZE;          len += 1 + LINK_SIZE;
5530          tempcode[0] = OP_ONCE;          tempcode[0] = OP_ONCE;
# Line 5183  for (;; ptr++) Line 5536  for (;; ptr++)
5536        }        }
5537    
5538      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
5539      "follows varying string" flag for subsequently encountered reqbytes if      "follows varying string" flag for subsequently encountered reqchars if
5540      it isn't already set and we have just passed a varying length item. */      it isn't already set and we have just passed a varying length item. */
5541    
5542      END_REPEAT:      END_REPEAT:
# Line 5206  for (;; ptr++) Line 5559  for (;; ptr++)
5559    
5560      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
5561    
5562      if (*(++ptr) == CHAR_ASTERISK &&      ptr++;
5563           ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5564             || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5565        {        {
5566        int i, namelen;        int i, namelen;
5567        int arglen = 0;        int arglen = 0;
5568        const char *vn = verbnames;        const char *vn = verbnames;
5569        const uschar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5570        const uschar *arg = NULL;        const pcre_uchar *arg = NULL;
5571        previous = NULL;        previous = NULL;
5572        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        ptr++;
5573          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5574        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5575    
5576        /* It appears that Perl allows any characters whatsoever, other than        /* It appears that Perl allows any characters whatsoever, other than
# Line 5240  for (;; ptr++) Line 5595  for (;; ptr++)
5595        for (i = 0; i < verbcount; i++)        for (i = 0; i < verbcount; i++)
5596          {          {
5597          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
5598              strncmp((char *)name, vn, namelen) == 0)              STRNCMP_UC_C8(name, vn, namelen) == 0)
5599            {            {
5600            /* Check for open captures before ACCEPT and convert it to            /* Check for open captures before ACCEPT and convert it to
5601            ASSERT_ACCEPT if in an assertion. */            ASSERT_ACCEPT if in an assertion. */
# Line 5261  for (;; ptr++) Line 5616  for (;; ptr++)
5616                }                }
5617              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5618    
5619              /* Do not set firstbyte after *ACCEPT */              /* Do not set firstchar after *ACCEPT */
5620              if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5621              }              }
5622    
5623            /* Handle other cases with/without an argument */            /* Handle other cases with/without an argument */
# Line 5288  for (;; ptr++) Line 5643  for (;; ptr++)
5643              *code = verbs[i].op_arg;              *code = verbs[i].op_arg;
5644              if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;              if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
5645              *code++ = arglen;              *code++ = arglen;
5646              memcpy(code, arg, arglen);              memcpy(code, arg, IN_UCHARS(arglen));
5647              code += arglen;              code += arglen;
5648              *code++ = 0;              *code++ = 0;
5649              }              }
# Line 5311  for (;; ptr++) Line 5666  for (;; ptr++)
5666        {        {
5667        int i, set, unset, namelen;        int i, set, unset, namelen;
5668        int *optset;        int *optset;
5669        const uschar *name;        const pcre_uchar *name;
5670        uschar *slot;        pcre_uchar *slot;
5671    
5672        switch (*(++ptr))        switch (*(++ptr))
5673          {          {
# Line 5365  for (;; ptr++) Line 5720  for (;; ptr++)
5720            break;            break;
5721    
5722          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Most other conditions use OP_CREF (a couple change to OP_RREF
5723          below), and all need to skip 3 bytes at the start of the group. */          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5724    
5725          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
5726          skipbytes = 3;          skipbytes = 1+IMM2_SIZE;
5727          refsign = -1;          refsign = -1;
5728    
5729          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
# Line 5401  for (;; ptr++) Line 5756  for (;; ptr++)
5756    
5757          /* We now expect to read a name; anything else is an error */          /* We now expect to read a name; anything else is an error */
5758    
5759          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5760            {            {
5761            ptr += 1;  /* To get the right offset */            ptr += 1;  /* To get the right offset */
5762            *errorcodeptr = ERR28;            *errorcodeptr = ERR28;
# Line 5412  for (;; ptr++) Line 5767  for (;; ptr++)
5767    
5768          recno = 0;          recno = 0;
5769          name = ++ptr;          name = ++ptr;
5770          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5771            {            {
5772            if (recno >= 0)            if (recno >= 0)
5773              recno = ((digitab[*ptr] & ctype_digit) != 0)?              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
               recno * 10 + *ptr - CHAR_0 : -1;  
5774            ptr++;            ptr++;
5775            }            }
5776          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
# Line 5464  for (;; ptr++) Line 5818  for (;; ptr++)
5818          slot = cd->name_table;          slot = cd->name_table;
5819          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
5820            {            {
5821            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5822            slot += cd->name_entry_size;            slot += cd->name_entry_size;
5823            }            }
5824    
# Line 5480  for (;; ptr++) Line 5834  for (;; ptr++)
5834          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
5835    
5836          else if ((i = find_parens(cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
5837                          (options & PCRE_EXTENDED) != 0, utf8)) > 0)                          (options & PCRE_EXTENDED) != 0, utf)) > 0)
5838            {            {
5839            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
5840            code[1+LINK_SIZE]++;            code[1+LINK_SIZE]++;
# Line 5506  for (;; ptr++) Line 5860  for (;; ptr++)
5860            recno = 0;            recno = 0;
5861            for (i = 1; i < namelen; i++)            for (i = 1; i < namelen; i++)
5862              {              {
5863              if ((digitab[name[i]] & ctype_digit) == 0)              if (!IS_DIGIT(name[i]))
5864                {                {
5865                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
5866                goto FAILED;                goto FAILED;
# Line 5521  for (;; ptr++) Line 5875  for (;; ptr++)
5875          /* Similarly, check for the (?(DEFINE) "condition", which is always          /* Similarly, check for the (?(DEFINE) "condition", which is always
5876          false. */          false. */
5877    
5878          else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)          else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5879            {            {
5880            code[1+LINK_SIZE] = OP_DEF;            code[1+LINK_SIZE] = OP_DEF;
5881            skipbytes = 1;            skipbytes = 1;
# Line 5584  for (;; ptr++) Line 5938  for (;; ptr++)
5938            break;            break;
5939    
5940            default:                /* Could be name define, else bad */            default:                /* Could be name define, else bad */
5941            if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;            if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5942                goto DEFINE_NAME;
5943            ptr++;                  /* Correct offset for error */            ptr++;                  /* Correct offset for error */
5944            *errorcodeptr = ERR24;            *errorcodeptr = ERR24;
5945            goto FAILED;            goto FAILED;
# Line 5606  for (;; ptr++) Line 5961  for (;; ptr++)
5961          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
5962            {            {
5963            int n = 0;            int n = 0;
5964            while ((digitab[*(++ptr)] & ctype_digit) != 0)            ptr++;
5965              n = n * 10 + *ptr - CHAR_0;            while(IS_DIGIT(*ptr))
5966                n = n * 10 + *ptr++ - CHAR_0;
5967            if (*ptr != CHAR_RIGHT_PARENTHESIS)            if (*ptr != CHAR_RIGHT_PARENTHESIS)
5968              {              {
5969              *errorcodeptr = ERR39;              *errorcodeptr = ERR39;
# Line 5652  for (;; ptr++) Line 6008  for (;; ptr++)
6008              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6009            name = ++ptr;            name = ++ptr;
6010    
6011            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6012            namelen = (int)(ptr - name);            namelen = (int)(ptr - name);
6013    
6014            /* In the pre-compile phase, just do a syntax check. */            /* In the pre-compile phase, just do a syntax check. */
# Line 5669  for (;; ptr++) Line 6025  for (;; ptr++)
6025                *errorcodeptr = ERR49;                *errorcodeptr = ERR49;
6026                goto FAILED;                goto FAILED;
6027                }                }
6028              if (namelen + 3 > cd->name_entry_size)              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6029                {                {
6030                cd->name_entry_size = namelen + 3;                cd->name_entry_size = namelen + IMM2_SIZE + 1;
6031                if (namelen > MAX_NAME_SIZE)                if (namelen > MAX_NAME_SIZE)
6032                  {                  {
6033                  *errorcodeptr = ERR48;                  *errorcodeptr = ERR48;
# Line 5700  for (;; ptr++) Line 6056  for (;; ptr++)
6056    
6057              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
6058                {                {
6059                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
6060                if (crc == 0)                if (crc == 0)
6061                  {                  {
6062                  if (slot[2+namelen] == 0)                  if (slot[IMM2_SIZE+namelen] == 0)
6063                    {                    {
6064                    if (GET2(slot, 0) != cd->bracount + 1 &&                    if (GET2(slot, 0) != cd->bracount + 1 &&
6065                        (options & PCRE_DUPNAMES) == 0)                        (options & PCRE_DUPNAMES) == 0)
# Line 5724  for (;; ptr++) Line 6080  for (;; ptr++)
6080                if (crc < 0)                if (crc < 0)
6081                  {                  {
6082                  memmove(slot + cd->name_entry_size, slot,                  memmove(slot + cd->name_entry_size, slot,
6083                    (cd->names_found - i) * cd->name_entry_size);                    IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
6084                  break;                  break;
6085                  }                  }
6086    
# Line 5738  for (;; ptr++) Line 6094  for (;; ptr++)
6094    
6095              if (!dupname)              if (!dupname)
6096                {                {
6097                uschar *cslot = cd->name_table;                pcre_uchar *cslot = cd->name_table;
6098                for (i = 0; i < cd->names_found; i++)                for (i = 0; i < cd->names_found; i++)
6099                  {                  {
6100                  if (cslot != slot)                  if (cslot != slot)
# Line 5755  for (;; ptr++) Line 6111  for (;; ptr++)
6111                }                }
6112    
6113              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
6114              memcpy(slot + 2, name, namelen);              memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
6115              slot[2+namelen] = 0;              slot[IMM2_SIZE + namelen] = 0;
6116              }              }
6117            }            }
6118    
# Line 5782  for (;; ptr++) Line 6138  for (;; ptr++)
6138    
6139          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
6140          name = ++ptr;          name = ++ptr;
6141          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6142          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6143    
6144          /* In the pre-compile phase, do a syntax check. We used to just set          /* In the pre-compile phase, do a syntax check. We used to just set
# Line 5794  for (;; ptr++) Line 6150  for (;; ptr++)
6150    
6151          if (lengthptr != NULL)          if (lengthptr != NULL)
6152            {            {
6153            const uschar *temp;            const pcre_uchar *temp;
6154    
6155            if (namelen == 0)            if (namelen == 0)
6156              {              {
# Line 5824  for (;; ptr++) Line 6180  for (;; ptr++)
6180            temp = cd->end_pattern;            temp = cd->end_pattern;
6181            cd->end_pattern = ptr;            cd->end_pattern = ptr;
6182            recno = find_parens(cd, name, namelen,            recno = find_parens(cd, name, namelen,
6183              (options & PCRE_EXTENDED) != 0, utf8);              (options & PCRE_EXTENDED) != 0, utf);
6184            cd->end_pattern = temp;            cd->end_pattern = temp;
6185            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
6186            }            }
# Line 5839  for (;; ptr++) Line 6195  for (;; ptr++)
6195            slot = cd->name_table;            slot = cd->name_table;
6196            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
6197              {              {
6198              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6199                  slot[2+namelen] == 0)                  slot[IMM2_SIZE+namelen] == 0)
6200                break;                break;
6201              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6202              }              }
# Line 5851  for (;; ptr++) Line 6207  for (;; ptr++)
6207              }              }
6208            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
6209                      find_parens(cd, name, namelen,                      find_parens(cd, name, namelen,
6210                        (options & PCRE_EXTENDED) != 0, utf8)) <= 0)                        (options & PCRE_EXTENDED) != 0, utf)) <= 0)
6211              {              {
6212              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
6213              goto FAILED;              goto FAILED;
# Line 5876  for (;; ptr++) Line 6232  for (;; ptr++)
6232          case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:          case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6233          case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:          case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6234            {            {
6235            const uschar *called;            const pcre_uchar *called;
6236            terminator = CHAR_RIGHT_PARENTHESIS;            terminator = CHAR_RIGHT_PARENTHESIS;
6237    
6238            /* Come here from the \g<...> and \g'...' code (Oniguruma            /* Come here from the \g<...> and \g'...' code (Oniguruma
# Line 5890  for (;; ptr++) Line 6246  for (;; ptr++)
6246            if ((refsign = *ptr) == CHAR_PLUS)            if ((refsign = *ptr) == CHAR_PLUS)
6247              {              {
6248              ptr++;              ptr++;
6249              if ((digitab[*ptr] & ctype_digit) == 0)              if (!IS_DIGIT(*ptr))
6250                {                {
6251                *errorcodeptr = ERR63;                *errorcodeptr = ERR63;
6252                goto FAILED;                goto FAILED;
# Line 5898  for (;; ptr++) Line 6254  for (;; ptr++)
6254              }              }
6255            else if (refsign == CHAR_MINUS)            else if (refsign == CHAR_MINUS)
6256              {              {
6257              if ((digitab[ptr[1]] & ctype_digit) == 0)              if (!IS_DIGIT(ptr[1]))
6258                goto OTHER_CHAR_AFTER_QUERY;                goto OTHER_CHAR_AFTER_QUERY;
6259              ptr++;              ptr++;
6260              }              }
6261    
6262            recno = 0;            recno = 0;
6263            while((digitab[*ptr] & ctype_digit) != 0)            while(IS_DIGIT(*ptr))
6264              recno = recno * 10 + *ptr++ - CHAR_0;              recno = recno * 10 + *ptr++ - CHAR_0;
6265    
6266            if (*ptr != terminator)            if (*ptr != terminator)
# Line 5955  for (;; ptr++) Line 6311  for (;; ptr++)
6311              {              {
6312              *code = OP_END;              *code = OP_END;
6313              if (recno != 0)              if (recno != 0)
6314                called = _pcre_find_bracket(cd->start_code, utf8, recno);                called = PRIV(find_bracket)(cd->start_code, utf, recno);
6315    
6316              /* Forward reference */              /* Forward reference */
6317    
6318              if (called == NULL)              if (called == NULL)
6319                {                {
6320                if (find_parens(cd, NULL, recno,                if (find_parens(cd, NULL, recno,
6321                      (options & PCRE_EXTENDED) != 0, utf8) < 0)                      (options & PCRE_EXTENDED) != 0, utf) < 0)
6322                  {                  {
6323                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
6324                  goto FAILED;                  goto FAILED;
# Line 5973  for (;; ptr++) Line 6329  for (;; ptr++)
6329                of the group. Then remember the forward reference. */                of the group. Then remember the forward reference. */
6330    
6331                called = cd->start_code + recno;                called = cd->start_code + recno;
6332                  if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6333                      WORK_SIZE_SAFETY_MARGIN)
6334                    {
6335                    *errorcodeptr = expand_workspace(cd);
6336                    if (*errorcodeptr != 0) goto FAILED;
6337                    }
6338                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6339                }                }
6340    
# Line 5986  for (;; ptr++) Line 6348  for (;; ptr++)
6348              conditional subpatterns will be picked up then. */              conditional subpatterns will be picked up then. */
6349    
6350              else if (GET(called, 1) == 0 && cond_depth <= 0 &&              else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6351                       could_be_empty(called, code, bcptr, utf8, cd))                       could_be_empty(called, code, bcptr, utf, cd))
6352                {                {
6353                *errorcodeptr = ERR40;                *errorcodeptr = ERR40;
6354                goto FAILED;                goto FAILED;
6355                }                }
6356              }              }
6357    
6358            /* Insert the recursion/subroutine item. */            /* Insert the recursion/subroutine item. It does not have a set first
6359              character (relevant if it is repeated, because it will then be
6360              wrapped with ONCE brackets). */
6361    
6362            *code = OP_RECURSE;            *code = OP_RECURSE;
6363            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
6364            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
6365              groupsetfirstchar = FALSE;
6366            }            }
6367    
6368          /* Can't determine a first byte now */          /* Can't determine a first byte now */
6369    
6370          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6371          continue;          continue;
6372    
6373    
# Line 6059  for (;; ptr++) Line 6424  for (;; ptr++)
6424          both phases.          both phases.
6425    
6426          If we are not at the pattern start, reset the greedy defaults and the          If we are not at the pattern start, reset the greedy defaults and the
6427          case value for firstbyte and reqbyte. */          case value for firstchar and reqchar. */
6428    
6429          if (*ptr == CHAR_RIGHT_PARENTHESIS)          if (*ptr == CHAR_RIGHT_PARENTHESIS)
6430            {            {
# Line 6072  for (;; ptr++) Line 6437  for (;; ptr++)
6437              {              {
6438              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6439              greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
6440              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6441              }              }
6442    
6443            /* Change options at this level, and pass them back for use            /* Change options at this level, and pass them back for use
# Line 6109  for (;; ptr++) Line 6474  for (;; ptr++)
6474        NUMBERED_GROUP:        NUMBERED_GROUP:
6475        cd->bracount += 1;        cd->bracount += 1;
6476        PUT2(code, 1+LINK_SIZE, cd->bracount);        PUT2(code, 1+LINK_SIZE, cd->bracount);
6477        skipbytes = 2;        skipbytes = IMM2_SIZE;
6478        }        }
6479    
6480      /* Process nested bracketed regex. Assertions used not to be repeatable,      /* Process nested bracketed regex. Assertions used not to be repeatable,
# Line 6135  for (;; ptr++) Line 6500  for (;; ptr++)
6500           skipbytes,                       /* Skip over bracket number */           skipbytes,                       /* Skip over bracket number */
6501           cond_depth +           cond_depth +
6502             ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */             ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
6503           &subfirstbyte,                   /* For possible first char */           &subfirstchar,                   /* For possible first char */
6504           &subreqbyte,                     /* For possible last char */           &subreqchar,                     /* For possible last char */
6505           bcptr,                           /* Current branch chain */           bcptr,                           /* Current branch chain */
6506           cd,                              /* Tables block */           cd,                              /* Tables block */
6507           (lengthptr == NULL)? NULL :      /* Actual compile phase */           (lengthptr == NULL)? NULL :      /* Actual compile phase */
# Line 6164  for (;; ptr++) Line 6529  for (;; ptr++)
6529    
6530      if (bravalue == OP_COND && lengthptr == NULL)      if (bravalue == OP_COND && lengthptr == NULL)
6531        {        {
6532        uschar *tc = code;        pcre_uchar *tc = code;
6533        int condcount = 0;        int condcount = 0;
6534    
6535        do {        do {
# Line 6187  for (;; ptr++) Line 6552  for (;; ptr++)
6552          }          }
6553    
6554        /* A "normal" conditional group. If there is just one branch, we must not        /* A "normal" conditional group. If there is just one branch, we must not
6555        make use of its firstbyte or reqbyte, because this is equivalent to an        make use of its firstchar or reqchar, because this is equivalent to an
6556        empty second branch. */        empty second branch. */
6557    
6558        else        else
# Line 6197  for (;; ptr++) Line 6562  for (;; ptr++)
6562            *errorcodeptr = ERR27;            *errorcodeptr = ERR27;
6563            goto FAILED;            goto FAILED;
6564            }            }
6565          if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;          if (condcount == 1) subfirstchar = subreqchar = REQ_NONE;
6566          }          }
6567        }        }
6568    
# Line 6241  for (;; ptr++) Line 6606  for (;; ptr++)
6606      /* Handle updating of the required and first characters for other types of      /* Handle updating of the required and first characters for other types of
6607      group. Update for normal brackets of all kinds, and conditions with two      group. Update for normal brackets of all kinds, and conditions with two
6608      branches (see code above). If the bracket is followed by a quantifier with      branches (see code above). If the bracket is followed by a quantifier with
6609      zero repeat, we have to back off. Hence the definition of zeroreqbyte and      zero repeat, we have to back off. Hence the definition of zeroreqchar and
6610      zerofirstbyte outside the main loop so that they can be accessed for the      zerofirstchar outside the main loop so that they can be accessed for the
6611      back off. */      back off. */
6612    
6613      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
6614      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
6615      groupsetfirstbyte = FALSE;      groupsetfirstchar = FALSE;
6616    
6617      if (bravalue >= OP_ONCE)      if (bravalue >= OP_ONCE)
6618        {        {
6619        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstchar in this branch, take it from the
6620        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
6621        than one can replicate it as reqbyte if necessary. If the subpattern has        than one can replicate it as reqchar if necessary. If the subpattern has
6622        no firstbyte, set "none" for the whole branch. In both cases, a zero        no firstchar, set "none" for the whole branch. In both cases, a zero
6623        repeat forces firstbyte to "none". */        repeat forces firstchar to "none". */
6624    
6625        if (firstbyte == REQ_UNSET)        if (firstchar == REQ_UNSET)
6626          {          {
6627          if (subfirstbyte >= 0)          if (subfirstchar >= 0)
6628            {            {
6629            firstbyte = subfirstbyte;            firstchar = subfirstchar;
6630            groupsetfirstbyte = TRUE;            groupsetfirstchar = TRUE;
6631            }            }
6632          else firstbyte = REQ_NONE;          else firstchar = REQ_NONE;
6633          zerofirstbyte = REQ_NONE;          zerofirstchar = REQ_NONE;
6634          }          }
6635    
6636        /* If firstbyte was previously set, convert the subpattern's firstbyte        /* If firstchar was previously set, convert the subpattern's firstchar
6637        into reqbyte if there wasn't one, using the vary flag that was in        into reqchar if there wasn't one, using the vary flag that was in
6638        existence beforehand. */        existence beforehand. */
6639    
6640        else if (subfirstbyte >= 0 && subreqbyte < 0)        else if (subfirstchar >= 0 && subreqchar < 0)
6641          subreqbyte = subfirstbyte | tempreqvary;          subreqchar = subfirstchar | tempreqvary;
6642    
6643        /* If the subpattern set a required byte (or set a first byte that isn't        /* If the subpattern set a required byte (or set a first byte that isn't
6644        really the first byte - see above), set it. */        really the first byte - see above), set it. */
6645    
6646        if (subreqbyte >= 0) reqbyte = subreqbyte;        if (subreqchar >= 0) reqchar = subreqchar;
6647        }        }
6648    
6649      /* For a forward assertion, we take the reqbyte, if set. This can be      /* For a forward assertion, we take the reqchar, if set. This can be
6650      helpful if the pattern that follows the assertion doesn't set a different      helpful if the pattern that follows the assertion doesn't set a different
6651      char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte      char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
6652      for an assertion, however because it leads to incorrect effect for patterns      for an assertion, however because it leads to incorrect effect for patterns
6653      such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead      such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
6654      of a firstbyte. This is overcome by a scan at the end if there's no      of a firstchar. This is overcome by a scan at the end if there's no
6655      firstbyte, looking for an asserted first char. */      firstchar, looking for an asserted first char. */
6656    
6657      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqchar >= 0) reqchar = subreqchar;
6658      break;     /* End of processing '(' */      break;     /* End of processing '(' */
6659    
6660    
# Line 6322  for (;; ptr++) Line 6687  for (;; ptr++)
6687        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
6688        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
6689    
6690        if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)        if (firstchar == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
6691          firstbyte = REQ_NONE;          firstchar = REQ_NONE;
6692    
6693        /* Set values to reset to if this is followed by a zero repeat. */        /* Set values to reset to if this is followed by a zero repeat. */
6694    
6695        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
6696        zeroreqbyte = reqbyte;        zeroreqchar = reqchar;
6697    
6698        /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'        /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
6699        is a subroutine call by number (Oniguruma syntax). In fact, the value        is a subroutine call by number (Oniguruma syntax). In fact, the value
# Line 6339  for (;; ptr++) Line 6704  for (;; ptr++)
6704    
6705        if (-c == ESC_g)        if (-c == ESC_g)
6706          {          {
6707          const uschar *p;          const pcre_uchar *p;
6708          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
6709          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6710            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
# Line 6356  for (;; ptr++) Line 6721  for (;; ptr++)
6721    
6722          if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)          if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
6723            {            {
6724            BOOL isnumber = TRUE;            BOOL is_a_number = TRUE;
6725            for (p = ptr + 1; *p != 0 && *p != terminator; p++)            for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6726              {              {
6727              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;              if (!MAX_255(*p)) { is_a_number = FALSE; break; }
6728                if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;
6729              if ((cd->ctypes[*p] & ctype_word) == 0) break;              if ((cd->ctypes[*p] & ctype_word) == 0) break;
6730              }              }
6731            if (*p != terminator)            if (*p != terminator)
# Line 6367  for (;; ptr++) Line 6733  for (;; ptr++)
6733              *errorcodeptr = ERR57;              *errorcodeptr = ERR57;
6734              break;              break;
6735              }              }
6736            if (isnumber)            if (is_a_number)
6737              {              {
6738              ptr++;              ptr++;
6739              goto HANDLE_NUMERICAL_RECURSION;              goto HANDLE_NUMERICAL_RECURSION;
# Line 6379  for (;; ptr++) Line 6745  for (;; ptr++)
6745          /* Test a signed number in angle brackets or quotes. */          /* Test a signed number in angle brackets or quotes. */
6746    
6747          p = ptr + 2;          p = ptr + 2;
6748          while ((digitab[*p] & ctype_digit) != 0) p++;          while (IS_DIGIT(*p)) p++;
6749          if (*p != terminator)          if (*p != terminator)
6750            {            {
6751            *errorcodeptr = ERR57;            *errorcodeptr = ERR57;
# Line 6407  for (;; ptr++) Line 6773  for (;; ptr++)
6773          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
6774          }          }
6775    
6776        /* Back references are handled specially; must disable firstbyte if        /* Back references are handled specially; must disable firstchar if
6777        not set to cope with cases like (?=(\w+))\1: which would otherwise set        not set to cope with cases like (?=(\w+))\1: which would otherwise set
6778        ':' later. */        ':' later. */
6779    
# Line 6417  for (;; ptr++) Line 6783  for (;; ptr++)
6783          recno = -c - ESC_REF;          recno = -c - ESC_REF;
6784    
6785          HANDLE_REFERENCE:    /* Come here from named backref handling */          HANDLE_REFERENCE:    /* Come here from named backref handling */
6786          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6787          previous = code;          previous = code;
6788          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
6789          PUT2INC(code, 0, recno);          PUT2INC(code, 0, recno);
# Line 6481  for (;; ptr++) Line 6847  for (;; ptr++)
6847  #endif  #endif
6848          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6849          so that it works in DFA mode and in lookbehinds. */          so that it works in DFA mode and in lookbehinds. */
6850    
6851            {            {
6852            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6853            *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;            *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
6854            }            }
6855          }          }
6856        continue;        continue;
# Line 6494  for (;; ptr++) Line 6860  for (;; ptr++)
6860      a value > 127. We set its representation in the length/buffer, and then      a value > 127. We set its representation in the length/buffer, and then
6861      handle it as a data character. */      handle it as a data character. */
6862    
6863  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
6864      if (utf8 && c > 127)      if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
6865        mclength = _pcre_ord2utf8(c, mcbuffer);        mclength = PRIV(ord2utf)(c, mcbuffer);
6866      else      else
6867  #endif  #endif
6868    
# Line 6517  for (;; ptr++) Line 6883  for (;; ptr++)
6883      mclength = 1;      mclength = 1;
6884      mcbuffer[0] = c;      mcbuffer[0] = c;
6885    
6886  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
6887      if (utf8 && c >= 0xc0)      if (utf && HAS_EXTRALEN(c))
6888        {        ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
       while ((ptr[1] & 0xc0) == 0x80)  
         mcbuffer[mclength++] = *(++ptr);  
       }  
6889  #endif  #endif
6890    
6891      /* At this point we have the character's bytes in mcbuffer, and the length      /* At this point we have the character's bytes in mcbuffer, and the length
# Line 6540  for (;; ptr++) Line 6903  for (;; ptr++)
6903    
6904      /* Set the first and required bytes appropriately. If no previous first      /* Set the first and required bytes appropriately. If no previous first
6905      byte, set it from this character, but revert to none on a zero repeat.      byte, set it from this character, but revert to none on a zero repeat.
6906      Otherwise, leave the firstbyte value alone, and don't change it on a zero      Otherwise, leave the firstchar value alone, and don't change it on a zero
6907      repeat. */      repeat. */
6908    
6909      if (firstbyte == REQ_UNSET)      if (firstchar == REQ_UNSET)
6910        {        {
6911        zerofirstbyte = REQ_NONE;        zerofirstchar = REQ_NONE;
6912        zeroreqbyte = reqbyte;        zeroreqchar = reqchar;
6913    
6914        /* If the character is more than one byte long, we can set firstbyte        /* If the character is more than one byte long, we can set firstchar
6915        only if it is not to be matched caselessly. */        only if it is not to be matched caselessly. */
6916    
6917        if (mclength == 1 || req_caseopt == 0)        if (mclength == 1 || req_caseopt == 0)
6918          {          {
6919          firstbyte = mcbuffer[0] | req_caseopt;          firstchar = mcbuffer[0] | req_caseopt;
6920          if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;          if (mclength != 1) reqchar = code[-1] | cd->req_varyopt;
6921          }          }
6922        else firstbyte = reqbyte = REQ_NONE;        else firstchar = reqchar = REQ_NONE;
6923        }        }
6924    
6925      /* firstbyte was previously set; we can set reqbyte only if the length is      /* firstchar was previously set; we can set reqchar only if the length is
6926      1 or the matching is caseful. */      1 or the matching is caseful. */
6927    
6928      else      else
6929        {        {
6930        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
6931        zeroreqbyte = reqbyte;        zeroreqchar = reqchar;
6932        if (mclength == 1 || req_caseopt == 0)        if (mclength == 1 || req_caseopt == 0)
6933          reqbyte = code[-1] | req_caseopt | cd->req_varyopt;          reqchar = code[-1] | req_caseopt | cd->req_varyopt;
6934        }        }
6935    
6936      break;            /* End of literal character handling */      break;            /* End of literal character handling */
# Line 6607  Arguments: Line 6970  Arguments:
6970    reset_bracount TRUE to reset the count for each branch    reset_bracount TRUE to reset the count for each branch
6971    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
6972    cond_depth     depth of nesting for conditional subpatterns    cond_depth     depth of nesting for conditional subpatterns
6973    firstbyteptr   place to put the first required character, or a negative number    firstcharptr   place to put the first required character, or a negative number
6974    reqbyteptr     place to put the last required character, or a negative number    reqcharptr     place to put the last required character, or a negative number
6975    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
6976    cd             points to the data block with tables pointers etc.    cd             points to the data block with tables pointers etc.
6977    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
# Line 6618  Returns:         TRUE on success Line 6981  Returns:         TRUE on success
6981  */  */
6982    
6983  static BOOL  static BOOL
6984  compile_regex(int options, uschar **codeptr, const uschar **ptrptr,  compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
6985    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
6986    int cond_depth, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    int cond_depth, pcre_int32 *firstcharptr, pcre_int32 *reqcharptr,
6987    compile_data *cd, int *lengthptr)    branch_chain *bcptr, compile_data *cd, int *lengthptr)
6988  {  {
6989  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
6990  uschar *code = *codeptr;  pcre_uchar *code = *codeptr;
6991  uschar *last_branch = code;  pcre_uchar *last_branch = code;
6992  uschar *start_bracket = code;  pcre_uchar *start_bracket = code;
6993  uschar *reverse_count = NULL;  pcre_uchar *reverse_count = NULL;
6994  open_capitem capitem;  open_capitem capitem;
6995  int capnumber = 0;  int capnumber = 0;
6996  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
6997  int branchfirstbyte, branchreqbyte;  pcre_int32 branchfirstchar, branchreqchar;
6998  int length;  int length;
6999  int orig_bracount;  int orig_bracount;
7000  int max_bracount;  int max_bracount;
# Line 6640  branch_chain bc; Line 7003  branch_chain bc;
7003  bc.outer = bcptr;  bc.outer = bcptr;
7004  bc.current_branch = code;  bc.current_branch = code;
7005    
7006  firstbyte = reqbyte = REQ_UNSET;  firstchar = reqchar = REQ_UNSET;
7007    
7008  /* Accumulate the length for use in the pre-compile phase. Start with the  /* Accumulate the length for use in the pre-compile phase. Start with the
7009  length of the BRA and KET and any extra bytes that are required at the  length of the BRA and KET and any extra bytes that are required at the
# Line 6699  for (;;) Line 7062  for (;;)
7062    /* Now compile the branch; in the pre-compile phase its length gets added    /* Now compile the branch; in the pre-compile phase its length gets added
7063    into the length. */    into the length. */
7064    
7065    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
7066          &branchreqbyte, &bc, cond_depth, cd,          &branchreqchar, &bc, cond_depth, cd,
7067          (lengthptr == NULL)? NULL : &length))          (lengthptr == NULL)? NULL : &length))
7068      {      {
7069      *ptrptr = ptr;      *ptrptr = ptr;
# Line 6716  for (;;) Line 7079  for (;;)
7079    
7080    if (lengthptr == NULL)    if (lengthptr == NULL)
7081      {      {
7082      /* If this is the first branch, the firstbyte and reqbyte values for the      /* If this is the first branch, the firstchar and reqchar values for the
7083      branch become the values for the regex. */      branch become the values for the regex. */
7084    
7085      if (*last_branch != OP_ALT)      if (*last_branch != OP_ALT)
7086        {        {
7087        firstbyte = branchfirstbyte;        firstchar = branchfirstchar;
7088        reqbyte = branchreqbyte;        reqchar = branchreqchar;
7089        }        }
7090    
7091      /* If this is not the first branch, the first char and reqbyte have to      /* If this is not the first branch, the first char and reqchar have to
7092      match the values from all the previous branches, except that if the      match the values from all the previous branches, except that if the
7093      previous value for reqbyte didn't have REQ_VARY set, it can still match,      previous value for reqchar didn't have REQ_VARY set, it can still match,
7094      and we set REQ_VARY for the regex. */      and we set REQ_VARY for the regex. */
7095    
7096      else      else
7097        {        {
7098        /* If we previously had a firstbyte, but it doesn't match the new branch,        /* If we previously had a firstchar, but it doesn't match the new branch,
7099        we have to abandon the firstbyte for the regex, but if there was        we have to abandon the firstchar for the regex, but if there was
7100        previously no reqbyte, it takes on the value of the old firstbyte. */        previously no reqchar, it takes on the value of the old firstchar. */
7101    
7102        if (firstbyte >= 0 && firstbyte != branchfirstbyte)        if (firstchar >= 0 && firstchar != branchfirstchar)
7103          {          {
7104          if (reqbyte < 0) reqbyte = firstbyte;          if (reqchar < 0) reqchar = firstchar;
7105          firstbyte = REQ_NONE;          firstchar = REQ_NONE;
7106          }          }
7107    
7108        /* If we (now or from before) have no firstbyte, a firstbyte from the        /* If we (now or from before) have no firstchar, a firstchar from the
7109        branch becomes a reqbyte if there isn't a branch reqbyte. */        branch becomes a reqchar if there isn't a branch reqchar. */
7110    
7111        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)        if (firstchar < 0 && branchfirstchar >= 0 && branchreqchar < 0)
7112            branchreqbyte = branchfirstbyte;            branchreqchar = branchfirstchar;
7113    
7114        /* Now ensure that the reqbytes match */        /* Now ensure that the reqchars match */
7115    
7116        if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))        if ((reqchar & ~REQ_VARY) != (branchreqchar & ~REQ_VARY))
7117          reqbyte = REQ_NONE;          reqchar = REQ_NONE;
7118        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */        else reqchar |= branchreqchar;   /* To "or" REQ_VARY */
7119        }        }
7120    
7121      /* If lookbehind, check that this branch matches a fixed-length string, and      /* If lookbehind, check that this branch matches a fixed-length string, and
# Line 6822  for (;;) Line 7185  for (;;)
7185        if (cd->open_caps->flag)        if (cd->open_caps->flag)
7186          {          {
7187          memmove(start_bracket + 1 + LINK_SIZE, start_bracket,          memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
7188            code - start_bracket);            IN_UCHARS(code - start_bracket));
7189          *start_bracket = OP_ONCE;          *start_bracket = OP_ONCE;
7190          code += 1 + LINK_SIZE;          code += 1 + LINK_SIZE;
7191          PUT(start_bracket, 1, (int)(code - start_bracket));          PUT(start_bracket, 1, (int)(code - start_bracket));
# Line 6842  for (;;) Line 7205  for (;;)
7205    
7206      *codeptr = code;      *codeptr = code;
7207      *ptrptr = ptr;      *ptrptr = ptr;
7208      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
7209      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
7210      if (lengthptr != NULL)      if (lengthptr != NULL)
7211        {        {
7212        if (OFLOW_MAX - *lengthptr < length)        if (OFLOW_MAX - *lengthptr < length)
# Line 6924  Returns:     TRUE or FALSE Line 7287  Returns:     TRUE or FALSE
7287  */  */
7288    
7289  static BOOL  static BOOL
7290  is_anchored(register const uschar *code, unsigned int bracket_map,  is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
7291    unsigned int backref_map)    unsigned int backref_map)
7292  {  {
7293  do {  do {
7294     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const pcre_uchar *scode = first_significant_code(
7295       FALSE);       code + PRIV(OP_lengths)[*code], FALSE);
7296     register int op = *scode;     register int op = *scode;
7297    
7298     /* Non-capturing brackets */     /* Non-capturing brackets */
# Line 7001  Returns:         TRUE or FALSE Line 7364  Returns:         TRUE or FALSE
7364  */  */
7365    
7366  static BOOL  static BOOL
7367  is_startline(const uschar *code, unsigned int bracket_map,  is_startline(const pcre_uchar *code, unsigned int bracket_map,
7368    unsigned int backref_map)    unsigned int backref_map)
7369  {  {
7370  do {  do {
7371     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const pcre_uchar *scode = first_significant_code(
7372       FALSE);       code + PRIV(OP_lengths)[*code], FALSE);
7373     register int op = *scode;     register int op = *scode;
7374    
7375     /* If we are at the start of a conditional assertion group, *both* the     /* If we are at the start of a conditional assertion group, *both* the
# Line 7017  do { Line 7380  do {
7380     if (op == OP_COND)     if (op == OP_COND)
7381       {       {
7382       scode += 1 + LINK_SIZE;       scode += 1 + LINK_SIZE;
7383       if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];       if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
7384       switch (*scode)       switch (*scode)
7385         {         {
7386         case OP_CREF:         case OP_CREF:
# Line 7104  Returns:     -1 or the fixed first char Line 7467  Returns:     -1 or the fixed first char
7467  */  */
7468    
7469  static int  static int
7470  find_firstassertedchar(const uschar *code, BOOL inassert)  find_firstassertedchar(const pcre_uchar *code, BOOL inassert)
7471  {  {
7472  register int c = -1;  register int c = -1;
7473  do {  do {
7474     int d;     int d;
7475     int xl = (*code == OP_CBRA || *code == OP_SCBRA ||     int xl = (*code == OP_CBRA || *code == OP_SCBRA ||