/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 723 by ph10, Sat Oct 8 15:55:23 2011 UTC revision 964 by ph10, Fri May 4 13:03:39 2012 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2011 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
# Line 88  so this number is very generous. Line 92  so this number is very generous.
92  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
93  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
94  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
96    filled up by repetitions of forward references, for example patterns like
97    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98    that the workspace is expanded using malloc() in this situation. The value
99    below is therefore a minimum, and we put a maximum on it for safety. The
100    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101    kicks in at the same number of forward references in all cases. */
102    
103  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
104    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
105    
106  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
107  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
108    
109  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
110    
111    /* Private flags added to firstchar and reqchar. */
112    
113    #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
114    #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
115    
116    /* Repeated character flags. */
117    
118    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
119    
120  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
121  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 231  static const char posix_names[] = Line 250  static const char posix_names[] =
250    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
251    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
252    
253  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
254    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
255    
256  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 266  substitutes must be in the order of the Line 285  substitutes must be in the order of the
285  both positive and negative cases. NULL means no substitute. */  both positive and negative cases. NULL means no substitute. */
286    
287  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
288  static const uschar *substitutes[] = {  static const pcre_uchar string_PNd[]  = {
289    (uschar *)"\\P{Nd}",    /* \D */    CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
290    (uschar *)"\\p{Nd}",    /* \d */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
291    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */  static const pcre_uchar string_pNd[]  = {
292    (uschar *)"\\p{Xsp}",   /* \s */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
293    (uschar *)"\\P{Xwd}",   /* \W */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294    (uschar *)"\\p{Xwd}"    /* \w */  static const pcre_uchar string_PXsp[] = {
295      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
296      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297    static const pcre_uchar string_pXsp[] = {
298      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300    static const pcre_uchar string_PXwd[] = {
301      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
302      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303    static const pcre_uchar string_pXwd[] = {
304      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    
307    static const pcre_uchar *substitutes[] = {
308      string_PNd,           /* \D */
309      string_pNd,           /* \d */
310      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
311      string_pXsp,          /* \s */
312      string_PXwd,          /* \W */
313      string_pXwd           /* \w */
314  };  };
315    
316  static const uschar *posix_substitutes[] = {  static const pcre_uchar string_pL[] =   {
317    (uschar *)"\\p{L}",     /* alpha */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
318    (uschar *)"\\p{Ll}",    /* lower */    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319    (uschar *)"\\p{Lu}",    /* upper */  static const pcre_uchar string_pLl[] =  {
320    (uschar *)"\\p{Xan}",   /* alnum */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321    NULL,                   /* ascii */    CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322    (uschar *)"\\h",        /* blank */  static const pcre_uchar string_pLu[] =  {
323    NULL,                   /* cntrl */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324    (uschar *)"\\p{Nd}",    /* digit */    CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325    NULL,                   /* graph */  static const pcre_uchar string_pXan[] = {
326    NULL,                   /* print */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327    NULL,                   /* punct */    CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */  static const pcre_uchar string_h[] =    {
329    (uschar *)"\\p{Xwd}",   /* word */    CHAR_BACKSLASH, CHAR_h, '\0' };
330    NULL,                   /* xdigit */  static const pcre_uchar string_pXps[] = {
331      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333    static const pcre_uchar string_PL[] =   {
334      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
335      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336    static const pcre_uchar string_PLl[] =  {
337      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339    static const pcre_uchar string_PLu[] =  {
340      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342    static const pcre_uchar string_PXan[] = {
343      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345    static const pcre_uchar string_H[] =    {
346      CHAR_BACKSLASH, CHAR_H, '\0' };
347    static const pcre_uchar string_PXps[] = {
348      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
349      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350    
351    static const pcre_uchar *posix_substitutes[] = {
352      string_pL,            /* alpha */
353      string_pLl,           /* lower */
354      string_pLu,           /* upper */
355      string_pXan,          /* alnum */
356      NULL,                 /* ascii */
357      string_h,             /* blank */
358      NULL,                 /* cntrl */
359      string_pNd,           /* digit */
360      NULL,                 /* graph */
361      NULL,                 /* print */
362      NULL,                 /* punct */
363      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
364      string_pXwd,          /* word */
365      NULL,                 /* xdigit */
366    /* Negated cases */    /* Negated cases */
367    (uschar *)"\\P{L}",     /* ^alpha */    string_PL,            /* ^alpha */
368    (uschar *)"\\P{Ll}",    /* ^lower */    string_PLl,           /* ^lower */
369    (uschar *)"\\P{Lu}",    /* ^upper */    string_PLu,           /* ^upper */
370    (uschar *)"\\P{Xan}",   /* ^alnum */    string_PXan,          /* ^alnum */
371    NULL,                   /* ^ascii */    NULL,                 /* ^ascii */
372    (uschar *)"\\H",        /* ^blank */    string_H,             /* ^blank */
373    NULL,                   /* ^cntrl */    NULL,                 /* ^cntrl */
374    (uschar *)"\\P{Nd}",    /* ^digit */    string_PNd,           /* ^digit */
375    NULL,                   /* ^graph */    NULL,                 /* ^graph */
376    NULL,                   /* ^print */    NULL,                 /* ^print */
377    NULL,                   /* ^punct */    NULL,                 /* ^punct */
378    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
379    (uschar *)"\\P{Xwd}",   /* ^word */    string_PXwd,          /* ^word */
380    NULL                    /* ^xdigit */    NULL                  /* ^xdigit */
381  };  };
382  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
383  #endif  #endif
384    
385  #define STRING(a)  # a  #define STRING(a)  # a
# Line 365  static const char error_texts[] = Line 438  static const char error_texts[] =
438    /* 30 */    /* 30 */
439    "unknown POSIX class name\0"    "unknown POSIX class name\0"
440    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
441    "this version of PCRE is not compiled with PCRE_UTF8 support\0"    "this version of PCRE is compiled without UTF support\0"
442    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
443    "character value in \\x{...} sequence is too large\0"    "character value in \\x{...} sequence is too large\0"
444    /* 35 */    /* 35 */
# Line 388  static const char error_texts[] = Line 461  static const char error_texts[] =
461    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
462    /* 50 */    /* 50 */
463    "repeated subpattern is too long\0"    /** DEAD **/    "repeated subpattern is too long\0"    /** DEAD **/
464    "octal value is greater than \\377 (not in UTF-8 mode)\0"    "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
465    "internal error: overran compiling workspace\0"    "internal error: overran compiling workspace\0"
466    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
467    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
# Line 407  static const char error_texts[] = Line 480  static const char error_texts[] =
480    /* 65 */    /* 65 */
481    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
482    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
483    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with Unicode property support\0"
484    "\\c must be followed by an ASCII character\0"    "\\c must be followed by an ASCII character\0"
485    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486      /* 70 */
487      "internal error: unknown opcode in find_fixedlength()\0"
488      "\\N is not supported in a class\0"
489      "too many forward references\0"
490      "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
491      "invalid UTF-16 string\0"
492      /* 75 */
493      "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
494    ;    ;
495    
496  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 428  For convenience, we use the same bit def Line 509  For convenience, we use the same bit def
509    
510  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
511    
512    /* Using a simple comparison for decimal numbers rather than a memory read
513    is much faster, and the resulting code is simpler (the compiler turns it
514    into a subtraction and unsigned comparison). */
515    
516    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
517    
518  #ifndef EBCDIC  #ifndef EBCDIC
519    
520  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
521  UTF-8 mode. */  UTF-8 mode. */
522    
523  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
524    {    {
525    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
526    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 472  static const unsigned char digitab[] = Line 559  static const unsigned char digitab[] =
559    
560  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
561    
562  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
563    {    {
564    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
565    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 507  static const unsigned char digitab[] = Line 594  static const unsigned char digitab[] =
594    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
595    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
596    
597  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
598    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
599    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
600    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 546  static const unsigned char ebcdic_charta Line 633  static const unsigned char ebcdic_charta
633  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
634    
635  static BOOL  static BOOL
636    compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
637      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
638    
639    
# Line 578  return s; Line 665  return s;
665    
666    
667  /*************************************************  /*************************************************
668    *           Expand the workspace                 *
669    *************************************************/
670    
671    /* This function is called during the second compiling phase, if the number of
672    forward references fills the existing workspace, which is originally a block on
673    the stack. A larger block is obtained from malloc() unless the ultimate limit
674    has been reached or the increase will be rather small.
675    
676    Argument: pointer to the compile data block
677    Returns:  0 if all went well, else an error number
678    */
679    
680    static int
681    expand_workspace(compile_data *cd)
682    {
683    pcre_uchar *newspace;
684    int newsize = cd->workspace_size * 2;
685    
686    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
687    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
688        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
689     return ERR72;
690    
691    newspace = (PUBL(malloc))(IN_UCHARS(newsize));
692    if (newspace == NULL) return ERR21;
693    memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
694    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
695    if (cd->workspace_size > COMPILE_WORK_SIZE)
696      (PUBL(free))((void *)cd->start_workspace);
697    cd->start_workspace = newspace;
698    cd->workspace_size = newsize;
699    return 0;
700    }
701    
702    
703    
704    /*************************************************
705  *            Check for counted repeat            *  *            Check for counted repeat            *
706  *************************************************/  *************************************************/
707    
# Line 593  Returns:    TRUE or FALSE Line 717  Returns:    TRUE or FALSE
717  */  */
718    
719  static BOOL  static BOOL
720  is_counted_repeat(const uschar *p)  is_counted_repeat(const pcre_uchar *p)
721  {  {
722  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
723  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
724    while (IS_DIGIT(*p)) p++;
725  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
726    
727  if (*p++ != CHAR_COMMA) return FALSE;  if (*p++ != CHAR_COMMA) return FALSE;
728  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
729    
730  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
731  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
732    while (IS_DIGIT(*p)) p++;
733    
734  return (*p == CHAR_RIGHT_CURLY_BRACKET);  return (*p == CHAR_RIGHT_CURLY_BRACKET);
735  }  }
# Line 635  Returns:         zero or positive => a d Line 761  Returns:         zero or positive => a d
761  */  */
762    
763  static int  static int
764  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
765    int options, BOOL isclass)    int options, BOOL isclass)
766  {  {
767  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
768  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
769  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
770    pcre_int32 c;
771    int i;
772    
773  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
774  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
# Line 654  in a table. A non-zero result is somethi Line 782  in a table. A non-zero result is somethi
782  Otherwise further processing may be required. */  Otherwise further processing may be required. */
783    
784  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
785  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
786    else if (c < CHAR_0 || c > CHAR_z) {}
787  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
788    
789  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
790  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
791    else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
792  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
793  #endif  #endif
794    
# Line 666  else if ((i = escapes[c - 0x48]) != 0) Line 796  else if ((i = escapes[c - 0x48]) != 0)
796    
797  else  else
798    {    {
799    const uschar *oldptr;    const pcre_uchar *oldptr;
800    BOOL braced, negated;    BOOL braced, negated;
801    
802    switch (c)    switch (c)
# Line 676  else Line 806  else
806    
807      case CHAR_l:      case CHAR_l:
808      case CHAR_L:      case CHAR_L:
809        *errorcodeptr = ERR37;
810        break;
811    
812      case CHAR_u:      case CHAR_u:
813        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
814          {
815          /* In JavaScript, \u must be followed by four hexadecimal digits.
816          Otherwise it is a lowercase u letter. */
817          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
818            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
819            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
820            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
821            {
822            c = 0;
823            for (i = 0; i < 4; ++i)
824              {
825              register int cc = *(++ptr);
826    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
827              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
828              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
829    #else           /* EBCDIC coding */
830              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
831              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
832    #endif
833              }
834            }
835          }
836        else
837          *errorcodeptr = ERR37;
838        break;
839    
840      case CHAR_U:      case CHAR_U:
841      *errorcodeptr = ERR37;      /* In JavaScript, \U is an uppercase U letter. */
842        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
843      break;      break;
844    
845      /* In a character class, \g is just a literal "g". Outside a character      /* In a character class, \g is just a literal "g". Outside a character
# Line 710  else Line 871  else
871    
872      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
873        {        {
874        const uschar *p;        const pcre_uchar *p;
875        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
876          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
877        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
878          {          {
879          c = -ESC_k;          c = -ESC_k;
# Line 730  else Line 891  else
891        }        }
892      else negated = FALSE;      else negated = FALSE;
893    
894        /* The integer range is limited by the machine's int representation. */
895      c = 0;      c = 0;
896      while ((digitab[ptr[1]] & ctype_digit) != 0)      while (IS_DIGIT(ptr[1]))
897          {
898          if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
899            {
900            c = -1;
901            break;
902            }
903        c = c * 10 + *(++ptr) - CHAR_0;        c = c * 10 + *(++ptr) - CHAR_0;
904          }
905      if (c < 0)   /* Integer overflow */      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
906        {        {
907          while (IS_DIGIT(ptr[1]))
908            ptr++;
909        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
910        break;        break;
911        }        }
# Line 783  else Line 953  else
953      if (!isclass)      if (!isclass)
954        {        {
955        oldptr = ptr;        oldptr = ptr;
956          /* The integer range is limited by the machine's int representation. */
957        c -= CHAR_0;        c -= CHAR_0;
958        while ((digitab[ptr[1]] & ctype_digit) != 0)        while (IS_DIGIT(ptr[1]))
959            {
960            if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
961              {
962              c = -1;
963              break;
964              }
965          c = c * 10 + *(++ptr) - CHAR_0;          c = c * 10 + *(++ptr) - CHAR_0;
966        if (c < 0)    /* Integer overflow */          }
967          if (((unsigned int)c) > INT_MAX) /* Integer overflow */
968          {          {
969            while (IS_DIGIT(ptr[1]))
970              ptr++;
971          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
972          break;          break;
973          }          }
# Line 813  else Line 993  else
993      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
994      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
995      significant 8 bits of octal numbers (I think this is what early Perls used      significant 8 bits of octal numbers (I think this is what early Perls used
996      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
997      than 3 octal digits. */      but no more than 3 octal digits. */
998    
999      case CHAR_0:      case CHAR_0:
1000      c -= CHAR_0;      c -= CHAR_0;
1001      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1002          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
1003      if (!utf8 && c > 255) *errorcodeptr = ERR51;  #ifdef COMPILE_PCRE8
1004        if (!utf && c > 0xff) *errorcodeptr = ERR51;
1005    #endif
1006      break;      break;
1007    
1008      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
1009      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1010      treated as a data character. */      If not, { is treated as a data character. */
1011    
1012      case CHAR_x:      case CHAR_x:
1013        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1014          {
1015          /* In JavaScript, \x must be followed by two hexadecimal digits.
1016          Otherwise it is a lowercase x letter. */
1017          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1018            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1019            {
1020            c = 0;
1021            for (i = 0; i < 2; ++i)
1022              {
1023              register int cc = *(++ptr);
1024    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1025              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1026              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1027    #else           /* EBCDIC coding */
1028              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1029              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1030    #endif
1031              }
1032            }
1033          break;
1034          }
1035    
1036      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1037        {        {
1038        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
       int count = 0;  
1039    
1040        c = 0;        c = 0;
1041        while ((digitab[*pt] & ctype_xdigit) != 0)        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1042          {          {
1043          register int cc = *pt++;          register int cc = *pt++;
1044          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
         count++;  
1045    
1046  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1047          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
# Line 847  else Line 1050  else
1050          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1051          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1052  #endif  #endif
1053    
1054    #ifdef COMPILE_PCRE8
1055            if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1056    #else
1057    #ifdef COMPILE_PCRE16
1058            if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1059    #endif
1060    #endif
1061            }
1062    
1063          if (c < 0)
1064            {
1065            while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1066            *errorcodeptr = ERR34;
1067          }          }
1068    
1069        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1070          {          {
1071          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1072          ptr = pt;          ptr = pt;
1073          break;          break;
1074          }          }
# Line 863  else Line 1080  else
1080      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1081    
1082      c = 0;      c = 0;
1083      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1084        {        {
1085        int cc;                                  /* Some compilers don't like */        int cc;                                  /* Some compilers don't like */
1086        cc = *(++ptr);                           /* ++ in initializers */        cc = *(++ptr);                           /* ++ in initializers */
# Line 961  Returns:         type value from ucp_typ Line 1178  Returns:         type value from ucp_typ
1178  */  */
1179    
1180  static int  static int
1181  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1182  {  {
1183  int c, i, bot, top;  int c, i, bot, top;
1184  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1185  char name[32];  pcre_uchar name[32];
1186    
1187  c = *(++ptr);  c = *(++ptr);
1188  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 982  if (c == CHAR_LEFT_CURLY_BRACKET) Line 1199  if (c == CHAR_LEFT_CURLY_BRACKET)
1199      *negptr = TRUE;      *negptr = TRUE;
1200      ptr++;      ptr++;
1201      }      }
1202    for (i = 0; i < (int)sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1203      {      {
1204      c = *(++ptr);      c = *(++ptr);
1205      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 1006  else Line 1223  else
1223  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1224    
1225  bot = 0;  bot = 0;
1226  top = _pcre_utt_size;  top = PRIV(utt_size);
1227    
1228  while (bot < top)  while (bot < top)
1229    {    {
1230    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1231    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1232    if (c == 0)    if (c == 0)
1233      {      {
1234      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1235      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1236      }      }
1237    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
1238    }    }
# Line 1053  Returns:         pointer to '}' on succe Line 1270  Returns:         pointer to '}' on succe
1270                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1271  */  */
1272    
1273  static const uschar *  static const pcre_uchar *
1274  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1275  {  {
1276  int min = 0;  int min = 0;
1277  int max = -1;  int max = -1;
# Line 1062  int max = -1; Line 1279  int max = -1;
1279  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1280  an integer overflow. */  an integer overflow. */
1281    
1282  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1283  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1284    {    {
1285    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 1077  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1294  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1294    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1295      {      {
1296      max = 0;      max = 0;
1297      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1298      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1299        {        {
1300        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 1132  Arguments: Line 1349  Arguments:
1349    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1350    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1351    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1352    utf8         TRUE if we are in UTF-8 mode    utf          TRUE if we are in UTF-8 / UTF-16 mode
1353    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1354    
1355  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1356  */  */
1357    
1358  static int  static int
1359  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1360    BOOL xmode, BOOL utf8, int *count)    BOOL xmode, BOOL utf, int *count)
1361  {  {
1362  uschar *ptr = *ptrptr;  pcre_uchar *ptr = *ptrptr;
1363  int start_count = *count;  int start_count = *count;
1364  int hwm_count = start_count;  int hwm_count = start_count;
1365  BOOL dup_parens = FALSE;  BOOL dup_parens = FALSE;
# Line 1209  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1426  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1426          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1427        {        {
1428        int term;        int term;
1429        const uschar *thisname;        const pcre_uchar *thisname;
1430        *count += 1;        *count += 1;
1431        if (name == NULL && *count == lorn) return *count;        if (name == NULL && *count == lorn) return *count;
1432        term = *ptr++;        term = *ptr++;
# Line 1217  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1434  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1434        thisname = ptr;        thisname = ptr;
1435        while (*ptr != term) ptr++;        while (*ptr != term) ptr++;
1436        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == ptr - thisname &&
1437            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            STRNCMP_UC_UC(name, thisname, lorn) == 0)
1438          return *count;          return *count;
1439        term++;        term++;
1440        }        }
# Line 1260  for (; ptr < cd->end_pattern; ptr++) Line 1477  for (; ptr < cd->end_pattern; ptr++)
1477          {          {
1478          if (ptr[2] == CHAR_E)          if (ptr[2] == CHAR_E)
1479            ptr+= 2;            ptr+= 2;
1480          else if (strncmp((const char *)ptr+2,          else if (STRNCMP_UC_C8(ptr + 2,
1481                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1482            ptr += 4;            ptr += 4;
1483          else          else
# Line 1308  for (; ptr < cd->end_pattern; ptr++) Line 1525  for (; ptr < cd->end_pattern; ptr++)
1525        {        {
1526        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1527        ptr++;        ptr++;
1528  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1529        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;        if (utf) FORWARDCHAR(ptr);
1530  #endif  #endif
1531        }        }
1532      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
# Line 1320  for (; ptr < cd->end_pattern; ptr++) Line 1537  for (; ptr < cd->end_pattern; ptr++)
1537    
1538    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1539      {      {
1540      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1541      if (rc > 0) return rc;      if (rc > 0) return rc;
1542      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1543      }      }
# Line 1366  Arguments: Line 1583  Arguments:
1583    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1584    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1585    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1586    utf8         TRUE if we are in UTF-8 mode    utf          TRUE if we are in UTF-8 / UTF-16 mode
1587    
1588  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1589  */  */
1590    
1591  static int  static int
1592  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,  find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1593    BOOL utf8)    BOOL utf)
1594  {  {
1595  uschar *ptr = (uschar *)cd->start_pattern;  pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1596  int count = 0;  int count = 0;
1597  int rc;  int rc;
1598    
# Line 1386  matching closing parens. That is why we Line 1603  matching closing parens. That is why we
1603    
1604  for (;;)  for (;;)
1605    {    {
1606    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1607    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1608    }    }
1609    
# Line 1413  Arguments: Line 1630  Arguments:
1630  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1631  */  */
1632    
1633  static const uschar*  static const pcre_uchar*
1634  first_significant_code(const uschar *code, BOOL skipassert)  first_significant_code(const pcre_uchar *code, BOOL skipassert)
1635  {  {
1636  for (;;)  for (;;)
1637    {    {
# Line 1425  for (;;) Line 1642  for (;;)
1642      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1643      if (!skipassert) return code;      if (!skipassert) return code;
1644      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1645      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1646      break;      break;
1647    
1648      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1439  for (;;) Line 1656  for (;;)
1656      case OP_RREF:      case OP_RREF:
1657      case OP_NRREF:      case OP_NRREF:
1658      case OP_DEF:      case OP_DEF:
1659      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1660      break;      break;
1661    
1662      default:      default:
# Line 1469  and doing the check at the end; a flag s Line 1686  and doing the check at the end; a flag s
1686    
1687  Arguments:  Arguments:
1688    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1689    utf8     TRUE in UTF-8 mode    utf      TRUE in UTF-8 / UTF-16 mode
1690    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1691    cd       the "compile data" structure    cd       the "compile data" structure
1692    
1693  Returns:   the fixed length,  Returns:   the fixed length,
1694               or -1 if there is no fixed length,               or -1 if there is no fixed length,
1695               or -2 if \C was encountered               or -2 if \C was encountered (in UTF-8 mode only)
1696               or -3 if an OP_RECURSE item was encountered and atend is FALSE               or -3 if an OP_RECURSE item was encountered and atend is FALSE
1697                 or -4 if an unknown opcode was encountered (internal error)
1698  */  */
1699    
1700  static int  static int
1701  find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1702  {  {
1703  int length = -1;  int length = -1;
1704    
1705  register int branchlength = 0;  register int branchlength = 0;
1706  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1707    
1708  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1709  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1493  branch, check the length against that of Line 1711  branch, check the length against that of
1711  for (;;)  for (;;)
1712    {    {
1713    int d;    int d;
1714    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1715    register int op = *cc;    register int op = *cc;
1716    
1717    switch (op)    switch (op)
1718      {      {
1719      /* We only need to continue for OP_CBRA (normal capturing bracket) and      /* We only need to continue for OP_CBRA (normal capturing bracket) and
1720      OP_BRA (normal non-capturing bracket) because the other variants of these      OP_BRA (normal non-capturing bracket) because the other variants of these
1721      opcodes are all concerned with unlimited repeated groups, which of course      opcodes are all concerned with unlimited repeated groups, which of course
1722      are not of fixed length. They will cause a -1 response from the default      are not of fixed length. */
     case of this switch. */  
1723    
1724      case OP_CBRA:      case OP_CBRA:
1725      case OP_BRA:      case OP_BRA:
1726      case OP_ONCE:      case OP_ONCE:
1727      case OP_ONCE_NC:      case OP_ONCE_NC:
1728      case OP_COND:      case OP_COND:
1729      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1730      if (d < 0) return d;      if (d < 0) return d;
1731      branchlength += d;      branchlength += d;
1732      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1733      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1734      break;      break;
1735    
1736      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1737      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1738      END it's the end of the outer call. All can be handled by the same code.      an ALT. If it is END it's the end of the outer call. All can be handled by
1739      Note that we must not include the OP_KETRxxx opcodes here, because they      the same code. Note that we must not include the OP_KETRxxx opcodes here,
1740      all imply an unlimited repeat. */      because they all imply an unlimited repeat. */
1741    
1742      case OP_ALT:      case OP_ALT:
1743      case OP_KET:      case OP_KET:
1744      case OP_END:      case OP_END:
1745        case OP_ACCEPT:
1746        case OP_ASSERT_ACCEPT:
1747      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1748        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1749      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1537  for (;;) Line 1757  for (;;)
1757    
1758      case OP_RECURSE:      case OP_RECURSE:
1759      if (!atend) return -3;      if (!atend) return -3;
1760      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1761      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1762      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1763      d = find_fixedlength(cs + 2, utf8, atend, cd);      d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1764      if (d < 0) return d;      if (d < 0) return d;
1765      branchlength += d;      branchlength += d;
1766      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1553  for (;;) Line 1773  for (;;)
1773      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1774      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1775      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1776      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1777        break;
1778    
1779      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1780    
1781      case OP_REVERSE:      case OP_MARK:
1782      case OP_CREF:      case OP_PRUNE_ARG:
1783      case OP_NCREF:      case OP_SKIP_ARG:
1784      case OP_RREF:      case OP_THEN_ARG:
1785      case OP_NRREF:      cc += cc[1] + PRIV(OP_lengths)[*cc];
1786      case OP_DEF:      break;
1787    
1788      case OP_CALLOUT:      case OP_CALLOUT:
     case OP_SOD:  
     case OP_SOM:  
     case OP_SET_SOM:  
     case OP_EOD:  
     case OP_EODN:  
1789      case OP_CIRC:      case OP_CIRC:
1790      case OP_CIRCM:      case OP_CIRCM:
1791        case OP_CLOSE:
1792        case OP_COMMIT:
1793        case OP_CREF:
1794        case OP_DEF:
1795      case OP_DOLL:      case OP_DOLL:
1796      case OP_DOLLM:      case OP_DOLLM:
1797        case OP_EOD:
1798        case OP_EODN:
1799        case OP_FAIL:
1800        case OP_NCREF:
1801        case OP_NRREF:
1802      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1803        case OP_PRUNE:
1804        case OP_REVERSE:
1805        case OP_RREF:
1806        case OP_SET_SOM:
1807        case OP_SKIP:
1808        case OP_SOD:
1809        case OP_SOM:
1810        case OP_THEN:
1811      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1812      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1813      break;      break;
1814    
1815      /* Handle literal characters */      /* Handle literal characters */
# Line 1586  for (;;) Line 1820  for (;;)
1820      case OP_NOTI:      case OP_NOTI:
1821      branchlength++;      branchlength++;
1822      cc += 2;      cc += 2;
1823  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1824      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1825  #endif  #endif
1826      break;      break;
1827    
# Line 1595  for (;;) Line 1829  for (;;)
1829      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1830    
1831      case OP_EXACT:      case OP_EXACT:
1832        case OP_EXACTI:
1833        case OP_NOTEXACT:
1834        case OP_NOTEXACTI:
1835      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1836      cc += 4;      cc += 2 + IMM2_SIZE;
1837  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1838      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1839  #endif  #endif
1840      break;      break;
1841    
1842      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1843      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1844      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1845      cc += 4;      cc += 1 + IMM2_SIZE + 1;
1846      break;      break;
1847    
1848      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1615  for (;;) Line 1852  for (;;)
1852      cc += 2;      cc += 2;
1853      /* Fall through */      /* Fall through */
1854    
1855        case OP_HSPACE:
1856        case OP_VSPACE:
1857        case OP_NOT_HSPACE:
1858        case OP_NOT_VSPACE:
1859      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1860      case OP_DIGIT:      case OP_DIGIT:
1861      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1627  for (;;) Line 1868  for (;;)
1868      cc++;      cc++;
1869      break;      break;
1870    
1871      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1872        otherwise \C is coded as OP_ALLANY. */
1873    
1874      case OP_ANYBYTE:      case OP_ANYBYTE:
1875      return -2;      return -2;
1876    
1877      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1878    
1879  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1880      case OP_XCLASS:      case OP_XCLASS:
1881      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1882      /* Fall through */      /* Fall through */
1883  #endif  #endif
1884    
1885      case OP_CLASS:      case OP_CLASS:
1886      case OP_NCLASS:      case OP_NCLASS:
1887      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1888    
1889      switch (*cc)      switch (*cc)
1890        {        {
1891          case OP_CRPLUS:
1892          case OP_CRMINPLUS:
1893        case OP_CRSTAR:        case OP_CRSTAR:
1894        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1895        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1654  for (;;) Line 1898  for (;;)
1898    
1899        case OP_CRRANGE:        case OP_CRRANGE:
1900        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1901        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1902        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
1903        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1904        break;        break;
1905    
1906        default:        default:
# Line 1666  for (;;) Line 1910  for (;;)
1910    
1911      /* Anything else is variable length */      /* Anything else is variable length */
1912    
1913      default:      case OP_ANYNL:
1914        case OP_BRAMINZERO:
1915        case OP_BRAPOS:
1916        case OP_BRAPOSZERO:
1917        case OP_BRAZERO:
1918        case OP_CBRAPOS:
1919        case OP_EXTUNI:
1920        case OP_KETRMAX:
1921        case OP_KETRMIN:
1922        case OP_KETRPOS:
1923        case OP_MINPLUS:
1924        case OP_MINPLUSI:
1925        case OP_MINQUERY:
1926        case OP_MINQUERYI:
1927        case OP_MINSTAR:
1928        case OP_MINSTARI:
1929        case OP_MINUPTO:
1930        case OP_MINUPTOI:
1931        case OP_NOTMINPLUS:
1932        case OP_NOTMINPLUSI:
1933        case OP_NOTMINQUERY:
1934        case OP_NOTMINQUERYI:
1935        case OP_NOTMINSTAR:
1936        case OP_NOTMINSTARI:
1937        case OP_NOTMINUPTO:
1938        case OP_NOTMINUPTOI:
1939        case OP_NOTPLUS:
1940        case OP_NOTPLUSI:
1941        case OP_NOTPOSPLUS:
1942        case OP_NOTPOSPLUSI:
1943        case OP_NOTPOSQUERY:
1944        case OP_NOTPOSQUERYI:
1945        case OP_NOTPOSSTAR:
1946        case OP_NOTPOSSTARI:
1947        case OP_NOTPOSUPTO:
1948        case OP_NOTPOSUPTOI:
1949        case OP_NOTQUERY:
1950        case OP_NOTQUERYI:
1951        case OP_NOTSTAR:
1952        case OP_NOTSTARI:
1953        case OP_NOTUPTO:
1954        case OP_NOTUPTOI:
1955        case OP_PLUS:
1956        case OP_PLUSI:
1957        case OP_POSPLUS:
1958        case OP_POSPLUSI:
1959        case OP_POSQUERY:
1960        case OP_POSQUERYI:
1961        case OP_POSSTAR:
1962        case OP_POSSTARI:
1963        case OP_POSUPTO:
1964        case OP_POSUPTOI:
1965        case OP_QUERY:
1966        case OP_QUERYI:
1967        case OP_REF:
1968        case OP_REFI:
1969        case OP_SBRA:
1970        case OP_SBRAPOS:
1971        case OP_SCBRA:
1972        case OP_SCBRAPOS:
1973        case OP_SCOND:
1974        case OP_SKIPZERO:
1975        case OP_STAR:
1976        case OP_STARI:
1977        case OP_TYPEMINPLUS:
1978        case OP_TYPEMINQUERY:
1979        case OP_TYPEMINSTAR:
1980        case OP_TYPEMINUPTO:
1981        case OP_TYPEPLUS:
1982        case OP_TYPEPOSPLUS:
1983        case OP_TYPEPOSQUERY:
1984        case OP_TYPEPOSSTAR:
1985        case OP_TYPEPOSUPTO:
1986        case OP_TYPEQUERY:
1987        case OP_TYPESTAR:
1988        case OP_TYPEUPTO:
1989        case OP_UPTO:
1990        case OP_UPTOI:
1991      return -1;      return -1;
1992    
1993        /* Catch unrecognized opcodes so that when new ones are added they
1994        are not forgotten, as has happened in the past. */
1995    
1996        default:
1997        return -4;
1998      }      }
1999    }    }
2000  /* Control never gets here */  /* Control never gets here */
# Line 1688  length. Line 2015  length.
2015    
2016  Arguments:  Arguments:
2017    code        points to start of expression    code        points to start of expression
2018    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2019    number      the required bracket number or negative to find a lookbehind    number      the required bracket number or negative to find a lookbehind
2020    
2021  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2022  */  */
2023    
2024  const uschar *  const pcre_uchar *
2025  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2026  {  {
2027  for (;;)  for (;;)
2028    {    {
# Line 1713  for (;;) Line 2040  for (;;)
2040    
2041    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
2042      {      {
2043      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
2044      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2045      }      }
2046    
2047    /* Handle capturing bracket */    /* Handle capturing bracket */
# Line 1723  for (;;) Line 2050  for (;;)
2050             c == OP_CBRAPOS || c == OP_SCBRAPOS)             c == OP_CBRAPOS || c == OP_SCBRAPOS)
2051      {      {
2052      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
2053      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2054      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2055      }      }
2056    
2057    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
# Line 1752  for (;;) Line 2079  for (;;)
2079        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2080        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2081        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2082        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2083            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2084        break;        break;
2085    
2086        case OP_MARK:        case OP_MARK:
# Line 1768  for (;;) Line 2096  for (;;)
2096    
2097      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2098    
2099      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2100    
2101    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2102    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2103    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2104    
2105  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2106      if (utf8) switch(c)      if (utf) switch(c)
2107        {        {
2108        case OP_CHAR:        case OP_CHAR:
2109        case OP_CHARI:        case OP_CHARI:
# Line 1805  for (;;) Line 2133  for (;;)
2133        case OP_MINQUERYI:        case OP_MINQUERYI:
2134        case OP_POSQUERY:        case OP_POSQUERY:
2135        case OP_POSQUERYI:        case OP_POSQUERYI:
2136        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2137        break;        break;
2138        }        }
2139  #else  #else
2140      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2141  #endif  #endif
2142      }      }
2143    }    }
# Line 1826  instance of OP_RECURSE. Line 2154  instance of OP_RECURSE.
2154    
2155  Arguments:  Arguments:
2156    code        points to start of expression    code        points to start of expression
2157    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2158    
2159  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2160  */  */
2161    
2162  static const uschar *  static const pcre_uchar *
2163  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2164  {  {
2165  for (;;)  for (;;)
2166    {    {
# Line 1871  for (;;) Line 2199  for (;;)
2199        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2200        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2201        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2202        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2203            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2204        break;        break;
2205    
2206        case OP_MARK:        case OP_MARK:
# Line 1887  for (;;) Line 2216  for (;;)
2216    
2217      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2218    
2219      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2220    
2221      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2222      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
2223      to arrange to skip the extra bytes. */      to arrange to skip the extra bytes. */
2224    
2225  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2226      if (utf8) switch(c)      if (utf) switch(c)
2227        {        {
2228        case OP_CHAR:        case OP_CHAR:
2229        case OP_CHARI:        case OP_CHARI:
2230          case OP_NOT:
2231          case OP_NOTI:
2232        case OP_EXACT:        case OP_EXACT:
2233        case OP_EXACTI:        case OP_EXACTI:
2234          case OP_NOTEXACT:
2235          case OP_NOTEXACTI:
2236        case OP_UPTO:        case OP_UPTO:
2237        case OP_UPTOI:        case OP_UPTOI:
2238          case OP_NOTUPTO:
2239          case OP_NOTUPTOI:
2240        case OP_MINUPTO:        case OP_MINUPTO:
2241        case OP_MINUPTOI:        case OP_MINUPTOI:
2242          case OP_NOTMINUPTO:
2243          case OP_NOTMINUPTOI:
2244        case OP_POSUPTO:        case OP_POSUPTO:
2245        case OP_POSUPTOI:        case OP_POSUPTOI:
2246          case OP_NOTPOSUPTO:
2247          case OP_NOTPOSUPTOI:
2248        case OP_STAR:        case OP_STAR:
2249        case OP_STARI:        case OP_STARI:
2250          case OP_NOTSTAR:
2251          case OP_NOTSTARI:
2252        case OP_MINSTAR:        case OP_MINSTAR:
2253        case OP_MINSTARI:        case OP_MINSTARI:
2254          case OP_NOTMINSTAR:
2255          case OP_NOTMINSTARI:
2256        case OP_POSSTAR:        case OP_POSSTAR:
2257        case OP_POSSTARI:        case OP_POSSTARI:
2258          case OP_NOTPOSSTAR:
2259          case OP_NOTPOSSTARI:
2260        case OP_PLUS:        case OP_PLUS:
2261        case OP_PLUSI:        case OP_PLUSI:
2262          case OP_NOTPLUS:
2263          case OP_NOTPLUSI:
2264        case OP_MINPLUS:        case OP_MINPLUS:
2265        case OP_MINPLUSI:        case OP_MINPLUSI:
2266          case OP_NOTMINPLUS:
2267          case OP_NOTMINPLUSI:
2268        case OP_POSPLUS:        case OP_POSPLUS:
2269        case OP_POSPLUSI:        case OP_POSPLUSI:
2270          case OP_NOTPOSPLUS:
2271          case OP_NOTPOSPLUSI:
2272        case OP_QUERY:        case OP_QUERY:
2273        case OP_QUERYI:        case OP_QUERYI:
2274          case OP_NOTQUERY:
2275          case OP_NOTQUERYI:
2276        case OP_MINQUERY:        case OP_MINQUERY:
2277        case OP_MINQUERYI:        case OP_MINQUERYI:
2278          case OP_NOTMINQUERY:
2279          case OP_NOTMINQUERYI:
2280        case OP_POSQUERY:        case OP_POSQUERY:
2281        case OP_POSQUERYI:        case OP_POSQUERYI:
2282        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_NOTPOSQUERY:
2283          case OP_NOTPOSQUERYI:
2284          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2285        break;        break;
2286        }        }
2287  #else  #else
2288      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2289  #endif  #endif
2290      }      }
2291    }    }
# Line 1951  bracket whose current branch will alread Line 2308  bracket whose current branch will alread
2308  Arguments:  Arguments:
2309    code        points to start of search    code        points to start of search
2310    endcode     points to where to stop    endcode     points to where to stop
2311    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2312    cd          contains pointers to tables etc.    cd          contains pointers to tables etc.
2313    
2314  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2315  */  */
2316    
2317  static BOOL  static BOOL
2318  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2319    compile_data *cd)    BOOL utf, compile_data *cd)
2320  {  {
2321  register int c;  register int c;
2322  for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2323       code < endcode;       code < endcode;
2324       code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2325    {    {
2326    const uschar *ccode;    const pcre_uchar *ccode;
2327    
2328    c = *code;    c = *code;
2329    
# Line 1989  for (code = first_significant_code(code Line 2346  for (code = first_significant_code(code
2346    
2347    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2348      {      {
2349      const uschar *scode;      const pcre_uchar *scode;
2350      BOOL empty_branch;      BOOL empty_branch;
2351    
2352      /* Test for forward reference */      /* Test for forward reference */
# Line 2007  for (code = first_significant_code(code Line 2364  for (code = first_significant_code(code
2364    
2365      do      do
2366        {        {
2367        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf, cd))
2368          {          {
2369          empty_branch = TRUE;          empty_branch = TRUE;
2370          break;          break;
# Line 2025  for (code = first_significant_code(code Line 2382  for (code = first_significant_code(code
2382    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2383        c == OP_BRAPOSZERO)        c == OP_BRAPOSZERO)
2384      {      {
2385      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2386      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
2387      c = *code;      c = *code;
2388      continue;      continue;
# Line 2063  for (code = first_significant_code(code Line 2420  for (code = first_significant_code(code
2420        empty_branch = FALSE;        empty_branch = FALSE;
2421        do        do
2422          {          {
2423          if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2424            empty_branch = TRUE;            empty_branch = TRUE;
2425          code += GET(code, 1);          code += GET(code, 1);
2426          }          }
# Line 2081  for (code = first_significant_code(code Line 2438  for (code = first_significant_code(code
2438      {      {
2439      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2440      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2441      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2442      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2443      here. */      here. */
2444    
2445  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2446      case OP_XCLASS:      case OP_XCLASS:
2447      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2448      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 2093  for (code = first_significant_code(code Line 2450  for (code = first_significant_code(code
2450    
2451      case OP_CLASS:      case OP_CLASS:
2452      case OP_NCLASS:      case OP_NCLASS:
2453      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2454    
2455  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2456      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2457  #endif  #endif
2458    
# Line 2168  for (code = first_significant_code(code Line 2525  for (code = first_significant_code(code
2525      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2526      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2527      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2528      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP
2529          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2530      break;      break;
2531    
2532      /* End of branch */      /* End of branch */
# Line 2183  for (code = first_significant_code(code Line 2541  for (code = first_significant_code(code
2541      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2542      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
2543    
2544  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2545      case OP_STAR:      case OP_STAR:
2546      case OP_STARI:      case OP_STARI:
2547      case OP_MINSTAR:      case OP_MINSTAR:
# Line 2196  for (code = first_significant_code(code Line 2554  for (code = first_significant_code(code
2554      case OP_MINQUERYI:      case OP_MINQUERYI:
2555      case OP_POSQUERY:      case OP_POSQUERY:
2556      case OP_POSQUERYI:      case OP_POSQUERYI:
2557      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2558      break;      break;
2559    
2560      case OP_UPTO:      case OP_UPTO:
# Line 2205  for (code = first_significant_code(code Line 2563  for (code = first_significant_code(code
2563      case OP_MINUPTOI:      case OP_MINUPTOI:
2564      case OP_POSUPTO:      case OP_POSUPTO:
2565      case OP_POSUPTOI:      case OP_POSUPTOI:
2566      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2567      break;      break;
2568  #endif  #endif
2569    
# Line 2249  Arguments: Line 2607  Arguments:
2607    code        points to start of the recursion    code        points to start of the recursion
2608    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2609    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2610    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2611    cd          pointers to tables etc    cd          pointers to tables etc
2612    
2613  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2614  */  */
2615    
2616  static BOOL  static BOOL
2617  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2618    BOOL utf8, compile_data *cd)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2619  {  {
2620  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2621    {    {
2622    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2623      return FALSE;      return FALSE;
2624    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2625    }    }
# Line 2313  Returns:   TRUE or FALSE Line 2671  Returns:   TRUE or FALSE
2671  */  */
2672    
2673  static BOOL  static BOOL
2674  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2675  {  {
2676  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2677  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
# Line 2357  Returns:     a value representing the na Line 2715  Returns:     a value representing the na
2715  */  */
2716    
2717  static int  static int
2718  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2719  {  {
2720  const char *pn = posix_names;  const char *pn = posix_names;
2721  register int yield = 0;  register int yield = 0;
2722  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2723    {    {
2724    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2725      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2726    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2727    yield++;    yield++;
2728    }    }
# Line 2396  value in the reference (which is a group Line 2754  value in the reference (which is a group
2754  Arguments:  Arguments:
2755    group      points to the start of the group    group      points to the start of the group
2756    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
2757    utf8       TRUE in UTF-8 mode    utf        TRUE in UTF-8 / UTF-16 mode
2758    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
2759    save_hwm   the hwm forward reference pointer at the start of the group    save_hwm   the hwm forward reference pointer at the start of the group
2760    
# Line 2404  Returns:     nothing Line 2762  Returns:     nothing
2762  */  */
2763    
2764  static void  static void
2765  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2766    uschar *save_hwm)    pcre_uchar *save_hwm)
2767  {  {
2768  uschar *ptr = group;  pcre_uchar *ptr = group;
2769    
2770  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2771    {    {
2772    int offset;    int offset;
2773    uschar *hc;    pcre_uchar *hc;
2774    
2775    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2776    reference. */    reference. */
# Line 2457  Arguments: Line 2815  Arguments:
2815  Returns:         new code pointer  Returns:         new code pointer
2816  */  */
2817    
2818  static uschar *  static pcre_uchar *
2819  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2820  {  {
2821  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2822  *code++ = 255;  *code++ = 255;
2823  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2824  PUT(code, LINK_SIZE, 0);                       /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2825  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2826  }  }
2827    
2828    
# Line 2486  Returns:             nothing Line 2844  Returns:             nothing
2844  */  */
2845    
2846  static void  static void
2847  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2848  {  {
2849  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2850  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
# Line 2569  switch(ptype) Line 2927  switch(ptype)
2927            prop->chartype == ucp_Lt) == negated;            prop->chartype == ucp_Lt) == negated;
2928    
2929    case PT_GC:    case PT_GC:
2930    return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2931    
2932    case PT_PC:    case PT_PC:
2933    return (pdata == prop->chartype) == negated;    return (pdata == prop->chartype) == negated;
# Line 2580  switch(ptype) Line 2938  switch(ptype)
2938    /* These are specials */    /* These are specials */
2939    
2940    case PT_ALNUM:    case PT_ALNUM:
2941    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2942            _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2943    
2944    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2945    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2946            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2947            == negated;            == negated;
2948    
2949    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2950    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2951            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2952            c == CHAR_FF || c == CHAR_CR)            c == CHAR_FF || c == CHAR_CR)
2953            == negated;            == negated;
2954    
2955    case PT_WORD:    case PT_WORD:
2956    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2957            _pcre_ucp_gentype[prop->chartype] == ucp_N ||            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2958            c == CHAR_UNDERSCORE) == negated;            c == CHAR_UNDERSCORE) == negated;
2959    }    }
2960  return FALSE;  return FALSE;
# Line 2615  sense to automatically possessify the re Line 2973  sense to automatically possessify the re
2973    
2974  Arguments:  Arguments:
2975    previous      pointer to the repeated opcode    previous      pointer to the repeated opcode
2976    utf8          TRUE in UTF-8 mode    utf           TRUE in UTF-8 / UTF-16 mode
2977    ptr           next character in pattern    ptr           next character in pattern
2978    options       options bits    options       options bits
2979    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 2624  Returns:        TRUE if possessifying is Line 2982  Returns:        TRUE if possessifying is
2982  */  */
2983    
2984  static BOOL  static BOOL
2985  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2986    int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2987  {  {
2988  int c, next;  pcre_int32 c, next;
2989  int op_code = *previous++;  int op_code = *previous++;
2990    
2991  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2636  if ((options & PCRE_EXTENDED) != 0) Line 2994  if ((options & PCRE_EXTENDED) != 0)
2994    {    {
2995    for (;;)    for (;;)
2996      {      {
2997      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2998      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2999        {        {
3000        ptr++;        ptr++;
# Line 2644  if ((options & PCRE_EXTENDED) != 0) Line 3002  if ((options & PCRE_EXTENDED) != 0)
3002          {          {
3003          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3004          ptr++;          ptr++;
3005  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3006          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3007  #endif  #endif
3008          }          }
3009        }        }
# Line 2663  if (*ptr == CHAR_BACKSLASH) Line 3021  if (*ptr == CHAR_BACKSLASH)
3021    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
3022    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
3023    }    }
3024    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
3025    {    {
3026  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3027    if (utf8) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
3028  #endif  #endif
3029    next = *ptr++;    next = *ptr++;
3030    }    }
   
3031  else return FALSE;  else return FALSE;
3032    
3033  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2680  if ((options & PCRE_EXTENDED) != 0) Line 3036  if ((options & PCRE_EXTENDED) != 0)
3036    {    {
3037    for (;;)    for (;;)
3038      {      {
3039      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3040      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3041        {        {
3042        ptr++;        ptr++;
# Line 2688  if ((options & PCRE_EXTENDED) != 0) Line 3044  if ((options & PCRE_EXTENDED) != 0)
3044          {          {
3045          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3046          ptr++;          ptr++;
3047  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3048          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3049  #endif  #endif
3050          }          }
3051        }        }
# Line 2700  if ((options & PCRE_EXTENDED) != 0) Line 3056  if ((options & PCRE_EXTENDED) != 0)
3056  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
3057    
3058  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3059    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3060      return FALSE;      return FALSE;
3061    
3062  /* Now compare the next item with the previous opcode. First, handle cases when  /* Now compare the next item with the previous opcode. First, handle cases when
# Line 2709  the next item is a character. */ Line 3065  the next item is a character. */
3065  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3066    {    {
3067    case OP_CHAR:    case OP_CHAR:
3068  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3069    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3070  #else  #else
3071    c = *previous;    c = *previous;
# Line 2721  if (next >= 0) switch(op_code) Line 3077  if (next >= 0) switch(op_code)
3077    high-valued characters. */    high-valued characters. */
3078    
3079    case OP_CHARI:    case OP_CHARI:
3080  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3081    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3082  #else  #else
3083    c = *previous;    c = *previous;
3084  #endif  #endif
3085    if (c == next) return FALSE;    if (c == next) return FALSE;
3086  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3087    if (utf8)    if (utf)
3088      {      {
3089      unsigned int othercase;      unsigned int othercase;
3090      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2740  if (next >= 0) switch(op_code) Line 3096  if (next >= 0) switch(op_code)
3096      return (unsigned int)c != othercase;      return (unsigned int)c != othercase;
3097      }      }
3098    else    else
3099  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3100    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
   
   /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These  
   opcodes are not used for multi-byte characters, because they are coded using  
   an XCLASS instead. */  
3101    
3102    case OP_NOT:    case OP_NOT:
3103    return (c = *previous) == next;  #ifdef SUPPORT_UTF
3104      GETCHARTEST(c, previous);
3105    #else
3106      c = *previous;
3107    #endif
3108      return c == next;
3109    
3110    case OP_NOTI:    case OP_NOTI:
3111    if ((c = *previous) == next) return TRUE;  #ifdef SUPPORT_UTF
3112  #ifdef SUPPORT_UTF8    GETCHARTEST(c, previous);
3113    if (utf8)  #else
3114      c = *previous;
3115    #endif
3116      if (c == next) return TRUE;
3117    #ifdef SUPPORT_UTF
3118      if (utf)
3119      {      {
3120      unsigned int othercase;      unsigned int othercase;
3121      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
3122  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3123      othercase = UCD_OTHERCASE(next);      othercase = UCD_OTHERCASE((unsigned int)next);
3124  #else  #else
3125      othercase = NOTACHAR;      othercase = NOTACHAR;
3126  #endif  #endif
3127      return (unsigned int)c == othercase;      return (unsigned int)c == othercase;
3128      }      }
3129    else    else
3130  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3131    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3132    
3133    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3134    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3135    
3136    case OP_DIGIT:    case OP_DIGIT:
3137    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3138    
3139    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3140    return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3141    
3142    case OP_WHITESPACE:    case OP_WHITESPACE:
3143    return next > 127 || (cd->ctypes[next] & ctype_space) == 0;    return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3144    
3145    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3146    return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3147    
3148    case OP_WORDCHAR:    case OP_WORDCHAR:
3149    return next > 127 || (cd->ctypes[next] & ctype_word) == 0;    return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3150    
3151    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3152    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3153    
3154    case OP_HSPACE:    case OP_HSPACE:
3155    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
# Line 2857  switch(op_code) Line 3219  switch(op_code)
3219    {    {
3220    case OP_CHAR:    case OP_CHAR:
3221    case OP_CHARI:    case OP_CHARI:
3222  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3223    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3224  #else  #else
3225    c = *previous;    c = *previous;
# Line 2865  switch(op_code) Line 3227  switch(op_code)
3227    switch(-next)    switch(-next)
3228      {      {
3229      case ESC_d:      case ESC_d:
3230      return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;      return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3231    
3232      case ESC_D:      case ESC_D:
3233      return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3234    
3235      case ESC_s:      case ESC_s:
3236      return c > 127 || (cd->ctypes[c] & ctype_space) == 0;      return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3237    
3238      case ESC_S:      case ESC_S:
3239      return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3240    
3241      case ESC_w:      case ESC_w:
3242      return c > 127 || (cd->ctypes[c] & ctype_word) == 0;      return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3243    
3244      case ESC_W:      case ESC_W:
3245      return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3246    
3247      case ESC_h:      case ESC_h:
3248      case ESC_H:      case ESC_H:
# Line 2962  switch(op_code) Line 3324  switch(op_code)
3324        to the original \d etc. At this point, ptr will point to a zero byte. */        to the original \d etc. At this point, ptr will point to a zero byte. */
3325    
3326        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3327          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3328            return FALSE;            return FALSE;
3329    
3330        /* Do the property check. */        /* Do the property check. */
# Line 2989  switch(op_code) Line 3351  switch(op_code)
3351    return next == -ESC_d;    return next == -ESC_d;
3352    
3353    case OP_WHITESPACE:    case OP_WHITESPACE:
3354    return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;    return next == -ESC_S || next == -ESC_d || next == -ESC_w;
3355    
3356    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3357    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
3358    
3359    case OP_HSPACE:    case OP_HSPACE:
3360    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
# Line 3040  Arguments: Line 3402  Arguments:
3402    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3403    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3404    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3405    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3406    reqbyteptr     set to the last literal character required, else < 0    reqcharptr     set to the last literal character required, else < 0
3407    bcptr          points to current branch chain    bcptr          points to current branch chain
3408    cond_depth     conditional nesting depth    cond_depth     conditional nesting depth
3409    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
# Line 3053  Returns:         TRUE on success Line 3415  Returns:         TRUE on success
3415  */  */
3416    
3417  static BOOL  static BOOL
3418  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3419    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3420    int cond_depth, compile_data *cd, int *lengthptr)    pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3421      compile_data *cd, int *lengthptr)
3422  {  {
3423  int repeat_type, op_type;  int repeat_type, op_type;
3424  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3425  int bravalue = 0;  int bravalue = 0;
3426  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3427  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
3428  int zeroreqbyte, zerofirstbyte;  pcre_int32 zeroreqchar, zerofirstchar;
3429  int req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
3430  int options = *optionsptr;               /* May change dynamically */  int options = *optionsptr;               /* May change dynamically */
3431  int after_manual_callout = 0;  int after_manual_callout = 0;
3432  int length_prevgroup = 0;  int length_prevgroup = 0;
3433  register int c;  register int c;
3434  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3435  uschar *last_code = code;  pcre_uchar *last_code = code;
3436  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3437  uschar *tempcode;  pcre_uchar *tempcode;
3438  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3439  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3440  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3441  const uschar *tempptr;  const pcre_uchar *tempptr;
3442  const uschar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
3443  uschar *previous = NULL;  pcre_uchar *previous = NULL;
3444  uschar *previous_callout = NULL;  pcre_uchar *previous_callout = NULL;
3445  uschar *save_hwm = NULL;  pcre_uchar *save_hwm = NULL;
3446  uschar classbits[32];  pcre_uint8 classbits[32];
3447    
3448  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3449  must not do this for other options (e.g. PCRE_EXTENDED) because they may change  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3450  dynamically as we process the pattern. */  dynamically as we process the pattern. */
3451    
3452  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3453  BOOL class_utf8;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3454  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
3455  uschar *class_utf8data;  pcre_uchar utf_chars[6];
 uschar *class_utf8data_base;  
 uschar utf8_char[6];  
3456  #else  #else
3457  BOOL utf8 = FALSE;  BOOL utf = FALSE;
3458    #endif
3459    
3460    /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3461    
3462    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3463    BOOL xclass;
3464    pcre_uchar *class_uchardata;
3465    pcre_uchar *class_uchardata_base;
3466  #endif  #endif
3467    
3468  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 3107  greedy_non_default = greedy_default ^ 1; Line 3476  greedy_non_default = greedy_default ^ 1;
3476    
3477  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3478  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3479  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3480  find one.  find one.
3481    
3482  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3483  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3484  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3485  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3486    
3487  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3488    
3489  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3490  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3491  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS leaves the lower 28 bits empty. It is added into the
3492  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3493    value. This is used only for ASCII characters. */
3494    
3495  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3496    
3497  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3498    
# Line 3134  for (;; ptr++) Line 3504  for (;; ptr++)
3504    BOOL is_quantifier;    BOOL is_quantifier;
3505    BOOL is_recurse;    BOOL is_recurse;
3506    BOOL reset_bracount;    BOOL reset_bracount;
3507    int class_charcount;    int class_has_8bitchar;
3508    int class_lastchar;    int class_single_char;
3509    int newoptions;    int newoptions;
3510    int recno;    int recno;
3511    int refsign;    int refsign;
3512    int skipbytes;    int skipbytes;
3513    int subreqbyte;    int subreqchar;
3514    int subfirstbyte;    int subfirstchar;
3515    int terminator;    int terminator;
3516    int mclength;    int mclength;
3517    int tempbracount;    int tempbracount;
3518    uschar mcbuffer[8];    pcre_uchar mcbuffer[8];
3519    
3520    /* Get next byte in the pattern */    /* Get next character in the pattern */
3521    
3522    c = *ptr;    c = *ptr;
3523    
# Line 3169  for (;; ptr++) Line 3539  for (;; ptr++)
3539  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3540      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3541  #endif  #endif
3542      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3543            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3544        {        {
3545        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3546        goto FAILED;        goto FAILED;
# Line 3192  for (;; ptr++) Line 3563  for (;; ptr++)
3563        }        }
3564    
3565      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3566      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3567          (int)(code - last_code), c, c));
3568    
3569      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3570      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 3202  for (;; ptr++) Line 3574  for (;; ptr++)
3574        {        {
3575        if (previous > orig_code)        if (previous > orig_code)
3576          {          {
3577          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3578          code -= previous - orig_code;          code -= previous - orig_code;
3579          previous = orig_code;          previous = orig_code;
3580          }          }
# Line 3218  for (;; ptr++) Line 3590  for (;; ptr++)
3590    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3591    reference list. */    reference list. */
3592    
3593    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3594               WORK_SIZE_SAFETY_MARGIN)
3595      {      {
3596      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3597      goto FAILED;      goto FAILED;
# Line 3270  for (;; ptr++) Line 3643  for (;; ptr++)
3643    
3644    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3645      {      {
3646      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3647      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3648        {        {
3649        ptr++;        ptr++;
# Line 3278  for (;; ptr++) Line 3651  for (;; ptr++)
3651          {          {
3652          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3653          ptr++;          ptr++;
3654  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3655          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3656  #endif  #endif
3657          }          }
3658        if (*ptr != 0) continue;        if (*ptr != 0) continue;
# Line 3303  for (;; ptr++) Line 3676  for (;; ptr++)
3676      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3677      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3678      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
3679      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3680      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
3681      *codeptr = code;      *codeptr = code;
3682      *ptrptr = ptr;      *ptrptr = ptr;
3683      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 3328  for (;; ptr++) Line 3701  for (;; ptr++)
3701      previous = NULL;      previous = NULL;
3702      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3703        {        {
3704        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3705        *code++ = OP_CIRCM;        *code++ = OP_CIRCM;
3706        }        }
3707      else *code++ = OP_CIRC;      else *code++ = OP_CIRC;
# Line 3340  for (;; ptr++) Line 3713  for (;; ptr++)
3713      break;      break;
3714    
3715      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3716      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3717    
3718      case CHAR_DOT:      case CHAR_DOT:
3719      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3720      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
3721      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
3722      previous = code;      previous = code;
3723      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3724      break;      break;
# Line 3400  for (;; ptr++) Line 3773  for (;; ptr++)
3773          {          {
3774          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
3775            ptr++;            ptr++;
3776          else if (strncmp((const char *)ptr+1,          else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
                           STR_Q STR_BACKSLASH STR_E, 3) == 0)  
3777            ptr += 3;            ptr += 3;
3778          else          else
3779            break;            break;
# Line 3420  for (;; ptr++) Line 3792  for (;; ptr++)
3792          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3793        {        {
3794        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3795        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3796        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
3797        break;        break;
3798        }        }
3799    
# Line 3431  for (;; ptr++) Line 3803  for (;; ptr++)
3803    
3804      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3805    
3806      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3807      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3808      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3809        a single character. */
3810    
3811      class_charcount = 0;      class_has_8bitchar = 0;
3812      class_lastchar = -1;      class_single_char = 0;
3813    
3814      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3815      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
3816      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3817      */      */
3818    
3819      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3820    
3821  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3822      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;                           /* No chars >= 256 */
3823      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
3824      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3825  #endif  #endif
3826    
3827      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3457  for (;; ptr++) Line 3830  for (;; ptr++)
3830    
3831      if (c != 0) do      if (c != 0) do
3832        {        {
3833        const uschar *oldptr;        const pcre_uchar *oldptr;
3834    
3835  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3836        if (utf8 && c > 127)        if (utf && HAS_EXTRALEN(c))
3837          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3838          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3839          }          }
3840    #endif
3841    
3842        /* In the pre-compile phase, accumulate the length of any UTF-8 extra  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3843          /* In the pre-compile phase, accumulate the length of any extra
3844        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
3845        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
3846        (which is on the stack). */        (which is on the stack). */
3847    
3848        if (lengthptr != NULL)        if (lengthptr != NULL)
3849          {          {
3850          *lengthptr += class_utf8data - class_utf8data_base;          *lengthptr += class_uchardata - class_uchardata_base;
3851          class_utf8data = class_utf8data_base;          class_uchardata = class_uchardata_base;
3852          }          }
   
3853  #endif  #endif
3854    
3855        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 3503  for (;; ptr++) Line 3877  for (;; ptr++)
3877          {          {
3878          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3879          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3880          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3881          uschar pbits[32];          pcre_uint8 pbits[32];
3882    
3883          if (ptr[1] != CHAR_COLON)          if (ptr[1] != CHAR_COLON)
3884            {            {
# Line 3559  for (;; ptr++) Line 3933  for (;; ptr++)
3933          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3934    
3935          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3936            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3937    
3938          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3939    
# Line 3590  for (;; ptr++) Line 3964  for (;; ptr++)
3964            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3965    
3966          ptr = tempptr + 1;          ptr = tempptr + 1;
3967          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3968            class_has_8bitchar = 1;
3969            /* Every class contains at least two characters. */
3970            class_single_char = 2;
3971          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3972          }          }
3973    
3974        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3975        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3976        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3977        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3978        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3979        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3980          as literal characters (by default), or are faulted if
3981        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3982    
3983        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3608  for (;; ptr++) Line 3986  for (;; ptr++)
3986          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3987    
3988          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3989            else if (-c == ESC_N)            /* \N is not supported in a class */
3990              {
3991              *errorcodeptr = ERR71;
3992              goto FAILED;
3993              }
3994          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3995            {            {
3996            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3621  for (;; ptr++) Line 4004  for (;; ptr++)
4004    
4005          if (c < 0)          if (c < 0)
4006            {            {
4007            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
4008            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
4009              class_has_8bitchar++;
4010              /* Every class contains at least two characters. */
4011              class_single_char += 2;
4012    
4013            switch (-c)            switch (-c)
4014              {              {
# Line 3635  for (;; ptr++) Line 4021  for (;; ptr++)
4021              case ESC_SU:              case ESC_SU:
4022              nestptr = ptr;              nestptr = ptr;
4023              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
4024              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
4025              continue;              continue;
4026  #endif  #endif
4027              case ESC_d:              case ESC_d:
# Line 3676  for (;; ptr++) Line 4062  for (;; ptr++)
4062              SETBIT(classbits, 0x09); /* VT */              SETBIT(classbits, 0x09); /* VT */
4063              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
4064              SETBIT(classbits, 0xa0); /* NSBP */              SETBIT(classbits, 0xa0); /* NSBP */
4065  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4066              if (utf8)              xclass = TRUE;
4067                *class_uchardata++ = XCL_SINGLE;
4068                *class_uchardata++ = 0x1680;
4069                *class_uchardata++ = XCL_SINGLE;
4070                *class_uchardata++ = 0x180e;
4071                *class_uchardata++ = XCL_RANGE;
4072                *class_uchardata++ = 0x2000;
4073                *class_uchardata++ = 0x200a;
4074                *class_uchardata++ = XCL_SINGLE;
4075                *class_uchardata++ = 0x202f;
4076                *class_uchardata++ = XCL_SINGLE;
4077                *class_uchardata++ = 0x205f;
4078                *class_uchardata++ = XCL_SINGLE;
4079                *class_uchardata++ = 0x3000;
4080    #elif defined SUPPORT_UTF
4081                if (utf)
4082                {                {
4083                class_utf8 = TRUE;                xclass = TRUE;
4084                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4085                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4086                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4087                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4088                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4089                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4090                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4091                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4092                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4093                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4094                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4095                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4096                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4097                }                }
4098  #endif  #endif
4099              continue;              continue;
# Line 3710  for (;; ptr++) Line 4111  for (;; ptr++)
4111                  }                  }
4112                classbits[c] |= x;                classbits[c] |= x;
4113                }                }
4114    #ifndef COMPILE_PCRE8
4115  #ifdef SUPPORT_UTF8              xclass = TRUE;
4116              if (utf8)              *class_uchardata++ = XCL_RANGE;
4117                *class_uchardata++ = 0x0100;
4118                *class_uchardata++ = 0x167f;
4119                *class_uchardata++ = XCL_RANGE;
4120                *class_uchardata++ = 0x1681;
4121                *class_uchardata++ = 0x180d;
4122                *class_uchardata++ = XCL_RANGE;
4123                *class_uchardata++ = 0x180f;
4124                *class_uchardata++ = 0x1fff;
4125                *class_uchardata++ = XCL_RANGE;
4126                *class_uchardata++ = 0x200b;
4127                *class_uchardata++ = 0x202e;
4128                *class_uchardata++ = XCL_RANGE;
4129                *class_uchardata++ = 0x2030;
4130                *class_uchardata++ = 0x205e;
4131                *class_uchardata++ = XCL_RANGE;
4132                *class_uchardata++ = 0x2060;
4133                *class_uchardata++ = 0x2fff;
4134                *class_uchardata++ = XCL_RANGE;
4135                *class_uchardata++ = 0x3001;
4136    #ifdef SUPPORT_UTF
4137                if (utf)
4138                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4139                else
4140    #endif
4141                  *class_uchardata++ = 0xffff;
4142    #elif defined SUPPORT_UTF
4143                if (utf)
4144                {                {
4145                class_utf8 = TRUE;                xclass = TRUE;
4146                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4147                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4148                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4149                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4150                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4151                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4152                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4153                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4154                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4155                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4156                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4157                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4158                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4159                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4160                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4161                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4162                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4163                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4164                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4165                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4166                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4167                }                }
4168  #endif  #endif
4169              continue;              continue;
# Line 3746  for (;; ptr++) Line 4174  for (;; ptr++)
4174              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, 0x0c); /* FF */
4175              SETBIT(classbits, 0x0d); /* CR */              SETBIT(classbits, 0x0d); /* CR */
4176              SETBIT(classbits, 0x85); /* NEL */              SETBIT(classbits, 0x85); /* NEL */
4177  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4178              if (utf8)              xclass = TRUE;
4179                *class_uchardata++ = XCL_RANGE;
4180                *class_uchardata++ = 0x2028;
4181                *class_uchardata++ = 0x2029;
4182    #elif defined SUPPORT_UTF
4183                if (utf)
4184                {                {
4185                class_utf8 = TRUE;                xclass = TRUE;
4186                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4187                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4188                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4189                }                }
4190  #endif  #endif
4191              continue;              continue;
# Line 3774  for (;; ptr++) Line 4207  for (;; ptr++)
4207                classbits[c] |= x;                classbits[c] |= x;
4208                }                }
4209    
4210  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4211              if (utf8)              xclass = TRUE;
4212                *class_uchardata++ = XCL_RANGE;
4213                *class_uchardata++ = 0x0100;
4214                *class_uchardata++ = 0x2027;
4215                *class_uchardata++ = XCL_RANGE;
4216                *class_uchardata++ = 0x202a;
4217    #ifdef SUPPORT_UTF
4218                if (utf)
4219                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4220                else
4221    #endif
4222                  *class_uchardata++ = 0xffff;
4223    #elif defined SUPPORT_UTF
4224                if (utf)
4225                {                {
4226                class_utf8 = TRUE;                xclass = TRUE;
4227                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4228                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4229                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4230                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4231                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4232                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4233                }                }
4234  #endif  #endif
4235              continue;              continue;
# Line 3796  for (;; ptr++) Line 4242  for (;; ptr++)
4242                int pdata;                int pdata;
4243                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4244                if (ptype < 0) goto FAILED;                if (ptype < 0) goto FAILED;
4245                class_utf8 = TRUE;                xclass = TRUE;
4246                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_uchardata++ = ((-c == ESC_p) != negated)?
4247                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4248                *class_utf8data++ = ptype;                *class_uchardata++ = ptype;
4249                *class_utf8data++ = pdata;                *class_uchardata++ = pdata;
4250                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4251                continue;                continue;
4252                }                }
4253  #endif  #endif
# Line 3815  for (;; ptr++) Line 4261  for (;; ptr++)
4261                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4262                goto FAILED;                goto FAILED;
4263                }                }
4264              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4265              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4266                c = *ptr;                /* Get the final character and fall through */
4267              break;              break;
4268              }              }
4269            }            }
4270    
4271          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4272          greater than 256 in UTF-8 mode. */          greater than 256. */
4273    
4274          }   /* End of backslash handling */          }   /* End of backslash handling */
4275    
# Line 3870  for (;; ptr++) Line 4317  for (;; ptr++)
4317            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4318            }            }
4319    
4320  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4321          if (utf8)          if (utf)
4322            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4323            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4324            }            }
# Line 3915  for (;; ptr++) Line 4362  for (;; ptr++)
4362    
4363          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4364    
4365            /* Since we found a character range, single character optimizations
4366            cannot be done anymore. */
4367            class_single_char = 2;
4368    
4369          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4370          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4371          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4372          available. */          available. */
4373    
4374  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4375          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4376    #elif defined  SUPPORT_UTF
4377            if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4378    #elif !(defined COMPILE_PCRE8)
4379            if (d > 255)
4380    #endif
4381    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4382            {            {
4383            class_utf8 = TRUE;            xclass = TRUE;
4384    
4385            /* With UCP support, we can find the other case equivalents of            /* With UCP support, we can find the other case equivalents of
4386            the relevant characters. There may be several ranges. Optimize how            the relevant characters. There may be several ranges. Optimize how
4387            they fit with the basic range. */            they fit with the basic range. */
4388    
4389  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4390    #ifndef COMPILE_PCRE8
4391              if (utf && (options & PCRE_CASELESS) != 0)
4392    #else
4393            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4394    #endif
4395              {              {
4396              unsigned int occ, ocd;              unsigned int occ, ocd;
4397              unsigned int cc = c;              unsigned int cc = c;
# Line 3956  for (;; ptr++) Line 4417  for (;; ptr++)
4417    
4418                if (occ == ocd)                if (occ == ocd)
4419                  {                  {
4420                  *class_utf8data++ = XCL_SINGLE;                  *class_uchardata++ = XCL_SINGLE;
4421                  }                  }
4422                else                else
4423                  {                  {
4424                  *class_utf8data++ = XCL_RANGE;                  *class_uchardata++ = XCL_RANGE;
4425                  class_utf8data += _pcre_ord2utf8(occ, class_utf8data);                  class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4426                  }                  }
4427                class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);                class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4428                }                }
4429              }              }
4430  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 3971  for (;; ptr++) Line 4432  for (;; ptr++)
4432            /* Now record the original range, possibly modified for UCP caseless            /* Now record the original range, possibly modified for UCP caseless
4433            overlapping ranges. */            overlapping ranges. */
4434    
4435            *class_utf8data++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4436            class_utf8data += _pcre_ord2utf8(c, class_utf8data);  #ifdef SUPPORT_UTF
4437            class_utf8data += _pcre_ord2utf8(d, class_utf8data);  #ifndef COMPILE_PCRE8
4438              if (utf)
4439                {
4440                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4441                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4442                }
4443              else
4444                {
4445                *class_uchardata++ = c;
4446                *class_uchardata++ = d;
4447                }
4448    #else
4449              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4450              class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4451    #endif
4452    #else /* SUPPORT_UTF */
4453              *class_uchardata++ = c;
4454              *class_uchardata++ = d;
4455    #endif /* SUPPORT_UTF */
4456    
4457            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4458            caseless matching for UTF-8 characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
4459            for the smaller ones. */            for the smaller ones. As for 16 bit characters without UTF, we
4460              can still use the bit map for the part of the range up to 255. */
4461    
4462  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4463            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4464  #else            if (utf)
4465            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  #endif
4466                continue;    /* With next character in the class */
4467    #endif  /* SUPPORT_UCP */
4468    
4469    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4470              if (utf)
4471                {
4472                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4473                /* Adjust upper limit and fall through to set up the map */
4474                d = 127;
4475                }
4476              else
4477                {
4478                if (c > 255) continue;
4479                /* Adjust upper limit and fall through to set up the map */
4480                d = 255;
4481                }
4482    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4483              if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4484            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
   
4485            d = 127;            d = 127;
4486    #else
4487  #endif  /* SUPPORT_UCP */            if (c > 255) continue;
4488              /* Adjust upper limit and fall through to set up the map */
4489              d = 255;
4490    #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4491            }            }
4492  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4493    
4494          /* We use the bit map for all cases when not in UTF-8 mode; else          /* We use the bit map for 8 bit mode, or when the characters fall
4495          ranges that lie entirely within 0-127 when there is UCP support; else          partially or entirely to [0-255] ([0-127] for UCP) ranges. */
         for partial ranges without UCP support. */  
4496    
4497          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4498    
4499          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4500    
# Line 4006  for (;; ptr++) Line 4503  for (;; ptr++)
4503            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4504            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4505              {              {
4506              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c]; /* flip case */
4507              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
4508              }              }
4509            }            }
# Line 4020  for (;; ptr++) Line 4517  for (;; ptr++)
4517    
4518        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4519    
4520        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4521    
4522          if (class_single_char < 2) class_single_char++;
4523    
4524          /* If class_single_char is 1, we saw precisely one character. As long as
4525          there was no use of \p or \P, in other words, no use of any XCLASS
4526          features, we can optimize.
4527    
4528          The optimization throws away the bit map. We turn the item into a
4529          1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4530          In the positive case, it can cause firstchar to be set. Otherwise, there
4531          can be no first char if this item is first, whatever repeat count may
4532          follow. In the case of reqchar, save the previous value for reinstating. */
4533    
4534  #ifdef SUPPORT_UTF8        if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))  
4535          {          {
4536          class_utf8 = TRUE;          ptr++;
4537          *class_utf8data++ = XCL_SINGLE;          zeroreqchar = reqchar;
4538          class_utf8data += _pcre_ord2utf8(c, class_utf8data);  
4539            if (negate_class)
4540              {
4541              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4542              zerofirstchar = firstchar;
4543              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4544    #ifdef SUPPORT_UTF
4545              if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4546                code += PRIV(ord2utf)(c, code);
4547              else
4548    #endif
4549                *code++ = c;
4550              goto NOT_CHAR;
4551              }
4552    
4553            /* For a single, positive character, get the value into mcbuffer, and
4554            then we can handle this with the normal one-character code. */
4555    
4556    #ifdef SUPPORT_UTF
4557            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4558              mclength = PRIV(ord2utf)(c, mcbuffer);
4559            else
4560    #endif
4561              {
4562              mcbuffer[0] = c;
4563              mclength = 1;
4564              }
4565            goto ONE_CHAR;
4566            }       /* End of 1-char optimization */
4567    
4568          /* Handle a character that cannot go in the bit map. */
4569    
4570    #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4571          if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4572    #elif defined SUPPORT_UTF
4573          if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4574    #elif !(defined COMPILE_PCRE8)
4575          if (c > 255)
4576    #endif
4577    
4578    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4579            {
4580            xclass = TRUE;
4581            *class_uchardata++ = XCL_SINGLE;
4582    #ifdef SUPPORT_UTF
4583    #ifndef COMPILE_PCRE8
4584            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4585            if (!utf)
4586              *class_uchardata++ = c;
4587            else
4588    #endif
4589              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4590    #else /* SUPPORT_UTF */
4591            *class_uchardata++ = c;
4592    #endif /* SUPPORT_UTF */
4593    
4594  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4595    #ifdef COMPILE_PCRE8
4596          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4597    #else
4598            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4599            if (utf && (options & PCRE_CASELESS) != 0)
4600    #endif
4601            {            {
4602            unsigned int othercase;            unsigned int othercase;
4603            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((int)(othercase = UCD_OTHERCASE(c)) != c)
4604              {              {
4605              *class_utf8data++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4606              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4607              }              }
4608            }            }
4609  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
4610    
4611          }          }
4612        else        else
4613  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4614    
4615        /* Handle a single-byte character */        /* Handle a single-byte character */
4616          {          {
4617            class_has_8bitchar = 1;
4618          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4619          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4620            {            {
4621            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c]; /* flip case */
4622            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4623            }            }
         class_charcount++;  
         class_lastchar = c;  
4624          }          }
4625        }        }
4626    
# Line 4075  for (;; ptr++) Line 4641  for (;; ptr++)
4641        goto FAILED;        goto FAILED;
4642        }        }
4643    
4644      /* If class_charcount is 1, we saw precisely one character whose value is      /* If this is the first thing in the branch, there can be no first char
4645      less than 256. As long as there were no characters >= 128 and there was no      setting, whatever the repeat count. Any reqchar setting must remain
4646      use of \p or \P, in other words, no use of any XCLASS features, we can      unchanged after any kind of repeat. */
4647      optimize.  
4648        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4649      In UTF-8 mode, we can optimize the negative case only if there were no      zerofirstchar = firstchar;
4650      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR      zeroreqchar = reqchar;
     operate on single-bytes characters only. This is an historical hangover.  
     Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
     Note that OP_NOT[I] does not support multibyte characters. In the positive  
     case, it can cause firstbyte to be set. Otherwise, there can be no first  
     char if this item is first, whatever repeat count may follow. In the case  
     of reqbyte, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF8  
     if (class_charcount == 1 && !class_utf8 &&  
       (!utf8 || !negate_class || class_lastchar < 128))  
 #else  
     if (class_charcount == 1)  
 #endif  
       {  
       zeroreqbyte = reqbyte;  
   
       /* The OP_NOT[I] opcodes work on one-byte characters only. */  
   
       if (negate_class)  
         {  
         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
         zerofirstbyte = firstbyte;  
         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF8  
       if (utf8 && class_lastchar > 127)  
         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqbyte setting must remain unchanged after any kind of  
     repeat. */  
   
     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
     zerofirstbyte = firstbyte;  
     zeroreqbyte = reqbyte;  
4651    
4652      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4653      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
# Line 4144  for (;; ptr++) Line 4657  for (;; ptr++)
4657      be listed) there are no characters < 256, we can omit the bitmap in the      be listed) there are no characters < 256, we can omit the bitmap in the
4658      actual compiled code. */      actual compiled code. */
4659    
4660  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4661      if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))      if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4662    #elif !defined COMPILE_PCRE8
4663        if (xclass && !should_flip_negation)
4664    #endif
4665    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4666        {        {
4667        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4668        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
4669        code += LINK_SIZE;        code += LINK_SIZE;
4670        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT:0;
4671    
4672        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4673        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4674    
4675        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4676          {          {
4677          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4678          memmove(code + 32, code, class_utf8data - code);          memmove(code + (32 / sizeof(pcre_uchar)), code,
4679              IN_UCHARS(class_uchardata - code));
4680          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
4681          code = class_utf8data + 32;          code = class_uchardata + (32 / sizeof(pcre_uchar));
4682          }          }
4683        else code = class_utf8data;        else code = class_uchardata;
4684    
4685        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4686    
4687        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4688        break;   /* End of class handling */        break;   /* End of class handling */
4689        }        }
4690  #endif  #endif
# Line 4178  for (;; ptr++) Line 4696  for (;; ptr++)
4696      negating it if necessary. */      negating it if necessary. */
4697    
4698      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4699      if (negate_class)      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
       {  
       if (lengthptr == NULL)    /* Save time in the pre-compile phase */  
         for (c = 0; c < 32; c++) code[c] = ~classbits[c];  
       }  
     else  
4700        {        {
4701          if (negate_class)
4702            for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4703        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4704        }        }
4705      code += 32;      code += 32 / sizeof(pcre_uchar);
4706        NOT_CHAR:
4707      break;      break;
4708    
4709    
# Line 4224  for (;; ptr++) Line 4740  for (;; ptr++)
4740    
4741      if (repeat_min == 0)      if (repeat_min == 0)
4742        {        {
4743        firstbyte = zerofirstbyte;    /* Adjust for zero repeat */        firstchar = zerofirstchar;    /* Adjust for zero repeat */
4744        reqbyte = zeroreqbyte;        /* Ditto */        reqchar = zeroreqchar;        /* Ditto */
4745        }        }
4746    
4747      /* Remember whether this is a variable length repeat */      /* Remember whether this is a variable length repeat */
# Line 4267  for (;; ptr++) Line 4783  for (;; ptr++)
4783    
4784      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4785        {        {
4786        memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4787        *previous = OP_ONCE;        *previous = OP_ONCE;
4788        PUT(previous, 1, 2 + 2*LINK_SIZE);        PUT(previous, 1, 2 + 2*LINK_SIZE);
4789        previous[2 + 2*LINK_SIZE] = OP_KET;        previous[2 + 2*LINK_SIZE] = OP_KET;
# Line 4288  for (;; ptr++) Line 4804  for (;; ptr++)
4804    
4805      /* Now handle repetition for the different types of item. */      /* Now handle repetition for the different types of item. */
4806    
4807      /* If previous was a character match, abolish the item and generate a      /* If previous was a character or negated character match, abolish the item
4808      repeat item instead. If a char item has a minumum of more than one, ensure      and generate a repeat item instead. If a char item has a minimum of more
4809      that it is set in reqbyte - it might not be if a sequence such as x{3} is      than one, ensure that it is set in reqchar - it might not be if a sequence
4810      the first thing in a branch because the x will have gone into firstbyte      such as x{3} is the first thing in a branch because the x will have gone
4811      instead.  */      into firstchar instead.  */
4812    
4813      if (*previous == OP_CHAR || *previous == OP_CHARI)      if (*previous == OP_CHAR || *previous == OP_CHARI
4814            || *previous == OP_NOT || *previous == OP_NOTI)
4815        {        {
4816        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;        switch (*previous)
4817            {
4818            default: /* Make compiler happy. */
4819            case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
4820            case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4821            case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
4822            case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
4823            }
4824    
4825        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF characters that take up more than one character. It's
4826        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4827        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus UTF_LENGTH to flag that
4828        length rather than a small character. */        it's a length rather than a small character. */
4829    
4830  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4831        if (utf8 && (code[-1] & 0x80) != 0)        if (utf && NOT_FIRSTCHAR(code[-1]))
4832          {          {
4833          uschar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4834          while((*lastchar & 0xc0) == 0x80) lastchar--;          BACKCHAR(lastchar);
4835          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4836          memcpy(utf8_char, lastchar, c); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4837          c |= 0x80;                      /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4838          }          }
4839        else        else
4840  #endif  #endif /* SUPPORT_UTF */
   
       /* Handle the case of a single byte - either with no UTF8 support, or  
       with UTF-8 disabled, or for a UTF-8 character < 128. */  
4841    
4842          /* Handle the case of a single character - either with no UTF support, or
4843          with UTF disabled, or for a single character UTF character. */
4844          {          {
4845          c = code[-1];          c = code[-1];
4846          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (*previous <= OP_CHARI && repeat_min > 1)
4847              reqchar = c | req_caseopt | cd->req_varyopt;
4848          }          }
4849    
4850        /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
# Line 4330  for (;; ptr++) Line 4854  for (;; ptr++)
4854    
4855        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4856            repeat_max < 0 &&            repeat_max < 0 &&
4857            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4858          {          {
4859          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4860          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4339  for (;; ptr++) Line 4863  for (;; ptr++)
4863        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4864        }        }
4865    
     /* If previous was a single negated character ([^a] or similar), we use  
     one of the special opcodes, replacing it. The code is shared with single-  
     character repeats by setting opt_type to add a suitable offset into  
     repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI  
     are currently used only for single-byte chars. */  
   
     else if (*previous == OP_NOT || *previous == OP_NOTI)  
       {  
       op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;  
       c = previous[1];  
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf8, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
       goto OUTPUT_SINGLE_REPEAT;  
       }  
   
4866      /* If previous was a character type match (\d or similar), abolish it and      /* If previous was a character type match (\d or similar), abolish it and
4867      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
4868      repeats by setting op_type to add a suitable offset into repeat_type. Note      repeats by setting op_type to add a suitable offset into repeat_type. Note
# Line 4368  for (;; ptr++) Line 4872  for (;; ptr++)
4872    
4873      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
4874        {        {
4875        uschar *oldcode;        pcre_uchar *oldcode;
4876        int prop_type, prop_value;        int prop_type, prop_value;
4877        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4878        c = *previous;        c = *previous;
4879    
4880        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4881            repeat_max < 0 &&            repeat_max < 0 &&
4882            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4883          {          {
4884          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4885          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4455  for (;; ptr++) Line 4959  for (;; ptr++)
4959          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
4960          Unicode property match, there are two extra bytes that define the          Unicode property match, there are two extra bytes that define the
4961          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
4962          c, with the 0x80 bit as a flag. */          c, with the UTF_LENGTH bit as a flag. */
4963    
4964          if (repeat_max < 0)          if (repeat_max < 0)
4965            {            {
4966  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4967            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4968              {              {
4969              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4970              code += c & 7;              code += c & 7;
4971              }              }
4972            else            else
# Line 4484  for (;; ptr++) Line 4988  for (;; ptr++)
4988    
4989          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
4990            {            {
4991  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4992            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4993              {              {
4994              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4995              code += c & 7;              code += c & 7;
4996              }              }
4997            else            else
# Line 4514  for (;; ptr++) Line 5018  for (;; ptr++)
5018    
5019        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
5020    
5021  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5022        if (utf8 && c >= 128)        if (utf && (c & UTF_LENGTH) != 0)
5023          {          {
5024          memcpy(code, utf8_char, c & 7);          memcpy(code, utf_chars, IN_UCHARS(c & 7));
5025          code += c & 7;          code += c & 7;
5026          }          }
5027        else        else
# Line 4541  for (;; ptr++) Line 5045  for (;; ptr++)
5045    
5046      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
5047               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
5048  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5049               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
5050  #endif  #endif
5051               *previous == OP_REF ||               *previous == OP_REF ||
# Line 4590  for (;; ptr++) Line 5094  for (;; ptr++)
5094        {        {
5095        register int i;        register int i;
5096        int len = (int)(code - previous);        int len = (int)(code - previous);
5097        uschar *bralink = NULL;        pcre_uchar *bralink = NULL;
5098        uschar *brazeroptr = NULL;        pcre_uchar *brazeroptr = NULL;
5099    
5100        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5101        we just ignore the repeat. */        we just ignore the repeat. */
# Line 4644  for (;; ptr++) Line 5148  for (;; ptr++)
5148          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5149            {            {
5150            *code = OP_END;            *code = OP_END;
5151            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf, cd, save_hwm);
5152            memmove(previous+1, previous, len);            memmove(previous + 1, previous, IN_UCHARS(len));
5153            code++;            code++;
5154            if (repeat_max == 0)            if (repeat_max == 0)
5155              {              {
# Line 4668  for (;; ptr++) Line 5172  for (;; ptr++)
5172            {            {
5173            int offset;            int offset;
5174            *code = OP_END;            *code = OP_END;
5175            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);            adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5176            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5177            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
5178            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
5179            *previous++ = OP_BRA;            *previous++ = OP_BRA;
# Line 4715  for (;; ptr++) Line 5219  for (;; ptr++)
5219              *lengthptr += delta;              *lengthptr += delta;
5220              }              }
5221    
5222            /* This is compiling for real */            /* This is compiling for real. If there is a set first byte for
5223              the group, and we have not yet set a "required byte", set it. Make
5224              sure there is enough workspace for copying forward references before
5225              doing the copy. */
5226    
5227            else            else
5228              {              {
5229              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5230    
5231              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5232                {                {
5233                uschar *hc;                pcre_uchar *hc;
5234                uschar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5235                memcpy(code, previous, len);                memcpy(code, previous, IN_UCHARS(len));
5236    
5237                  while (cd->hwm > cd->start_workspace + cd->workspace_size -
5238                         WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5239                    {
5240                    int save_offset = save_hwm - cd->start_workspace;
5241                    int this_offset = this_hwm - cd->start_workspace;
5242                    *errorcodeptr = expand_workspace(cd);
5243                    if (*errorcodeptr != 0) goto FAILED;
5244                    save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5245                    this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5246                    }
5247    
5248                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5249                  {                  {
5250                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 4774  for (;; ptr++) Line 5294  for (;; ptr++)
5294    
5295          else for (i = repeat_max - 1; i >= 0; i--)          else for (i = repeat_max - 1; i >= 0; i--)
5296            {            {
5297            uschar *hc;            pcre_uchar *hc;
5298            uschar *this_hwm = cd->hwm;            pcre_uchar *this_hwm = cd->hwm;
5299    
5300            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
5301    
# Line 4791  for (;; ptr++) Line 5311  for (;; ptr++)
5311              PUTINC(code, 0, offset);              PUTINC(code, 0, offset);
5312              }              }
5313    
5314            memcpy(code, previous, len);            memcpy(code, previous, IN_UCHARS(len));
5315    
5316              /* Ensure there is enough workspace for forward references before
5317              copying them. */
5318    
5319              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5320                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5321                {
5322                int save_offset = save_hwm - cd->start_workspace;
5323                int this_offset = this_hwm - cd->start_workspace;
5324                *errorcodeptr = expand_workspace(cd);
5325                if (*errorcodeptr != 0) goto FAILED;
5326                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5327                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5328                }
5329    
5330            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5331              {              {
5332              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 4808  for (;; ptr++) Line 5343  for (;; ptr++)
5343            {            {
5344            int oldlinkoffset;            int oldlinkoffset;
5345            int offset = (int)(code - bralink + 1);            int offset = (int)(code - bralink + 1);
5346            uschar *bra = code - offset;            pcre_uchar *bra = code - offset;
5347            oldlinkoffset = GET(bra, 1);            oldlinkoffset = GET(bra, 1);
5348            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5349            *code++ = OP_KET;            *code++ = OP_KET;
# Line 4823  for (;; ptr++) Line 5358  for (;; ptr++)
5358        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5359        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
5360    
5361        Otherwise, if the quantifier was possessive, we convert the BRA code to        Otherwise, when we are doing the actual compile phase, check to see
5362        the POS form, and the KET code to KETRPOS. (It turns out to be convenient        whether this group is one that could match an empty string. If so,
5363        at runtime to detect this kind of subpattern at both the start and at the        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5364        end.) The use of special opcodes makes it possible to reduce greatly the        that runtime checking can be done. [This check is also applied to ONCE
5365        stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO,        groups at runtime, but in a different way.]
5366        convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that  
5367        the default action below, of wrapping everything inside atomic brackets,        Then, if the quantifier was possessive and the bracket is not a
5368        does not happen.        conditional, we convert the BRA code to the POS form, and the KET code to
5369          KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5370        Then, when we are doing the actual compile phase, check to see whether        subpattern at both the start and at the end.) The use of special opcodes
5371        this group is one that could match an empty string. If so, convert the        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5372        initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so that runtime        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5373        checking can be done. [This check is also applied to ONCE groups at  
5374        runtime, but in a different way.] */        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5375          flag so that the default action below, of wrapping everything inside
5376          atomic brackets, does not happen. When the minimum is greater than 1,
5377          there will be earlier copies of the group, and so we still have to wrap
5378          the whole thing. */
5379    
5380        else        else
5381          {          {
5382          uschar *ketcode = code - 1 - LINK_SIZE;          pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5383          uschar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5384    
5385          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&          /* Convert possessive ONCE brackets to non-capturing */
5386    
5387            if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5388              possessive_quantifier) *bracode = OP_BRA;              possessive_quantifier) *bracode = OP_BRA;
5389    
5390            /* For non-possessive ONCE brackets, all we need to do is to
5391            set the KET. */
5392    
5393          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5394            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5395    
5396            /* Handle non-ONCE brackets and possessive ONCEs (which have been
5397            converted to non-capturing above). */
5398    
5399          else          else
5400            {            {
5401            if (possessive_quantifier)            /* In the compile phase, check for empty string matching. */
             {  
             *bracode += 1;                   /* Switch to xxxPOS opcodes */  
             *ketcode = OP_KETRPOS;  
             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;  
             possessive_quantifier = FALSE;  
             }  
           else *ketcode = OP_KETRMAX + repeat_type;  
5402    
5403            if (lengthptr == NULL)            if (lengthptr == NULL)
5404              {              {
5405              uschar *scode = bracode;              pcre_uchar *scode = bracode;
5406              do              do
5407                {                {
5408                if (could_be_empty_branch(scode, ketcode, utf8, cd))                if (could_be_empty_branch(scode, ketcode, utf, cd))
5409                  {                  {
5410                  *bracode += OP_SBRA - OP_BRA;                  *bracode += OP_SBRA - OP_BRA;
5411                  break;                  break;
# Line 4873  for (;; ptr++) Line 5414  for (;; ptr++)
5414                }                }
5415              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5416              }              }
5417    
5418              /* Handle possessive quantifiers. */
5419    
5420              if (possessive_quantifier)
5421                {
5422                /* For COND brackets, we wrap the whole thing in a possessively
5423                repeated non-capturing bracket, because we have not invented POS
5424                versions of the COND opcodes. Because we are moving code along, we
5425                must ensure that any pending recursive references are updated. */
5426    
5427                if (*bracode == OP_COND || *bracode == OP_SCOND)
5428                  {
5429                  int nlen = (int)(code - bracode);
5430                  *code = OP_END;
5431                  adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5432                  memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5433                  code += 1 + LINK_SIZE;
5434                  nlen += 1 + LINK_SIZE;
5435                  *bracode = OP_BRAPOS;
5436                  *code++ = OP_KETRPOS;
5437                  PUTINC(code, 0, nlen);
5438                  PUT(bracode, 1, nlen);
5439                  }
5440    
5441                /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5442    
5443                else
5444                  {
5445                  *bracode += 1;              /* Switch to xxxPOS opcodes */
5446                  *ketcode = OP_KETRPOS;
5447                  }
5448    
5449                /* If the minimum is zero, mark it as possessive, then unset the
5450                possessive flag when the minimum is 0 or 1. */
5451    
5452                if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5453                if (repeat_min < 2) possessive_quantifier = FALSE;
5454                }
5455    
5456              /* Non-possessive quantifier */
5457    
5458              else *ketcode = OP_KETRMAX + repeat_type;
5459            }            }
5460          }          }
5461        }        }
# Line 4899  for (;; ptr++) Line 5482  for (;; ptr++)
5482      notation is just syntactic sugar, taken from Sun's Java package, but the      notation is just syntactic sugar, taken from Sun's Java package, but the
5483      special opcodes can optimize it.      special opcodes can optimize it.
5484    
5485      Possessively repeated subpatterns have already been handled in the code      Some (but not all) possessively repeated subpatterns have already been
5486      just above, so possessive_quantifier is always FALSE for them at this      completely handled in the code just above. For them, possessive_quantifier
5487      stage.      is always FALSE at this stage.
5488    
5489      Note that the repeated item starts at tempcode, not at previous, which      Note that the repeated item starts at tempcode, not at previous, which
5490      might be the first part of a string whose (former) last char we repeated.      might be the first part of a string whose (former) last char we repeated.
# Line 4915  for (;; ptr++) Line 5498  for (;; ptr++)
5498        int len;        int len;
5499    
5500        if (*tempcode == OP_TYPEEXACT)        if (*tempcode == OP_TYPEEXACT)
5501          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
5502            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);            ((tempcode[1 + IMM2_SIZE] == OP_PROP
5503              || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5504    
5505        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5506          {          {
5507          tempcode += _pcre_OP_lengths[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
5508  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5509          if (utf8 && tempcode[-1] >= 0xc0)          if (utf && HAS_EXTRALEN(tempcode[-1]))
5510            tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];            tempcode += GET_EXTRALEN(tempcode[-1]);
5511  #endif  #endif
5512          }          }
5513    
# Line 4960  for (;; ptr++) Line 5544  for (;; ptr++)
5544    
5545          default:          default:
5546          *code = OP_END;          *code = OP_END;
5547          adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5548          memmove(tempcode + 1+LINK_SIZE, tempcode, len);          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5549          code += 1 + LINK_SIZE;          code += 1 + LINK_SIZE;
5550          len += 1 + LINK_SIZE;          len += 1 + LINK_SIZE;
5551          tempcode[0] = OP_ONCE;          tempcode[0] = OP_ONCE;
# Line 4973  for (;; ptr++) Line 5557  for (;; ptr++)
5557        }        }
5558    
5559      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
5560      "follows varying string" flag for subsequently encountered reqbytes if      "follows varying string" flag for subsequently encountered reqchars if
5561      it isn't already set and we have just passed a varying length item. */      it isn't already set and we have just passed a varying length item. */
5562    
5563      END_REPEAT:      END_REPEAT:
# Line 4996  for (;; ptr++) Line 5580  for (;; ptr++)
5580    
5581      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
5582    
5583      if (*(++ptr) == CHAR_ASTERISK &&      ptr++;
5584           ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5585             || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5586        {        {
5587        int i, namelen;        int i, namelen;
5588        int arglen = 0;        int arglen = 0;
5589        const char *vn = verbnames;        const char *vn = verbnames;
5590        const uschar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5591        const uschar *arg = NULL;        const pcre_uchar *arg = NULL;
5592        previous = NULL;        previous = NULL;
5593        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        ptr++;
5594          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5595        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5596    
5597        /* It appears that Perl allows any characters whatsoever, other than        /* It appears that Perl allows any characters whatsoever, other than
5598        a closing parenthesis, to appear in arguments, so we no longer insist on        a closing parenthesis, to appear in arguments, so we no longer insist on
5599        letters, digits, and underscores. */        letters, digits, and underscores. */
# Line 5017  for (;; ptr++) Line 5603  for (;; ptr++)
5603          arg = ++ptr;          arg = ++ptr;
5604          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5605          arglen = (int)(ptr - arg);          arglen = (int)(ptr - arg);
5606            if (arglen > (int)MAX_MARK)
5607              {
5608              *errorcodeptr = ERR75;
5609              goto FAILED;
5610              }
5611          }          }
5612    
5613        if (*ptr != CHAR_RIGHT_PARENTHESIS)        if (*ptr != CHAR_RIGHT_PARENTHESIS)
# Line 5030  for (;; ptr++) Line 5621  for (;; ptr++)
5621        for (i = 0; i < verbcount; i++)        for (i = 0; i < verbcount; i++)
5622          {          {
5623          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
5624              strncmp((char *)name, vn, namelen) == 0)              STRNCMP_UC_C8(name, vn, namelen) == 0)
5625            {            {
5626            /* Check for open captures before ACCEPT and convert it to            /* Check for open captures before ACCEPT and convert it to
5627            ASSERT_ACCEPT if in an assertion. */            ASSERT_ACCEPT if in an assertion. */
# Line 5051  for (;; ptr++) Line 5642  for (;; ptr++)
5642                }                }
5643              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5644    
5645              /* Do not set firstbyte after *ACCEPT */              /* Do not set firstchar after *ACCEPT */
5646              if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5647              }              }
5648    
5649            /* Handle other cases with/without an argument */            /* Handle other cases with/without an argument */
# Line 5078  for (;; ptr++) Line 5669  for (;; ptr++)
5669              *code = verbs[i].op_arg;              *code = verbs[i].op_arg;
5670              if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;              if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
5671              *code++ = arglen;              *code++ = arglen;
5672              memcpy(code, arg, arglen);              memcpy(code, arg, IN_UCHARS(arglen));
5673              code += arglen;              code += arglen;
5674              *code++ = 0;              *code++ = 0;
5675              }              }
# Line 5101  for (;; ptr++) Line 5692  for (;; ptr++)
5692        {        {
5693        int i, set, unset, namelen;        int i, set, unset, namelen;
5694        int *optset;        int *optset;
5695        const uschar *name;        const pcre_uchar *name;
5696        uschar *slot;        pcre_uchar *slot;
5697    
5698        switch (*(++ptr))        switch (*(++ptr))
5699          {          {
# Line 5155  for (;; ptr++) Line 5746  for (;; ptr++)
5746            break;            break;
5747    
5748          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Most other conditions use OP_CREF (a couple change to OP_RREF
5749          below), and all need to skip 3 bytes at the start of the group. */          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5750    
5751          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
5752          skipbytes = 3;          skipbytes = 1+IMM2_SIZE;
5753          refsign = -1;          refsign = -1;
5754    
5755          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
# Line 5191  for (;; ptr++) Line 5782  for (;; ptr++)
5782    
5783          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
5784    
5785          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5786            {            {
5787            ptr += 1;  /* To get the right offset */            ptr += 1;  /* To get the right offset */
5788            *errorcodeptr = ERR28;            *errorcodeptr = ERR28;
# Line 5202  for (;; ptr++) Line 5793  for (;; ptr++)
5793    
5794          recno = 0;          recno = 0;
5795          name = ++ptr;          name = ++ptr;
5796          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5797            {            {
5798            if (recno >= 0)            if (recno >= 0)
5799              recno = ((digitab[*ptr] & ctype_digit) != 0)?              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
               recno * 10 + *ptr - CHAR_0 : -1;  
5800            ptr++;            ptr++;
5801            }            }
5802          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
# Line 5254  for (;; ptr++) Line 5844  for (;; ptr++)
5844          slot = cd->name_table;          slot = cd->name_table;
5845          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
5846            {            {
5847            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5848            slot += cd->name_entry_size;            slot += cd->name_entry_size;
5849            }            }
5850    
# Line 5270  for (;; ptr++) Line 5860  for (;; ptr++)
5860          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
5861    
5862          else if ((i = find_parens(cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
5863                          (options & PCRE_EXTENDED) != 0, utf8)) > 0)                          (options & PCRE_EXTENDED) != 0, utf)) > 0)
5864            {            {
5865            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
5866            code[1+LINK_SIZE]++;            code[1+LINK_SIZE]++;
# Line 5296  for (;; ptr++) Line 5886  for (;; ptr++)
5886            recno = 0;            recno = 0;
5887            for (i = 1; i < namelen; i++)            for (i = 1; i < namelen; i++)
5888              {              {
5889              if ((digitab[name[i]] & ctype_digit) == 0)              if (!IS_DIGIT(name[i]))
5890                {                {
5891                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
5892                goto FAILED;                goto FAILED;
# Line 5311  for (;; ptr++) Line 5901  for (;; ptr++)
5901          /* Similarly, check for the (?(DEFINE) "condition", which is always          /* Similarly, check for the (?(DEFINE) "condition", which is always
5902          false. */          false. */
5903    
5904          else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)          else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5905            {            {
5906            code[1+LINK_SIZE] = OP_DEF;            code[1+LINK_SIZE] = OP_DEF;
5907            skipbytes = 1;            skipbytes = 1;
# Line 5374  for (;; ptr++) Line 5964  for (;; ptr++)
5964            break;            break;
5965    
5966            default:                /* Could be name define, else bad */            default:                /* Could be name define, else bad */
5967            if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;            if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5968                goto DEFINE_NAME;
5969            ptr++;                  /* Correct offset for error */            ptr++;                  /* Correct offset for error */
5970            *errorcodeptr = ERR24;            *errorcodeptr = ERR24;
5971            goto FAILED;            goto FAILED;
# Line 5391  for (;; ptr++) Line 5982  for (;; ptr++)
5982    
5983          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5984          case CHAR_C:                 /* Callout - may be followed by digits; */          case CHAR_C:                 /* Callout - may be followed by digits; */
5985          previous_callout = code;  /* Save for later completion */          previous_callout = code;     /* Save for later completion */
5986          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1;    /* Skip one item before completing */
5987          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
5988            {            {
5989            int n = 0;            int n = 0;
5990            while ((digitab[*(++ptr)] & ctype_digit) != 0)            ptr++;
5991              n = n * 10 + *ptr - CHAR_0;            while(IS_DIGIT(*ptr))
5992                n = n * 10 + *ptr++ - CHAR_0;
5993            if (*ptr != CHAR_RIGHT_PARENTHESIS)            if (*ptr != CHAR_RIGHT_PARENTHESIS)
5994              {              {
5995              *errorcodeptr = ERR39;              *errorcodeptr = ERR39;
# Line 5442  for (;; ptr++) Line 6034  for (;; ptr++)
6034              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6035            name = ++ptr;            name = ++ptr;
6036    
6037            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6038            namelen = (int)(ptr - name);            namelen = (int)(ptr - name);
6039    
6040            /* In the pre-compile phase, just do a syntax check. */            /* In the pre-compile phase, just do a syntax check. */
# Line 5459  for (;; ptr++) Line 6051  for (;; ptr++)
6051                *errorcodeptr = ERR49;                *errorcodeptr = ERR49;
6052                goto FAILED;                goto FAILED;
6053                }                }
6054              if (namelen + 3 > cd->name_entry_size)              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6055                {                {
6056                cd->name_entry_size = namelen + 3;                cd->name_entry_size = namelen + IMM2_SIZE + 1;
6057                if (namelen > MAX_NAME_SIZE)                if (namelen > MAX_NAME_SIZE)
6058                  {                  {
6059                  *errorcodeptr = ERR48;                  *errorcodeptr = ERR48;
# Line 5490  for (;; ptr++) Line 6082  for (;; ptr++)
6082    
6083              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
6084                {                {
6085                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
6086                if (crc == 0)                if (crc == 0)
6087                  {                  {
6088                  if (slot[2+namelen] == 0)                  if (slot[IMM2_SIZE+namelen] == 0)
6089                    {                    {
6090                    if (GET2(slot, 0) != cd->bracount + 1 &&                    if (GET2(slot, 0) != cd->bracount + 1 &&
6091                        (options & PCRE_DUPNAMES) == 0)                        (options & PCRE_DUPNAMES) == 0)
# Line 5514  for (;; ptr++) Line 6106  for (;; ptr++)
6106                if (crc < 0)                if (crc < 0)
6107                  {                  {
6108                  memmove(slot + cd->name_entry_size, slot,                  memmove(slot + cd->name_entry_size, slot,
6109                    (cd->names_found - i) * cd->name_entry_size);                    IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
6110                  break;                  break;
6111                  }                  }
6112    
# Line 5528  for (;; ptr++) Line 6120  for (;; ptr++)
6120    
6121              if (!dupname)              if (!dupname)
6122                {                {
6123                uschar *cslot = cd->name_table;                pcre_uchar *cslot = cd->name_table;
6124                for (i = 0; i < cd->names_found; i++)                for (i = 0; i < cd->names_found; i++)
6125                  {                  {
6126                  if (cslot != slot)                  if (cslot != slot)
# Line 5545  for (;; ptr++) Line 6137  for (;; ptr++)
6137                }                }
6138    
6139              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
6140              memcpy(slot + 2, name, namelen);              memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
6141              slot[2+namelen] = 0;              slot[IMM2_SIZE + namelen] = 0;
6142              }              }
6143            }            }
6144    
# Line 5572  for (;; ptr++) Line 6164  for (;; ptr++)
6164    
6165          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
6166          name = ++ptr;          name = ++ptr;
6167          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6168          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6169    
6170          /* In the pre-compile phase, do a syntax check. We used to just set          /* In the pre-compile phase, do a syntax check. We used to just set
# Line 5584  for (;; ptr++) Line 6176  for (;; ptr++)
6176    
6177          if (lengthptr != NULL)          if (lengthptr != NULL)
6178            {            {
6179            const uschar *temp;            const pcre_uchar *temp;
6180    
6181            if (namelen == 0)            if (namelen == 0)
6182              {              {
# Line 5614  for (;; ptr++) Line 6206  for (;; ptr++)
6206            temp = cd->end_pattern;            temp = cd->end_pattern;
6207            cd->end_pattern = ptr;            cd->end_pattern = ptr;
6208            recno = find_parens(cd, name, namelen,            recno = find_parens(cd, name, namelen,
6209              (options & PCRE_EXTENDED) != 0, utf8);              (options & PCRE_EXTENDED) != 0, utf);
6210            cd->end_pattern = temp;            cd->end_pattern = temp;
6211            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
6212            }            }
# Line 5629  for (;; ptr++) Line 6221  for (;; ptr++)
6221            slot = cd->name_table;            slot = cd->name_table;
6222            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
6223              {              {
6224              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6225                  slot[2+namelen] == 0)                  slot[IMM2_SIZE+namelen] == 0)