/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 640 by ph10, Mon Jul 25 10:50:28 2011 UTC revision 842 by ph10, Sat Dec 31 15:19:04 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2011 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
# Line 88  so this number is very generous. Line 92  so this number is very generous.
92  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
93  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
94  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
96    filled up by repetitions of forward references, for example patterns like
97    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98    that the workspace is expanded using malloc() in this situation. The value
99    below is therefore a minimum, and we put a maximum on it for safety. The
100    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101    kicks in at the same number of forward references in all cases. */
102    
103  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
104    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
105    
106  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
107  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
108    
109  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
110    
111    /* Private flags added to firstchar and reqchar. */
112    
113    #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
114    #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
115    
116    /* Repeated character flags. */
117    
118    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
119    
120  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
121  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 231  static const char posix_names[] = Line 250  static const char posix_names[] =
250    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
251    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
252    
253  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
254    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
255    
256  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 266  substitutes must be in the order of the Line 285  substitutes must be in the order of the
285  both positive and negative cases. NULL means no substitute. */  both positive and negative cases. NULL means no substitute. */
286    
287  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
288  static const uschar *substitutes[] = {  static const pcre_uchar string_PNd[]  = {
289    (uschar *)"\\P{Nd}",    /* \D */    CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
290    (uschar *)"\\p{Nd}",    /* \d */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
291    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */  static const pcre_uchar string_pNd[]  = {
292    (uschar *)"\\p{Xsp}",   /* \s */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
293    (uschar *)"\\P{Xwd}",   /* \W */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294    (uschar *)"\\p{Xwd}"    /* \w */  static const pcre_uchar string_PXsp[] = {
295      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
296      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297    static const pcre_uchar string_pXsp[] = {
298      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300    static const pcre_uchar string_PXwd[] = {
301      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
302      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303    static const pcre_uchar string_pXwd[] = {
304      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    
307    static const pcre_uchar *substitutes[] = {
308      string_PNd,           /* \D */
309      string_pNd,           /* \d */
310      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
311      string_pXsp,          /* \s */
312      string_PXwd,          /* \W */
313      string_pXwd           /* \w */
314  };  };
315    
316  static const uschar *posix_substitutes[] = {  static const pcre_uchar string_pL[] =   {
317    (uschar *)"\\p{L}",     /* alpha */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
318    (uschar *)"\\p{Ll}",    /* lower */    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319    (uschar *)"\\p{Lu}",    /* upper */  static const pcre_uchar string_pLl[] =  {
320    (uschar *)"\\p{Xan}",   /* alnum */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321    NULL,                   /* ascii */    CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322    (uschar *)"\\h",        /* blank */  static const pcre_uchar string_pLu[] =  {
323    NULL,                   /* cntrl */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324    (uschar *)"\\p{Nd}",    /* digit */    CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325    NULL,                   /* graph */  static const pcre_uchar string_pXan[] = {
326    NULL,                   /* print */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327    NULL,                   /* punct */    CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */  static const pcre_uchar string_h[] =    {
329    (uschar *)"\\p{Xwd}",   /* word */    CHAR_BACKSLASH, CHAR_h, '\0' };
330    NULL,                   /* xdigit */  static const pcre_uchar string_pXps[] = {
331      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333    static const pcre_uchar string_PL[] =   {
334      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
335      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336    static const pcre_uchar string_PLl[] =  {
337      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339    static const pcre_uchar string_PLu[] =  {
340      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342    static const pcre_uchar string_PXan[] = {
343      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345    static const pcre_uchar string_H[] =    {
346      CHAR_BACKSLASH, CHAR_H, '\0' };
347    static const pcre_uchar string_PXps[] = {
348      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
349      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350    
351    static const pcre_uchar *posix_substitutes[] = {
352      string_pL,            /* alpha */
353      string_pLl,           /* lower */
354      string_pLu,           /* upper */
355      string_pXan,          /* alnum */
356      NULL,                 /* ascii */
357      string_h,             /* blank */
358      NULL,                 /* cntrl */
359      string_pNd,           /* digit */
360      NULL,                 /* graph */
361      NULL,                 /* print */
362      NULL,                 /* punct */
363      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
364      string_pXwd,          /* word */
365      NULL,                 /* xdigit */
366    /* Negated cases */    /* Negated cases */
367    (uschar *)"\\P{L}",     /* ^alpha */    string_PL,            /* ^alpha */
368    (uschar *)"\\P{Ll}",    /* ^lower */    string_PLl,           /* ^lower */
369    (uschar *)"\\P{Lu}",    /* ^upper */    string_PLu,           /* ^upper */
370    (uschar *)"\\P{Xan}",   /* ^alnum */    string_PXan,          /* ^alnum */
371    NULL,                   /* ^ascii */    NULL,                 /* ^ascii */
372    (uschar *)"\\H",        /* ^blank */    string_H,             /* ^blank */
373    NULL,                   /* ^cntrl */    NULL,                 /* ^cntrl */
374    (uschar *)"\\P{Nd}",    /* ^digit */    string_PNd,           /* ^digit */
375    NULL,                   /* ^graph */    NULL,                 /* ^graph */
376    NULL,                   /* ^print */    NULL,                 /* ^print */
377    NULL,                   /* ^punct */    NULL,                 /* ^punct */
378    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
379    (uschar *)"\\P{Xwd}",   /* ^word */    string_PXwd,          /* ^word */
380    NULL                    /* ^xdigit */    NULL                  /* ^xdigit */
381  };  };
382  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
383  #endif  #endif
384    
385  #define STRING(a)  # a  #define STRING(a)  # a
# Line 409  static const char error_texts[] = Line 482  static const char error_texts[] =
482    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
483    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with PCRE_UCP support\0"
484    "\\c must be followed by an ASCII character\0"    "\\c must be followed by an ASCII character\0"
485    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486      /* 70 */
487      "internal error: unknown opcode in find_fixedlength()\0"
488      "\\N is not supported in a class\0"
489      "too many forward references\0"
490      "disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff)\0"
491    ;    ;
492    
493  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 428  For convenience, we use the same bit def Line 506  For convenience, we use the same bit def
506    
507  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
508    
509    /* Using a simple comparison for decimal numbers rather than a memory read
510    is much faster, and the resulting code is simpler (the compiler turns it
511    into a subtraction and unsigned comparison). */
512    
513    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
514    
515  #ifndef EBCDIC  #ifndef EBCDIC
516    
517  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
518  UTF-8 mode. */  UTF-8 mode. */
519    
520  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
521    {    {
522    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
523    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 472  static const unsigned char digitab[] = Line 556  static const unsigned char digitab[] =
556    
557  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
558    
559  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
560    {    {
561    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
562    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 507  static const unsigned char digitab[] = Line 591  static const unsigned char digitab[] =
591    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
592    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
593    
594  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
595    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
596    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
597    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 546  static const unsigned char ebcdic_charta Line 630  static const unsigned char ebcdic_charta
630  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
631    
632  static BOOL  static BOOL
633    compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int *,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
634      int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
635    
636    
637    
# Line 578  return s; Line 662  return s;
662    
663    
664  /*************************************************  /*************************************************
665    *           Expand the workspace                 *
666    *************************************************/
667    
668    /* This function is called during the second compiling phase, if the number of
669    forward references fills the existing workspace, which is originally a block on
670    the stack. A larger block is obtained from malloc() unless the ultimate limit
671    has been reached or the increase will be rather small.
672    
673    Argument: pointer to the compile data block
674    Returns:  0 if all went well, else an error number
675    */
676    
677    static int
678    expand_workspace(compile_data *cd)
679    {
680    pcre_uchar *newspace;
681    int newsize = cd->workspace_size * 2;
682    
683    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
684    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
685        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
686     return ERR72;
687    
688    newspace = (PUBL(malloc))(IN_UCHARS(newsize));
689    if (newspace == NULL) return ERR21;
690    memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
691    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
692    if (cd->workspace_size > COMPILE_WORK_SIZE)
693      (PUBL(free))((void *)cd->start_workspace);
694    cd->start_workspace = newspace;
695    cd->workspace_size = newsize;
696    return 0;
697    }
698    
699    
700    
701    /*************************************************
702  *            Check for counted repeat            *  *            Check for counted repeat            *
703  *************************************************/  *************************************************/
704    
# Line 593  Returns:    TRUE or FALSE Line 714  Returns:    TRUE or FALSE
714  */  */
715    
716  static BOOL  static BOOL
717  is_counted_repeat(const uschar *p)  is_counted_repeat(const pcre_uchar *p)
718  {  {
719  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
720  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
721    while (IS_DIGIT(*p)) p++;
722  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
723    
724  if (*p++ != CHAR_COMMA) return FALSE;  if (*p++ != CHAR_COMMA) return FALSE;
725  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
726    
727  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
728  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
729    while (IS_DIGIT(*p)) p++;
730    
731  return (*p == CHAR_RIGHT_CURLY_BRACKET);  return (*p == CHAR_RIGHT_CURLY_BRACKET);
732  }  }
# Line 635  Returns:         zero or positive => a d Line 758  Returns:         zero or positive => a d
758  */  */
759    
760  static int  static int
761  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
762    int options, BOOL isclass)    int options, BOOL isclass)
763  {  {
764  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
765  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
766  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
767    pcre_int32 c;
768    int i;
769    
770  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
771  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
# Line 654  in a table. A non-zero result is somethi Line 779  in a table. A non-zero result is somethi
779  Otherwise further processing may be required. */  Otherwise further processing may be required. */
780    
781  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
782  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
783    else if (c < CHAR_0 || c > CHAR_z) {}
784  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
785    
786  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
787  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
788    else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
789  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
790  #endif  #endif
791    
# Line 666  else if ((i = escapes[c - 0x48]) != 0) Line 793  else if ((i = escapes[c - 0x48]) != 0)
793    
794  else  else
795    {    {
796    const uschar *oldptr;    const pcre_uchar *oldptr;
797    BOOL braced, negated;    BOOL braced, negated;
798    
799    switch (c)    switch (c)
# Line 676  else Line 803  else
803    
804      case CHAR_l:      case CHAR_l:
805      case CHAR_L:      case CHAR_L:
806        *errorcodeptr = ERR37;
807        break;
808    
809      case CHAR_u:      case CHAR_u:
810        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
811          {
812          /* In JavaScript, \u must be followed by four hexadecimal numbers.
813          Otherwise it is a lowercase u letter. */
814          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
815            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
816            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
817            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
818            {
819            c = 0;
820            for (i = 0; i < 4; ++i)
821              {
822              register int cc = *(++ptr);
823    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
824              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
825              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
826    #else           /* EBCDIC coding */
827              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
828              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
829    #endif
830              }
831            }
832          }
833        else
834          *errorcodeptr = ERR37;
835        break;
836    
837      case CHAR_U:      case CHAR_U:
838      *errorcodeptr = ERR37;      /* In JavaScript, \U is an uppercase U letter. */
839        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
840      break;      break;
841    
842      /* In a character class, \g is just a literal "g". Outside a character      /* In a character class, \g is just a literal "g". Outside a character
843      class, \g must be followed by one of a number of specific things:      class, \g must be followed by one of a number of specific things:
844    
845      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
# Line 710  else Line 868  else
868    
869      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
870        {        {
871        const uschar *p;        const pcre_uchar *p;
872        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
873          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
874        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
875          {          {
876          c = -ESC_k;          c = -ESC_k;
# Line 730  else Line 888  else
888        }        }
889      else negated = FALSE;      else negated = FALSE;
890    
891        /* The integer range is limited by the machine's int representation. */
892      c = 0;      c = 0;
893      while ((digitab[ptr[1]] & ctype_digit) != 0)      while (IS_DIGIT(ptr[1]))
894          {
895          if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
896            {
897            c = -1;
898            break;
899            }
900        c = c * 10 + *(++ptr) - CHAR_0;        c = c * 10 + *(++ptr) - CHAR_0;
901          }
902      if (c < 0)   /* Integer overflow */      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
903        {        {
904          while (IS_DIGIT(ptr[1]))
905            ptr++;
906        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
907        break;        break;
908        }        }
# Line 783  else Line 950  else
950      if (!isclass)      if (!isclass)
951        {        {
952        oldptr = ptr;        oldptr = ptr;
953          /* The integer range is limited by the machine's int representation. */
954        c -= CHAR_0;        c -= CHAR_0;
955        while ((digitab[ptr[1]] & ctype_digit) != 0)        while (IS_DIGIT(ptr[1]))
956            {
957            if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
958              {
959              c = -1;
960              break;
961              }
962          c = c * 10 + *(++ptr) - CHAR_0;          c = c * 10 + *(++ptr) - CHAR_0;
963        if (c < 0)    /* Integer overflow */          }
964          if (((unsigned int)c) > INT_MAX) /* Integer overflow */
965          {          {
966            while (IS_DIGIT(ptr[1]))
967              ptr++;
968          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
969          break;          break;
970          }          }
# Line 820  else Line 997  else
997      c -= CHAR_0;      c -= CHAR_0;
998      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
999          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
1000      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf && c > 0xff) *errorcodeptr = ERR51;
1001      break;      break;
1002    
1003      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
1004      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1005      treated as a data character. */      If not, { is treated as a data character. */
1006    
1007      case CHAR_x:      case CHAR_x:
1008        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1009          {
1010          /* In JavaScript, \x must be followed by two hexadecimal numbers.
1011          Otherwise it is a lowercase x letter. */
1012          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1013            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1014            {
1015            c = 0;
1016            for (i = 0; i < 2; ++i)
1017              {
1018              register int cc = *(++ptr);
1019    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1020              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1021              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1022    #else           /* EBCDIC coding */
1023              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1024              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1025    #endif
1026              }
1027            }
1028          break;
1029          }
1030    
1031      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1032        {        {
1033        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
       int count = 0;  
1034    
1035        c = 0;        c = 0;
1036        while ((digitab[*pt] & ctype_xdigit) != 0)        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1037          {          {
1038          register int cc = *pt++;          register int cc = *pt++;
1039          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
         count++;  
1040    
1041  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1042          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
# Line 847  else Line 1045  else
1045          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1046          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1047  #endif  #endif
1048    
1049    #ifdef COMPILE_PCRE8
1050            if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1051    #else
1052    #ifdef COMPILE_PCRE16
1053            if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1054    #endif
1055    #endif
1056            }
1057    
1058          if (c < 0)
1059            {
1060            while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1061            *errorcodeptr = ERR34;
1062          }          }
1063    
1064        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1065          {          {
1066          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1067          ptr = pt;          ptr = pt;
1068          break;          break;
1069          }          }
# Line 863  else Line 1075  else
1075      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1076    
1077      c = 0;      c = 0;
1078      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1079        {        {
1080        int cc;                                  /* Some compilers don't like */        int cc;                                  /* Some compilers don't like */
1081        cc = *(++ptr);                           /* ++ in initializers */        cc = *(++ptr);                           /* ++ in initializers */
# Line 921  else Line 1133  else
1133    }    }
1134    
1135  /* Perl supports \N{name} for character names, as well as plain \N for "not  /* Perl supports \N{name} for character names, as well as plain \N for "not
1136  newline". PCRE does not support \N{name}. However, it does support  newline". PCRE does not support \N{name}. However, it does support
1137  quantification such as \N{2,3}. */  quantification such as \N{2,3}. */
1138    
1139  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
# Line 961  Returns:         type value from ucp_typ Line 1173  Returns:         type value from ucp_typ
1173  */  */
1174    
1175  static int  static int
1176  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1177  {  {
1178  int c, i, bot, top;  int c, i, bot, top;
1179  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1180  char name[32];  pcre_uchar name[32];
1181    
1182  c = *(++ptr);  c = *(++ptr);
1183  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 982  if (c == CHAR_LEFT_CURLY_BRACKET) Line 1194  if (c == CHAR_LEFT_CURLY_BRACKET)
1194      *negptr = TRUE;      *negptr = TRUE;
1195      ptr++;      ptr++;
1196      }      }
1197    for (i = 0; i < (int)sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1198      {      {
1199      c = *(++ptr);      c = *(++ptr);
1200      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 1006  else Line 1218  else
1218  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1219    
1220  bot = 0;  bot = 0;
1221  top = _pcre_utt_size;  top = PRIV(utt_size);
1222    
1223  while (bot < top)  while (bot < top)
1224    {    {
1225    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1226    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1227    if (c == 0)    if (c == 0)
1228      {      {
1229      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1230      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1231      }      }
1232    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
1233    }    }
# Line 1053  Returns:         pointer to '}' on succe Line 1265  Returns:         pointer to '}' on succe
1265                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1266  */  */
1267    
1268  static const uschar *  static const pcre_uchar *
1269  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1270  {  {
1271  int min = 0;  int min = 0;
1272  int max = -1;  int max = -1;
# Line 1062  int max = -1; Line 1274  int max = -1;
1274  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1275  an integer overflow. */  an integer overflow. */
1276    
1277  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1278  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1279    {    {
1280    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 1077  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1289  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1289    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1290      {      {
1291      max = 0;      max = 0;
1292      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1293      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1294        {        {
1295        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 1132  Arguments: Line 1344  Arguments:
1344    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1345    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1346    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1347    utf8         TRUE if we are in UTF-8 mode    utf          TRUE if we are in UTF-8 / UTF-16 mode
1348    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1349    
1350  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1351  */  */
1352    
1353  static int  static int
1354  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1355    BOOL xmode, BOOL utf8, int *count)    BOOL xmode, BOOL utf, int *count)
1356  {  {
1357  uschar *ptr = *ptrptr;  pcre_uchar *ptr = *ptrptr;
1358  int start_count = *count;  int start_count = *count;
1359  int hwm_count = start_count;  int hwm_count = start_count;
1360  BOOL dup_parens = FALSE;  BOOL dup_parens = FALSE;
# Line 1209  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1421  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1421          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1422        {        {
1423        int term;        int term;
1424        const uschar *thisname;        const pcre_uchar *thisname;
1425        *count += 1;        *count += 1;
1426        if (name == NULL && *count == lorn) return *count;        if (name == NULL && *count == lorn) return *count;
1427        term = *ptr++;        term = *ptr++;
# Line 1217  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1429  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1429        thisname = ptr;        thisname = ptr;
1430        while (*ptr != term) ptr++;        while (*ptr != term) ptr++;
1431        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == ptr - thisname &&
1432            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            STRNCMP_UC_UC(name, thisname, lorn) == 0)
1433          return *count;          return *count;
1434        term++;        term++;
1435        }        }
# Line 1260  for (; ptr < cd->end_pattern; ptr++) Line 1472  for (; ptr < cd->end_pattern; ptr++)
1472          {          {
1473          if (ptr[2] == CHAR_E)          if (ptr[2] == CHAR_E)
1474            ptr+= 2;            ptr+= 2;
1475          else if (strncmp((const char *)ptr+2,          else if (STRNCMP_UC_C8(ptr + 2,
1476                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1477            ptr += 4;            ptr += 4;
1478          else          else
# Line 1308  for (; ptr < cd->end_pattern; ptr++) Line 1520  for (; ptr < cd->end_pattern; ptr++)
1520        {        {
1521        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1522        ptr++;        ptr++;
1523  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1524        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;        if (utf) FORWARDCHAR(ptr);
1525  #endif  #endif
1526        }        }
1527      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
# Line 1320  for (; ptr < cd->end_pattern; ptr++) Line 1532  for (; ptr < cd->end_pattern; ptr++)
1532    
1533    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1534      {      {
1535      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1536      if (rc > 0) return rc;      if (rc > 0) return rc;
1537      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1538      }      }
# Line 1366  Arguments: Line 1578  Arguments:
1578    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1579    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1580    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1581    utf8         TRUE if we are in UTF-8 mode    utf          TRUE if we are in UTF-8 / UTF-16 mode
1582    
1583  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1584  */  */
1585    
1586  static int  static int
1587  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,  find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1588    BOOL utf8)    BOOL utf)
1589  {  {
1590  uschar *ptr = (uschar *)cd->start_pattern;  pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1591  int count = 0;  int count = 0;
1592  int rc;  int rc;
1593    
# Line 1386  matching closing parens. That is why we Line 1598  matching closing parens. That is why we
1598    
1599  for (;;)  for (;;)
1600    {    {
1601    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1602    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1603    }    }
1604    
# Line 1413  Arguments: Line 1625  Arguments:
1625  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1626  */  */
1627    
1628  static const uschar*  static const pcre_uchar*
1629  first_significant_code(const uschar *code, BOOL skipassert)  first_significant_code(const pcre_uchar *code, BOOL skipassert)
1630  {  {
1631  for (;;)  for (;;)
1632    {    {
# Line 1425  for (;;) Line 1637  for (;;)
1637      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1638      if (!skipassert) return code;      if (!skipassert) return code;
1639      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1640      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1641      break;      break;
1642    
1643      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1439  for (;;) Line 1651  for (;;)
1651      case OP_RREF:      case OP_RREF:
1652      case OP_NRREF:      case OP_NRREF:
1653      case OP_DEF:      case OP_DEF:
1654      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1655      break;      break;
1656    
1657      default:      default:
# Line 1469  and doing the check at the end; a flag s Line 1681  and doing the check at the end; a flag s
1681    
1682  Arguments:  Arguments:
1683    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1684    utf8     TRUE in UTF-8 mode    utf      TRUE in UTF-8 / UTF-16 mode
1685    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1686    cd       the "compile data" structure    cd       the "compile data" structure
1687    
1688  Returns:   the fixed length,  Returns:   the fixed length,
1689               or -1 if there is no fixed length,               or -1 if there is no fixed length,
1690               or -2 if \C was encountered               or -2 if \C was encountered (in UTF-8 mode only)
1691               or -3 if an OP_RECURSE item was encountered and atend is FALSE               or -3 if an OP_RECURSE item was encountered and atend is FALSE
1692                 or -4 if an unknown opcode was encountered (internal error)
1693  */  */
1694    
1695  static int  static int
1696  find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1697  {  {
1698  int length = -1;  int length = -1;
1699    
1700  register int branchlength = 0;  register int branchlength = 0;
1701  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1702    
1703  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1704  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1493  branch, check the length against that of Line 1706  branch, check the length against that of
1706  for (;;)  for (;;)
1707    {    {
1708    int d;    int d;
1709    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1710    register int op = *cc;    register int op = *cc;
1711    
1712    switch (op)    switch (op)
1713      {      {
1714      /* We only need to continue for OP_CBRA (normal capturing bracket) and      /* We only need to continue for OP_CBRA (normal capturing bracket) and
1715      OP_BRA (normal non-capturing bracket) because the other variants of these      OP_BRA (normal non-capturing bracket) because the other variants of these
1716      opcodes are all concerned with unlimited repeated groups, which of course      opcodes are all concerned with unlimited repeated groups, which of course
1717      are not of fixed length. They will cause a -1 response from the default      are not of fixed length. */
     case of this switch. */  
1718    
1719      case OP_CBRA:      case OP_CBRA:
1720      case OP_BRA:      case OP_BRA:
1721      case OP_ONCE:      case OP_ONCE:
1722        case OP_ONCE_NC:
1723      case OP_COND:      case OP_COND:
1724      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1725      if (d < 0) return d;      if (d < 0) return d;
1726      branchlength += d;      branchlength += d;
1727      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1728      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1729      break;      break;
1730    
1731      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1732      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1733      END it's the end of the outer call. All can be handled by the same code.      an ALT. If it is END it's the end of the outer call. All can be handled by
1734      Note that we must not include the OP_KETRxxx opcodes here, because they      the same code. Note that we must not include the OP_KETRxxx opcodes here,
1735      all imply an unlimited repeat. */      because they all imply an unlimited repeat. */
1736    
1737      case OP_ALT:      case OP_ALT:
1738      case OP_KET:      case OP_KET:
1739      case OP_END:      case OP_END:
1740        case OP_ACCEPT:
1741        case OP_ASSERT_ACCEPT:
1742      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1743        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1744      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1536  for (;;) Line 1752  for (;;)
1752    
1753      case OP_RECURSE:      case OP_RECURSE:
1754      if (!atend) return -3;      if (!atend) return -3;
1755      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1756      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1757      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1758      d = find_fixedlength(cs + 2, utf8, atend, cd);      d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1759      if (d < 0) return d;      if (d < 0) return d;
1760      branchlength += d;      branchlength += d;
1761      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1552  for (;;) Line 1768  for (;;)
1768      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1769      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1770      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1771      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1772        break;
1773    
1774      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1775    
1776      case OP_REVERSE:      case OP_MARK:
1777      case OP_CREF:      case OP_PRUNE_ARG:
1778      case OP_NCREF:      case OP_SKIP_ARG:
1779      case OP_RREF:      case OP_THEN_ARG:
1780      case OP_NRREF:      cc += cc[1] + PRIV(OP_lengths)[*cc];
1781      case OP_DEF:      break;
1782    
1783      case OP_CALLOUT:      case OP_CALLOUT:
     case OP_SOD:  
     case OP_SOM:  
     case OP_SET_SOM:  
     case OP_EOD:  
     case OP_EODN:  
1784      case OP_CIRC:      case OP_CIRC:
1785      case OP_CIRCM:      case OP_CIRCM:
1786        case OP_CLOSE:
1787        case OP_COMMIT:
1788        case OP_CREF:
1789        case OP_DEF:
1790      case OP_DOLL:      case OP_DOLL:
1791      case OP_DOLLM:      case OP_DOLLM:
1792        case OP_EOD:
1793        case OP_EODN:
1794        case OP_FAIL:
1795        case OP_NCREF:
1796        case OP_NRREF:
1797      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1798        case OP_PRUNE:
1799        case OP_REVERSE:
1800        case OP_RREF:
1801        case OP_SET_SOM:
1802        case OP_SKIP:
1803        case OP_SOD:
1804        case OP_SOM:
1805        case OP_THEN:
1806      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1807      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1808      break;      break;
1809    
1810      /* Handle literal characters */      /* Handle literal characters */
# Line 1585  for (;;) Line 1815  for (;;)
1815      case OP_NOTI:      case OP_NOTI:
1816      branchlength++;      branchlength++;
1817      cc += 2;      cc += 2;
1818  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1819      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1820  #endif  #endif
1821      break;      break;
1822    
# Line 1594  for (;;) Line 1824  for (;;)
1824      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1825    
1826      case OP_EXACT:      case OP_EXACT:
1827        case OP_EXACTI:
1828        case OP_NOTEXACT:
1829        case OP_NOTEXACTI:
1830      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1831      cc += 4;      cc += 2 + IMM2_SIZE;
1832  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1833      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1834  #endif  #endif
1835      break;      break;
1836    
1837      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1838      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1839      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1840      cc += 4;      cc += 1 + IMM2_SIZE + 1;
1841      break;      break;
1842    
1843      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1614  for (;;) Line 1847  for (;;)
1847      cc += 2;      cc += 2;
1848      /* Fall through */      /* Fall through */
1849    
1850        case OP_HSPACE:
1851        case OP_VSPACE:
1852        case OP_NOT_HSPACE:
1853        case OP_NOT_VSPACE:
1854      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1855      case OP_DIGIT:      case OP_DIGIT:
1856      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1626  for (;;) Line 1863  for (;;)
1863      cc++;      cc++;
1864      break;      break;
1865    
1866      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1867        otherwise \C is coded as OP_ALLANY. */
1868    
1869      case OP_ANYBYTE:      case OP_ANYBYTE:
1870      return -2;      return -2;
1871    
1872      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1873    
1874  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1875      case OP_XCLASS:      case OP_XCLASS:
1876      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1877      /* Fall through */      /* Fall through */
1878  #endif  #endif
1879    
1880      case OP_CLASS:      case OP_CLASS:
1881      case OP_NCLASS:      case OP_NCLASS:
1882      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1883    
1884      switch (*cc)      switch (*cc)
1885        {        {
1886          case OP_CRPLUS:
1887          case OP_CRMINPLUS:
1888        case OP_CRSTAR:        case OP_CRSTAR:
1889        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1890        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1653  for (;;) Line 1893  for (;;)
1893    
1894        case OP_CRRANGE:        case OP_CRRANGE:
1895        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1896        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1897        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
1898        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1899        break;        break;
1900    
1901        default:        default:
# Line 1665  for (;;) Line 1905  for (;;)
1905    
1906      /* Anything else is variable length */      /* Anything else is variable length */
1907    
1908      default:      case OP_ANYNL:
1909        case OP_BRAMINZERO:
1910        case OP_BRAPOS:
1911        case OP_BRAPOSZERO:
1912        case OP_BRAZERO:
1913        case OP_CBRAPOS:
1914        case OP_EXTUNI:
1915        case OP_KETRMAX:
1916        case OP_KETRMIN:
1917        case OP_KETRPOS:
1918        case OP_MINPLUS:
1919        case OP_MINPLUSI:
1920        case OP_MINQUERY:
1921        case OP_MINQUERYI:
1922        case OP_MINSTAR:
1923        case OP_MINSTARI:
1924        case OP_MINUPTO:
1925        case OP_MINUPTOI:
1926        case OP_NOTMINPLUS:
1927        case OP_NOTMINPLUSI:
1928        case OP_NOTMINQUERY:
1929        case OP_NOTMINQUERYI:
1930        case OP_NOTMINSTAR:
1931        case OP_NOTMINSTARI:
1932        case OP_NOTMINUPTO:
1933        case OP_NOTMINUPTOI:
1934        case OP_NOTPLUS:
1935        case OP_NOTPLUSI:
1936        case OP_NOTPOSPLUS:
1937        case OP_NOTPOSPLUSI:
1938        case OP_NOTPOSQUERY:
1939        case OP_NOTPOSQUERYI:
1940        case OP_NOTPOSSTAR:
1941        case OP_NOTPOSSTARI:
1942        case OP_NOTPOSUPTO:
1943        case OP_NOTPOSUPTOI:
1944        case OP_NOTQUERY:
1945        case OP_NOTQUERYI:
1946        case OP_NOTSTAR:
1947        case OP_NOTSTARI:
1948        case OP_NOTUPTO:
1949        case OP_NOTUPTOI:
1950        case OP_PLUS:
1951        case OP_PLUSI:
1952        case OP_POSPLUS:
1953        case OP_POSPLUSI:
1954        case OP_POSQUERY:
1955        case OP_POSQUERYI:
1956        case OP_POSSTAR:
1957        case OP_POSSTARI:
1958        case OP_POSUPTO:
1959        case OP_POSUPTOI:
1960        case OP_QUERY:
1961        case OP_QUERYI:
1962        case OP_REF:
1963        case OP_REFI:
1964        case OP_SBRA:
1965        case OP_SBRAPOS:
1966        case OP_SCBRA:
1967        case OP_SCBRAPOS:
1968        case OP_SCOND:
1969        case OP_SKIPZERO:
1970        case OP_STAR:
1971        case OP_STARI:
1972        case OP_TYPEMINPLUS:
1973        case OP_TYPEMINQUERY:
1974        case OP_TYPEMINSTAR:
1975        case OP_TYPEMINUPTO:
1976        case OP_TYPEPLUS:
1977        case OP_TYPEPOSPLUS:
1978        case OP_TYPEPOSQUERY:
1979        case OP_TYPEPOSSTAR:
1980        case OP_TYPEPOSUPTO:
1981        case OP_TYPEQUERY:
1982        case OP_TYPESTAR:
1983        case OP_TYPEUPTO:
1984        case OP_UPTO:
1985        case OP_UPTOI:
1986      return -1;      return -1;
1987    
1988        /* Catch unrecognized opcodes so that when new ones are added they
1989        are not forgotten, as has happened in the past. */
1990    
1991        default:
1992        return -4;
1993      }      }
1994    }    }
1995  /* Control never gets here */  /* Control never gets here */
# Line 1687  length. Line 2010  length.
2010    
2011  Arguments:  Arguments:
2012    code        points to start of expression    code        points to start of expression
2013    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2014    number      the required bracket number or negative to find a lookbehind    number      the required bracket number or negative to find a lookbehind
2015    
2016  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2017  */  */
2018    
2019  const uschar *  const pcre_uchar *
2020  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2021  {  {
2022  for (;;)  for (;;)
2023    {    {
# Line 1712  for (;;) Line 2035  for (;;)
2035    
2036    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
2037      {      {
2038      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
2039      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2040      }      }
2041    
2042    /* Handle capturing bracket */    /* Handle capturing bracket */
# Line 1722  for (;;) Line 2045  for (;;)
2045             c == OP_CBRAPOS || c == OP_SCBRAPOS)             c == OP_CBRAPOS || c == OP_SCBRAPOS)
2046      {      {
2047      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
2048      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2049      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2050      }      }
2051    
2052    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
# Line 1751  for (;;) Line 2074  for (;;)
2074        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2075        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2076        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2077        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2078            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2079        break;        break;
2080    
2081        case OP_MARK:        case OP_MARK:
# Line 1761  for (;;) Line 2085  for (;;)
2085        break;        break;
2086    
2087        case OP_THEN_ARG:        case OP_THEN_ARG:
2088        code += code[1+LINK_SIZE];        code += code[1];
2089        break;        break;
2090        }        }
2091    
2092      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2093    
2094      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2095    
2096    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2097    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2098    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2099    
2100  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2101      if (utf8) switch(c)      if (utf) switch(c)
2102        {        {
2103        case OP_CHAR:        case OP_CHAR:
2104        case OP_CHARI:        case OP_CHARI:
# Line 1804  for (;;) Line 2128  for (;;)
2128        case OP_MINQUERYI:        case OP_MINQUERYI:
2129        case OP_POSQUERY:        case OP_POSQUERY:
2130        case OP_POSQUERYI:        case OP_POSQUERYI:
2131        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2132        break;        break;
2133        }        }
2134  #else  #else
2135      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2136  #endif  #endif
2137      }      }
2138    }    }
# Line 1825  instance of OP_RECURSE. Line 2149  instance of OP_RECURSE.
2149    
2150  Arguments:  Arguments:
2151    code        points to start of expression    code        points to start of expression
2152    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2153    
2154  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2155  */  */
2156    
2157  static const uschar *  static const pcre_uchar *
2158  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2159  {  {
2160  for (;;)  for (;;)
2161    {    {
# Line 1870  for (;;) Line 2194  for (;;)
2194        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2195        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2196        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2197        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2198            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2199        break;        break;
2200    
2201        case OP_MARK:        case OP_MARK:
# Line 1880  for (;;) Line 2205  for (;;)
2205        break;        break;
2206    
2207        case OP_THEN_ARG:        case OP_THEN_ARG:
2208        code += code[1+LINK_SIZE];        code += code[1];
2209        break;        break;
2210        }        }
2211    
2212      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2213    
2214      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2215    
2216      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2217      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
2218      to arrange to skip the extra bytes. */      to arrange to skip the extra bytes. */
2219    
2220  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2221      if (utf8) switch(c)      if (utf) switch(c)
2222        {        {
2223        case OP_CHAR:        case OP_CHAR:
2224        case OP_CHARI:        case OP_CHARI:
# Line 1923  for (;;) Line 2248  for (;;)
2248        case OP_MINQUERYI:        case OP_MINQUERYI:
2249        case OP_POSQUERY:        case OP_POSQUERY:
2250        case OP_POSQUERYI:        case OP_POSQUERYI:
2251        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2252        break;        break;
2253        }        }
2254  #else  #else
2255      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2256  #endif  #endif
2257      }      }
2258    }    }
# Line 1950  bracket whose current branch will alread Line 2275  bracket whose current branch will alread
2275  Arguments:  Arguments:
2276    code        points to start of search    code        points to start of search
2277    endcode     points to where to stop    endcode     points to where to stop
2278    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2279    cd          contains pointers to tables etc.    cd          contains pointers to tables etc.
2280    
2281  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2282  */  */
2283    
2284  static BOOL  static BOOL
2285  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2286    compile_data *cd)    BOOL utf, compile_data *cd)
2287  {  {
2288  register int c;  register int c;
2289  for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2290       code < endcode;       code < endcode;
2291       code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2292    {    {
2293    const uschar *ccode;    const pcre_uchar *ccode;
2294    
2295    c = *code;    c = *code;
2296    
# Line 1982  for (code = first_significant_code(code Line 2307  for (code = first_significant_code(code
2307    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
2308    implies a backward reference subroutine call, we can scan it. If it's a    implies a backward reference subroutine call, we can scan it. If it's a
2309    forward reference subroutine call, we can't. To detect forward reference    forward reference subroutine call, we can't. To detect forward reference
2310    we have to scan up the list that is kept in the workspace. This function is    we have to scan up the list that is kept in the workspace. This function is
2311    called only when doing the real compile, not during the pre-compile that    called only when doing the real compile, not during the pre-compile that
2312    measures the size of the compiled pattern. */    measures the size of the compiled pattern. */
2313    
2314    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2315      {      {
2316      const uschar *scode;      const pcre_uchar *scode;
2317      BOOL empty_branch;      BOOL empty_branch;
2318    
2319      /* Test for forward reference */      /* Test for forward reference */
2320    
2321      for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)      for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2322        if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;        if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2323    
2324      /* Not a forward reference, test for completed backward reference */      /* Not a forward reference, test for completed backward reference */
2325    
2326      empty_branch = FALSE;      empty_branch = FALSE;
2327      scode = cd->start_code + GET(code, 1);      scode = cd->start_code + GET(code, 1);
2328      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2329    
2330      /* Completed backwards reference */      /* Completed backwards reference */
2331    
2332      do      do
2333        {        {
2334        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf, cd))
2335          {          {
2336          empty_branch = TRUE;          empty_branch = TRUE;
2337          break;          break;
# Line 2014  for (code = first_significant_code(code Line 2339  for (code = first_significant_code(code
2339        scode += GET(scode, 1);        scode += GET(scode, 1);
2340        }        }
2341      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2342    
2343      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2344      continue;      continue;
2345      }      }
# Line 2024  for (code = first_significant_code(code Line 2349  for (code = first_significant_code(code
2349    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2350        c == OP_BRAPOSZERO)        c == OP_BRAPOSZERO)
2351      {      {
2352      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2353      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
2354      c = *code;      c = *code;
2355      continue;      continue;
# Line 2045  for (code = first_significant_code(code Line 2370  for (code = first_significant_code(code
2370    
2371    if (c == OP_BRA  || c == OP_BRAPOS ||    if (c == OP_BRA  || c == OP_BRAPOS ||
2372        c == OP_CBRA || c == OP_CBRAPOS ||        c == OP_CBRA || c == OP_CBRAPOS ||
2373        c == OP_ONCE || c == OP_COND)        c == OP_ONCE || c == OP_ONCE_NC ||
2374          c == OP_COND)
2375      {      {
2376      BOOL empty_branch;      BOOL empty_branch;
2377      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 2061  for (code = first_significant_code(code Line 2387  for (code = first_significant_code(code
2387        empty_branch = FALSE;        empty_branch = FALSE;
2388        do        do
2389          {          {
2390          if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2391            empty_branch = TRUE;            empty_branch = TRUE;
2392          code += GET(code, 1);          code += GET(code, 1);
2393          }          }
# Line 2079  for (code = first_significant_code(code Line 2405  for (code = first_significant_code(code
2405      {      {
2406      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2407      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2408      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2409      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2410      here. */      here. */
2411    
2412  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2413      case OP_XCLASS:      case OP_XCLASS:
2414      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2415      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 2091  for (code = first_significant_code(code Line 2417  for (code = first_significant_code(code
2417    
2418      case OP_CLASS:      case OP_CLASS:
2419      case OP_NCLASS:      case OP_NCLASS:
2420      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2421    
2422  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2423      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2424  #endif  #endif
2425    
# Line 2166  for (code = first_significant_code(code Line 2492  for (code = first_significant_code(code
2492      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2493      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2494      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2495      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP
2496          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2497      break;      break;
2498    
2499      /* End of branch */      /* End of branch */
# Line 2181  for (code = first_significant_code(code Line 2508  for (code = first_significant_code(code
2508      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2509      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
2510    
2511  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2512      case OP_STAR:      case OP_STAR:
2513      case OP_STARI:      case OP_STARI:
2514      case OP_MINSTAR:      case OP_MINSTAR:
# Line 2194  for (code = first_significant_code(code Line 2521  for (code = first_significant_code(code
2521      case OP_MINQUERYI:      case OP_MINQUERYI:
2522      case OP_POSQUERY:      case OP_POSQUERY:
2523      case OP_POSQUERYI:      case OP_POSQUERYI:
2524      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2525      break;      break;
2526    
2527      case OP_UPTO:      case OP_UPTO:
# Line 2203  for (code = first_significant_code(code Line 2530  for (code = first_significant_code(code
2530      case OP_MINUPTOI:      case OP_MINUPTOI:
2531      case OP_POSUPTO:      case OP_POSUPTO:
2532      case OP_POSUPTOI:      case OP_POSUPTOI:
2533      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2534      break;      break;
2535  #endif  #endif
2536    
# Line 2217  for (code = first_significant_code(code Line 2544  for (code = first_significant_code(code
2544      break;      break;
2545    
2546      case OP_THEN_ARG:      case OP_THEN_ARG:
2547      code += code[1+LINK_SIZE];      code += code[1];
2548      break;      break;
2549    
2550      /* None of the remaining opcodes are required to match a character. */      /* None of the remaining opcodes are required to match a character. */
# Line 2240  return TRUE; Line 2567  return TRUE;
2567  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2568  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2569  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2570  This function is called only during the real compile, not during the  This function is called only during the real compile, not during the
2571  pre-compile.  pre-compile.
2572    
2573  Arguments:  Arguments:
2574    code        points to start of the recursion    code        points to start of the recursion
2575    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2576    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2577    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2578    cd          pointers to tables etc    cd          pointers to tables etc
2579    
2580  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2581  */  */
2582    
2583  static BOOL  static BOOL
2584  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2585    BOOL utf8, compile_data *cd)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2586  {  {
2587  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2588    {    {
2589    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2590      return FALSE;      return FALSE;
2591    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2592    }    }
# Line 2295  I think. Line 2622  I think.
2622  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2623  It seems that the appearance of a nested POSIX class supersedes an apparent  It seems that the appearance of a nested POSIX class supersedes an apparent
2624  external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or  external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2625  a digit. Also, unescaped square brackets may also appear as part of class  a digit.
2626  names. For example, [:a[:abc]b:] gives unknown class "[:abc]b:]"in Perl.  
2627    In Perl, unescaped square brackets may also appear as part of class names. For
2628    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2629    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2630    seem right at all. PCRE does not allow closing square brackets in POSIX class
2631    names.
2632    
2633  Arguments:  Arguments:
2634    ptr      pointer to the initial [    ptr      pointer to the initial [
# Line 2306  Returns:   TRUE or FALSE Line 2638  Returns:   TRUE or FALSE
2638  */  */
2639    
2640  static BOOL  static BOOL
2641  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2642  {  {
2643  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2644  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2645  for (++ptr; *ptr != 0; ptr++)  for (++ptr; *ptr != 0; ptr++)
2646    {    {
2647    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2648      ptr++;      ptr++;
2649      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2650    else    else
2651      {      {
2652      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
# Line 2325  for (++ptr; *ptr != 0; ptr++) Line 2658  for (++ptr; *ptr != 0; ptr++)
2658           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2659            ptr[1] == CHAR_EQUALS_SIGN) &&            ptr[1] == CHAR_EQUALS_SIGN) &&
2660          check_posix_syntax(ptr, endptr))          check_posix_syntax(ptr, endptr))
2661        return FALSE;        return FALSE;
2662      }      }
2663    }    }
2664  return FALSE;  return FALSE;
# Line 2349  Returns:     a value representing the na Line 2682  Returns:     a value representing the na
2682  */  */
2683    
2684  static int  static int
2685  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2686  {  {
2687  const char *pn = posix_names;  const char *pn = posix_names;
2688  register int yield = 0;  register int yield = 0;
2689  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2690    {    {
2691    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2692      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2693    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2694    yield++;    yield++;
2695    }    }
# Line 2388  value in the reference (which is a group Line 2721  value in the reference (which is a group
2721  Arguments:  Arguments:
2722    group      points to the start of the group    group      points to the start of the group
2723    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
2724    utf8       TRUE in UTF-8 mode    utf        TRUE in UTF-8 / UTF-16 mode
2725    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
2726    save_hwm   the hwm forward reference pointer at the start of the group    save_hwm   the hwm forward reference pointer at the start of the group
2727    
# Line 2396  Returns:     nothing Line 2729  Returns:     nothing
2729  */  */
2730    
2731  static void  static void
2732  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2733    uschar *save_hwm)    pcre_uchar *save_hwm)
2734  {  {
2735  uschar *ptr = group;  pcre_uchar *ptr = group;
2736    
2737  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2738    {    {
2739    int offset;    int offset;
2740    uschar *hc;    pcre_uchar *hc;
2741    
2742    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2743    reference. */    reference. */
# Line 2449  Arguments: Line 2782  Arguments:
2782  Returns:         new code pointer  Returns:         new code pointer
2783  */  */
2784    
2785  static uschar *  static pcre_uchar *
2786  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2787  {  {
2788  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2789  *code++ = 255;  *code++ = 255;
2790  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2791  PUT(code, LINK_SIZE, 0);                       /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2792  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2793  }  }
2794    
2795    
# Line 2478  Returns:             nothing Line 2811  Returns:             nothing
2811  */  */
2812    
2813  static void  static void
2814  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2815  {  {
2816  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2817  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
# Line 2561  switch(ptype) Line 2894  switch(ptype)
2894            prop->chartype == ucp_Lt) == negated;            prop->chartype == ucp_Lt) == negated;
2895    
2896    case PT_GC:    case PT_GC:
2897    return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2898    
2899    case PT_PC:    case PT_PC:
2900    return (pdata == prop->chartype) == negated;    return (pdata == prop->chartype) == negated;
# Line 2572  switch(ptype) Line 2905  switch(ptype)
2905    /* These are specials */    /* These are specials */
2906    
2907    case PT_ALNUM:    case PT_ALNUM:
2908    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2909            _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2910    
2911    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2912    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2913            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2914            == negated;            == negated;
2915    
2916    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2917    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2918            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2919            c == CHAR_FF || c == CHAR_CR)            c == CHAR_FF || c == CHAR_CR)
2920            == negated;            == negated;
2921    
2922    case PT_WORD:    case PT_WORD:
2923    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2924            _pcre_ucp_gentype[prop->chartype] == ucp_N ||            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2925            c == CHAR_UNDERSCORE) == negated;            c == CHAR_UNDERSCORE) == negated;
2926    }    }
2927  return FALSE;  return FALSE;
# Line 2607  sense to automatically possessify the re Line 2940  sense to automatically possessify the re
2940    
2941  Arguments:  Arguments:
2942    previous      pointer to the repeated opcode    previous      pointer to the repeated opcode
2943    utf8          TRUE in UTF-8 mode    utf           TRUE in UTF-8 / UTF-16 mode
2944    ptr           next character in pattern    ptr           next character in pattern
2945    options       options bits    options       options bits
2946    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 2616  Returns:        TRUE if possessifying is Line 2949  Returns:        TRUE if possessifying is
2949  */  */
2950    
2951  static BOOL  static BOOL
2952  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2953    int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2954  {  {
2955  int c, next;  pcre_int32 c, next;
2956  int op_code = *previous++;  int op_code = *previous++;
2957    
2958  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2628  if ((options & PCRE_EXTENDED) != 0) Line 2961  if ((options & PCRE_EXTENDED) != 0)
2961    {    {
2962    for (;;)    for (;;)
2963      {      {
2964      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2965      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2966        {        {
2967        ptr++;        ptr++;
# Line 2636  if ((options & PCRE_EXTENDED) != 0) Line 2969  if ((options & PCRE_EXTENDED) != 0)
2969          {          {
2970          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2971          ptr++;          ptr++;
2972  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2973          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
2974  #endif  #endif
2975          }          }
2976        }        }
# Line 2655  if (*ptr == CHAR_BACKSLASH) Line 2988  if (*ptr == CHAR_BACKSLASH)
2988    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
2989    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
2990    }    }
2991    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
2992    {    {
2993  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2994    if (utf8) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
2995  #endif  #endif
2996    next = *ptr++;    next = *ptr++;
2997    }    }
   
2998  else return FALSE;  else return FALSE;
2999    
3000  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2672  if ((options & PCRE_EXTENDED) != 0) Line 3003  if ((options & PCRE_EXTENDED) != 0)
3003    {    {
3004    for (;;)    for (;;)
3005      {      {
3006      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3007      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3008        {        {
3009        ptr++;        ptr++;
# Line 2680  if ((options & PCRE_EXTENDED) != 0) Line 3011  if ((options & PCRE_EXTENDED) != 0)
3011          {          {
3012          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3013          ptr++;          ptr++;
3014  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3015          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3016  #endif  #endif
3017          }          }
3018        }        }
# Line 2692  if ((options & PCRE_EXTENDED) != 0) Line 3023  if ((options & PCRE_EXTENDED) != 0)
3023  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
3024    
3025  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3026    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3027      return FALSE;      return FALSE;
3028    
3029  /* Now compare the next item with the previous opcode. First, handle cases when  /* Now compare the next item with the previous opcode. First, handle cases when
# Line 2701  the next item is a character. */ Line 3032  the next item is a character. */
3032  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3033    {    {
3034    case OP_CHAR:    case OP_CHAR:
3035  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3036    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3037  #else  #else
3038    c = *previous;    c = *previous;
# Line 2713  if (next >= 0) switch(op_code) Line 3044  if (next >= 0) switch(op_code)
3044    high-valued characters. */    high-valued characters. */
3045    
3046    case OP_CHARI:    case OP_CHARI:
3047  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3048    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3049  #else  #else
3050    c = *previous;    c = *previous;
3051  #endif  #endif
3052    if (c == next) return FALSE;    if (c == next) return FALSE;
3053  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3054    if (utf8)    if (utf)
3055      {      {
3056      unsigned int othercase;      unsigned int othercase;
3057      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2732  if (next >= 0) switch(op_code) Line 3063  if (next >= 0) switch(op_code)
3063      return (unsigned int)c != othercase;      return (unsigned int)c != othercase;
3064      }      }
3065    else    else
3066  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3067    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */
3068    
3069    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
3070    opcodes are not used for multi-byte characters, because they are coded using    opcodes are not used for multi-byte characters, because they are coded using
# Line 2744  if (next >= 0) switch(op_code) Line 3075  if (next >= 0) switch(op_code)
3075    
3076    case OP_NOTI:    case OP_NOTI:
3077    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
3078  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3079    if (utf8)    if (utf)
3080      {      {
3081      unsigned int othercase;      unsigned int othercase;
3082      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2757  if (next >= 0) switch(op_code) Line 3088  if (next >= 0) switch(op_code)
3088      return (unsigned int)c == othercase;      return (unsigned int)c == othercase;
3089      }      }
3090    else    else
3091  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3092    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */
3093    
3094    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3095    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
# Line 2849  switch(op_code) Line 3180  switch(op_code)
3180    {    {
3181    case OP_CHAR:    case OP_CHAR:
3182    case OP_CHARI:    case OP_CHARI:
3183  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3184    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3185  #else  #else
3186    c = *previous;    c = *previous;
# Line 2954  switch(op_code) Line 3285  switch(op_code)
3285        to the original \d etc. At this point, ptr will point to a zero byte. */        to the original \d etc. At this point, ptr will point to a zero byte. */
3286    
3287        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3288          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3289            return FALSE;            return FALSE;
3290    
3291        /* Do the property check. */        /* Do the property check. */
# Line 3032  Arguments: Line 3363  Arguments:
3363    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3364    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3365    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3366    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3367    reqbyteptr     set to the last literal character required, else < 0    reqcharptr     set to the last literal character required, else < 0
3368    bcptr          points to current branch chain    bcptr          points to current branch chain
3369      cond_depth     conditional nesting depth
3370    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3371    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3372                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 3044  Returns:         TRUE on success Line 3376  Returns:         TRUE on success
3376  */  */
3377    
3378  static BOOL  static BOOL
3379  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3380    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3381      pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3382    compile_data *cd, int *lengthptr)    compile_data *cd, int *lengthptr)
3383  {  {
3384  int repeat_type, op_type;  int repeat_type, op_type;
3385  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3386  int bravalue = 0;  int bravalue = 0;
3387  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3388  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
3389  int zeroreqbyte, zerofirstbyte;  pcre_int32 zeroreqchar, zerofirstchar;
3390  int req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
3391  int options = *optionsptr;               /* May change dynamically */  int options = *optionsptr;               /* May change dynamically */
3392  int after_manual_callout = 0;  int after_manual_callout = 0;
3393  int length_prevgroup = 0;  int length_prevgroup = 0;
3394  register int c;  register int c;
3395  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3396  uschar *last_code = code;  pcre_uchar *last_code = code;
3397  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3398  uschar *tempcode;  pcre_uchar *tempcode;
3399  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3400  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3401  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3402  const uschar *tempptr;  const pcre_uchar *tempptr;
3403  const uschar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
3404  uschar *previous = NULL;  pcre_uchar *previous = NULL;
3405  uschar *previous_callout = NULL;  pcre_uchar *previous_callout = NULL;
3406  uschar *save_hwm = NULL;  pcre_uchar *save_hwm = NULL;
3407  uschar classbits[32];  pcre_uint8 classbits[32];
3408    
3409  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3410  must not do this for other options (e.g. PCRE_EXTENDED) because they may change  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3411  dynamically as we process the pattern. */  dynamically as we process the pattern. */
3412    
3413  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3414  BOOL class_utf8;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3415  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
3416  uschar *class_utf8data;  pcre_uchar utf_chars[6];
 uschar *class_utf8data_base;  
 uschar utf8_char[6];  
3417  #else  #else
3418  BOOL utf8 = FALSE;  BOOL utf = FALSE;
3419  uschar *utf8_char = NULL;  #endif
3420    
3421    /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3422    
3423    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3424    BOOL xclass;
3425    pcre_uchar *class_uchardata;
3426    pcre_uchar *class_uchardata_base;
3427  #endif  #endif
3428    
3429  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 3099  greedy_non_default = greedy_default ^ 1; Line 3437  greedy_non_default = greedy_default ^ 1;
3437    
3438  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3439  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3440  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3441  find one.  find one.
3442    
3443  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3444  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3445  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3446  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3447    
3448  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3449    
3450  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3451  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3452  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS leaves the lower 28 bits empty. It is added into the
3453  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3454    value. This is used only for ASCII characters. */
3455    
3456  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3457    
3458  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3459    
# Line 3126  for (;; ptr++) Line 3465  for (;; ptr++)
3465    BOOL is_quantifier;    BOOL is_quantifier;
3466    BOOL is_recurse;    BOOL is_recurse;
3467    BOOL reset_bracount;    BOOL reset_bracount;
3468    int class_charcount;    int class_has_8bitchar;
3469    int class_lastchar;    int class_single_char;
3470    int newoptions;    int newoptions;
3471    int recno;    int recno;
3472    int refsign;    int refsign;
3473    int skipbytes;    int skipbytes;
3474    int subreqbyte;    int subreqchar;
3475    int subfirstbyte;    int subfirstchar;
3476    int terminator;    int terminator;
3477    int mclength;    int mclength;
3478    uschar mcbuffer[8];    int tempbracount;
3479      pcre_uchar mcbuffer[8];
3480    
3481    /* Get next byte in the pattern */    /* Get next character in the pattern */
3482    
3483    c = *ptr;    c = *ptr;
3484    
# Line 3160  for (;; ptr++) Line 3500  for (;; ptr++)
3500  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3501      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3502  #endif  #endif
3503      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3504            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3505        {        {
3506        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3507        goto FAILED;        goto FAILED;
# Line 3183  for (;; ptr++) Line 3524  for (;; ptr++)
3524        }        }
3525    
3526      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3527      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3528          (int)(code - last_code), c, c));
3529    
3530      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3531      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 3193  for (;; ptr++) Line 3535  for (;; ptr++)
3535        {        {
3536        if (previous > orig_code)        if (previous > orig_code)
3537          {          {
3538          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3539          code -= previous - orig_code;          code -= previous - orig_code;
3540          previous = orig_code;          previous = orig_code;
3541          }          }
# Line 3209  for (;; ptr++) Line 3551  for (;; ptr++)
3551    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3552    reference list. */    reference list. */
3553    
3554    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3555               WORK_SIZE_SAFETY_MARGIN)
3556      {      {
3557      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3558      goto FAILED;      goto FAILED;
# Line 3261  for (;; ptr++) Line 3604  for (;; ptr++)
3604    
3605    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3606      {      {
3607      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3608      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3609        {        {
3610        ptr++;        ptr++;
# Line 3269  for (;; ptr++) Line 3612  for (;; ptr++)
3612          {          {
3613          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3614          ptr++;          ptr++;
3615  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3616          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3617  #endif  #endif
3618          }          }
3619        if (*ptr != 0) continue;        if (*ptr != 0) continue;
# Line 3294  for (;; ptr++) Line 3637  for (;; ptr++)
3637      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3638      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3639      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
3640      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3641      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
3642      *codeptr = code;      *codeptr = code;
3643      *ptrptr = ptr;      *ptrptr = ptr;
3644      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 3319  for (;; ptr++) Line 3662  for (;; ptr++)
3662      previous = NULL;      previous = NULL;
3663      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3664        {        {
3665        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3666        *code++ = OP_CIRCM;        *code++ = OP_CIRCM;
3667        }        }
3668      else *code++ = OP_CIRC;      else *code++ = OP_CIRC;
# Line 3331  for (;; ptr++) Line 3674  for (;; ptr++)
3674      break;      break;
3675    
3676      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3677      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3678    
3679      case CHAR_DOT:      case CHAR_DOT:
3680      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3681      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
3682      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
3683      previous = code;      previous = code;
3684      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3685      break;      break;
# Line 3391  for (;; ptr++) Line 3734  for (;; ptr++)
3734          {          {
3735          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
3736            ptr++;            ptr++;
3737          else if (strncmp((const char *)ptr+1,          else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
                           STR_Q STR_BACKSLASH STR_E, 3) == 0)  
3738            ptr += 3;            ptr += 3;
3739          else          else
3740            break;            break;
# Line 3411  for (;; ptr++) Line 3753  for (;; ptr++)
3753          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3754        {        {
3755        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3756        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3757        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
3758        break;        break;
3759        }        }
3760    
# Line 3422  for (;; ptr++) Line 3764  for (;; ptr++)
3764    
3765      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3766    
3767      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3768      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3769      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3770        a single character. */
3771    
3772      class_charcount = 0;      class_has_8bitchar = 0;
3773      class_lastchar = -1;      class_single_char = 0;
3774    
3775      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3776      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
3777      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3778      */      */
3779    
3780      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3781    
3782  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3783      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;                           /* No chars >= 256 */
3784      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3785      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3786  #endif  #endif
3787    
3788      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3448  for (;; ptr++) Line 3791  for (;; ptr++)
3791    
3792      if (c != 0) do      if (c != 0) do
3793        {        {
3794        const uschar *oldptr;        const pcre_uchar *oldptr;
3795    
3796  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3797        if (utf8 && c > 127)        if (utf && HAS_EXTRALEN(c))
3798          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3799          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3800          }          }
3801    #endif
3802    
3803        /* In the pre-compile phase, accumulate the length of any UTF-8 extra  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3804          /* In the pre-compile phase, accumulate the length of any extra
3805        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
3806        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
3807        (which is on the stack). */        (which is on the stack). */
3808    
3809        if (lengthptr != NULL)        if (lengthptr != NULL)
3810          {          {
3811          *lengthptr += class_utf8data - class_utf8data_base;          *lengthptr += class_uchardata - class_uchardata_base;
3812          class_utf8data = class_utf8data_base;          class_uchardata = class_uchardata_base;
3813          }          }
   
3814  #endif  #endif
3815    
3816        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 3494  for (;; ptr++) Line 3838  for (;; ptr++)
3838          {          {
3839          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3840          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3841          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3842          uschar pbits[32];          pcre_uint8 pbits[32];
3843    
3844          if (ptr[1] != CHAR_COLON)          if (ptr[1] != CHAR_COLON)
3845            {            {
# Line 3550  for (;; ptr++) Line 3894  for (;; ptr++)
3894          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3895    
3896          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3897            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3898    
3899          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3900    
# Line 3581  for (;; ptr++) Line 3925  for (;; ptr++)
3925            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3926    
3927          ptr = tempptr + 1;          ptr = tempptr + 1;
3928          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3929            class_has_8bitchar = 1;
3930            /* Every class contains at least two characters. */
3931            class_single_char = 2;
3932          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3933          }          }
3934    
3935        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3936        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3937        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3938        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3939        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3940        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3941          as literal characters (by default), or are faulted if
3942        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3943    
3944        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3599  for (;; ptr++) Line 3947  for (;; ptr++)
3947          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3948    
3949          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3950            else if (-c == ESC_N)            /* \N is not supported in a class */
3951              {
3952              *errorcodeptr = ERR71;
3953              goto FAILED;
3954              }
3955          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3956            {            {
3957            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3612  for (;; ptr++) Line 3965  for (;; ptr++)
3965    
3966          if (c < 0)          if (c < 0)
3967            {            {
3968            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3969            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
3970              class_has_8bitchar++;
3971              /* Every class contains at least two characters. */
3972              class_single_char += 2;
3973    
3974            switch (-c)            switch (-c)
3975              {              {
# Line 3626  for (;; ptr++) Line 3982  for (;; ptr++)
3982              case ESC_SU:              case ESC_SU:
3983              nestptr = ptr;              nestptr = ptr;
3984              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3985              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
3986              continue;              continue;
3987  #endif  #endif
3988              case ESC_d:              case ESC_d:
# Line 3667  for (;; ptr++) Line 4023  for (;; ptr++)
4023              SETBIT(classbits, 0x09); /* HT */              SETBIT(classbits, 0x09); /* HT */
4024              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
4025              SETBIT(classbits, 0xa0); /* NBSP */              SETBIT(classbits, 0xa0); /* NBSP */
4026  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4027              if (utf8)              xclass = TRUE;
4028                *class_uchardata++ = XCL_SINGLE;
4029                *class_uchardata++ = 0x1680;
4030                *class_uchardata++ = XCL_SINGLE;
4031                *class_uchardata++ = 0x180e;
4032                *class_uchardata++ = XCL_RANGE;
4033                *class_uchardata++ = 0x2000;
4034                *class_uchardata++ = 0x200a;
4035                *class_uchardata++ = XCL_SINGLE;
4036                *class_uchardata++ = 0x202f;
4037                *class_uchardata++ = XCL_SINGLE;
4038                *class_uchardata++ = 0x205f;
4039                *class_uchardata++ = XCL_SINGLE;
4040                *class_uchardata++ = 0x3000;
4041    #elif defined SUPPORT_UTF
4042                if (utf)
4043                {                {
4044                class_utf8 = TRUE;                xclass = TRUE;
4045                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4046                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4047                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4048                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4049                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4050                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4051                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4052                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4053                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4054                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4055                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4056                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4057                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4058                }                }
4059  #endif  #endif
4060              continue;              continue;
# Line 3701  for (;; ptr++) Line 4072  for (;; ptr++)
4072                  }                  }
4073                classbits[c] |= x;                classbits[c] |= x;
4074                }                }
4075    #ifndef COMPILE_PCRE8
4076  #ifdef SUPPORT_UTF8              xclass = TRUE;
4077              if (utf8)              *class_uchardata++ = XCL_RANGE;
4078                *class_uchardata++ = 0x0100;
4079                *class_uchardata++ = 0x167f;
4080                *class_uchardata++ = XCL_RANGE;
4081                *class_uchardata++ = 0x1681;
4082                *class_uchardata++ = 0x180d;
4083                *class_uchardata++ = XCL_RANGE;
4084                *class_uchardata++ = 0x180f;
4085                *class_uchardata++ = 0x1fff;
4086                *class_uchardata++ = XCL_RANGE;
4087                *class_uchardata++ = 0x200b;
4088                *class_uchardata++ = 0x202e;
4089                *class_uchardata++ = XCL_RANGE;
4090                *class_uchardata++ = 0x2030;
4091                *class_uchardata++ = 0x205e;
4092                *class_uchardata++ = XCL_RANGE;
4093                *class_uchardata++ = 0x2060;
4094                *class_uchardata++ = 0x2fff;
4095                *class_uchardata++ = XCL_RANGE;
4096                *class_uchardata++ = 0x3001;
4097    #ifdef SUPPORT_UTF
4098                if (utf)
4099                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4100                else
4101    #endif
4102                  *class_uchardata++ = 0xffff;
4103    #elif defined SUPPORT_UTF
4104                if (utf)
4105                {                {
4106                class_utf8 = TRUE;                xclass = TRUE;
4107                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4108                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4109                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4110                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4111                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4112                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4113                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4114                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4115                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4116                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4117                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4118                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4119                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4120                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4121                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4122                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4123                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4124                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4125                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4126                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4127                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4128                }                }
4129  #endif  #endif
4130              continue;              continue;
# Line 3737  for (;; ptr++) Line 4135  for (;; ptr++)
4135              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, 0x0c); /* FF */
4136              SETBIT(classbits, 0x0d); /* CR */              SETBIT(classbits, 0x0d); /* CR */
4137              SETBIT(classbits, 0x85); /* NEL */              SETBIT(classbits, 0x85); /* NEL */
4138  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4139              if (utf8)              xclass = TRUE;
4140                *class_uchardata++ = XCL_RANGE;
4141                *class_uchardata++ = 0x2028;
4142                *class_uchardata++ = 0x2029;
4143    #elif defined SUPPORT_UTF
4144                if (utf)
4145                {                {
4146                class_utf8 = TRUE;                xclass = TRUE;
4147                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4148                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4149                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4150                }                }
4151  #endif  #endif
4152              continue;              continue;
# Line 3765  for (;; ptr++) Line 4168  for (;; ptr++)
4168                classbits[c] |= x;                classbits[c] |= x;
4169                }                }
4170    
4171  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4172              if (utf8)              xclass = TRUE;
4173                *class_uchardata++ = XCL_RANGE;
4174                *class_uchardata++ = 0x0100;
4175                *class_uchardata++ = 0x2027;
4176                *class_uchardata++ = XCL_RANGE;
4177                *class_uchardata++ = 0x202a;
4178    #ifdef SUPPORT_UTF
4179                if (utf)
4180                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4181                else
4182    #endif
4183                  *class_uchardata++ = 0xffff;
4184    #elif defined SUPPORT_UTF
4185                if (utf)
4186                {                {
4187                class_utf8 = TRUE;                xclass = TRUE;
4188                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4189                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4190                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4191                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4192                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4193                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4194                }                }
4195  #endif  #endif
4196              continue;              continue;
# Line 3787  for (;; ptr++) Line 4203  for (;; ptr++)
4203                int pdata;                int pdata;
4204                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4205                if (ptype < 0) goto FAILED;                if (ptype < 0) goto FAILED;
4206                class_utf8 = TRUE;                xclass = TRUE;
4207                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_uchardata++ = ((-c == ESC_p) != negated)?
4208                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4209                *class_utf8data++ = ptype;                *class_uchardata++ = ptype;
4210                *class_utf8data++ = pdata;                *class_uchardata++ = pdata;
4211                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4212                continue;                continue;
4213                }                }
4214  #endif  #endif
# Line 3806  for (;; ptr++) Line 4222  for (;; ptr++)
4222                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4223                goto FAILED;                goto FAILED;
4224                }                }
4225              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4226              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4227                c = *ptr;                /* Get the final character and fall through */
4228              break;              break;
4229              }              }
4230            }            }
4231    
4232          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4233          greater than 256 in UTF-8 mode. */          greater than 256. */
4234    
4235          }   /* End of backslash handling */          }   /* End of backslash handling */
4236    
# Line 3861  for (;; ptr++) Line 4278  for (;; ptr++)
4278            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4279            }            }
4280    
4281  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4282          if (utf8)          if (utf)
4283            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4284            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4285            }            }
# Line 3906  for (;; ptr++) Line 4323  for (;; ptr++)
4323    
4324          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4325    
4326            /* Since we found a character range, single character optimizations
4327            cannot be done anymore. */
4328            class_single_char = 2;
4329    
4330          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4331          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4332          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4333          available. */          available. */
4334    
4335  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4336          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4337    #elif defined  SUPPORT_UTF
4338            if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4339    #elif !(defined COMPILE_PCRE8)
4340            if (d > 255)
4341    #endif
4342    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4343            {            {
4344            class_utf8 = TRUE;            xclass = TRUE;
4345    
4346            /* With UCP support, we can find the other case equivalents of            /* With UCP support, we can find the other case equivalents of
4347            the relevant characters. There may be several ranges. Optimize how            the relevant characters. There may be several ranges. Optimize how
4348            they fit with the basic range. */            they fit with the basic range. */
4349    
4350  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4351    #ifndef COMPILE_PCRE8
4352              if (utf && (options & PCRE_CASELESS) != 0)
4353    #else
4354            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4355    #endif
4356              {              {
4357              unsigned int occ, ocd;              unsigned int occ, ocd;
4358              unsigned int cc = c;              unsigned int cc = c;
# Line 3947  for (;; ptr++) Line 4378  for (;; ptr++)
4378    
4379                if (occ == ocd)                if (occ == ocd)
4380                  {                  {
4381                  *class_utf8data++ = XCL_SINGLE;                  *class_uchardata++ = XCL_SINGLE;
4382                  }                  }
4383                else                else
4384                  {                  {
4385                  *class_utf8data++ = XCL_RANGE;                  *class_uchardata++ = XCL_RANGE;
4386                  class_utf8data += _pcre_ord2utf8(occ, class_utf8data);                  class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4387                  }                  }
4388                class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);                class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4389                }                }
4390              }              }
4391  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 3962  for (;; ptr++) Line 4393  for (;; ptr++)
4393            /* Now record the original range, possibly modified for UCP caseless            /* Now record the original range, possibly modified for UCP caseless
4394            overlapping ranges. */            overlapping ranges. */
4395    
4396            *class_utf8data++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4397            class_utf8data += _pcre_ord2utf8(c, class_utf8data);  #ifdef SUPPORT_UTF
4398            class_utf8data += _pcre_ord2utf8(d, class_utf8data);  #ifndef COMPILE_PCRE8
4399              if (utf)
4400                {
4401                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4402                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4403                }
4404              else
4405                {
4406                *class_uchardata++ = c;
4407                *class_uchardata++ = d;
4408                }
4409    #else
4410              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4411              class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4412    #endif
4413    #else /* SUPPORT_UTF */
4414              *class_uchardata++ = c;
4415              *class_uchardata++ = d;
4416    #endif /* SUPPORT_UTF */
4417    
4418            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4419            caseless matching for UTF-8 characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
4420            for the smaller ones. */            for the smaller ones. As for 16 bit characters without UTF, we
4421              can still use the bit map for characters below 256. */
4422    
4423  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4424            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4425  #else            if (utf)
4426            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  #endif
4427                continue;    /* With next character in the class */
4428    #endif  /* SUPPORT_UCP */
4429    
4430    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4431              if (utf)
4432                {
4433                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4434                /* Adjust upper limit and fall through to set up the map */
4435                d = 127;
4436                }
4437              else
4438                {
4439                if (c > 255) continue;
4440                /* Adjust upper limit and fall through to set up the map */
4441                d = 255;
4442                }
4443    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4444              if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4445            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
   
4446            d = 127;            d = 127;
4447    #else
4448  #endif  /* SUPPORT_UCP */            if (c > 255) continue;
4449              /* Adjust upper limit and fall through to set up the map */
4450              d = 255;
4451    #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4452            }            }
4453  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4454    
4455          /* We use the bit map for all cases when not in UTF-8 mode; else          /* We use the bit map for 8 bit mode, or when the characters fall
4456          ranges that lie entirely within 0-127 when there is UCP support; else          partially or entirely to [0-255] ([0-127] for UCP) ranges. */
         for partial ranges without UCP support. */  
4457    
4458          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4459    
4460          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4461    
# Line 3997  for (;; ptr++) Line 4464  for (;; ptr++)
4464            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4465            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4466              {              {
4467              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c]; /* flip case */
4468              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
4469              }              }
4470            }            }
# Line 4011  for (;; ptr++) Line 4478  for (;; ptr++)
4478    
4479        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4480    
4481        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4482          if (class_single_char < 2) class_single_char++;
4483    
4484  #ifdef SUPPORT_UTF8        /* If class_single_char is 1, we saw precisely one character. As long as
4485        if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))        there were no negated characters >= 128 and there was no use of \p or \P,
4486          in other words, no use of any XCLASS features, we can optimize.
4487    
4488          In UTF-8 mode, we can optimize the negative case only if there were no
4489          characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4490          operate on single-byte characters only. This is an historical hangover.
4491          Maybe one day we can tidy these opcodes to handle multi-byte characters.
4492    
4493          The optimization throws away the bit map. We turn the item into a
4494          1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4495          Note that OP_NOT[I] does not support multibyte characters. In the positive
4496          case, it can cause firstchar to be set. Otherwise, there can be no first
4497          char if this item is first, whatever repeat count may follow. In the case
4498          of reqchar, save the previous value for reinstating. */
4499    
4500    #ifdef SUPPORT_UTF
4501          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
4502            && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4503    #else
4504          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4505    #endif
4506          {          {
4507          class_utf8 = TRUE;          ptr++;
4508          *class_utf8data++ = XCL_SINGLE;          zeroreqchar = reqchar;
4509          class_utf8data += _pcre_ord2utf8(c, class_utf8data);  
4510            /* The OP_NOT[I] opcodes work on single characters only. */
4511    
4512            if (negate_class)
4513              {
4514              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4515              zerofirstchar = firstchar;
4516              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4517              *code++ = c;
4518              goto NOT_CHAR;
4519              }
4520    
4521            /* For a single, positive character, get the value into mcbuffer, and
4522            then we can handle this with the normal one-character code. */
4523    
4524    #ifdef SUPPORT_UTF
4525            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4526              mclength = PRIV(ord2utf)(c, mcbuffer);
4527            else
4528    #endif
4529              {
4530              mcbuffer[0] = c;
4531              mclength = 1;
4532              }
4533            goto ONE_CHAR;
4534            }       /* End of 1-char optimization */
4535    
4536          /* Handle a character that cannot go in the bit map. */
4537    
4538    #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4539          if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4540    #elif defined SUPPORT_UTF
4541          if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4542    #elif !(defined COMPILE_PCRE8)
4543          if (c > 255)
4544    #endif
4545    
4546    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4547            {
4548            xclass = TRUE;
4549            *class_uchardata++ = XCL_SINGLE;
4550    #ifdef SUPPORT_UTF
4551    #ifndef COMPILE_PCRE8
4552            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4553            if (!utf)
4554              *class_uchardata++ = c;
4555            else
4556    #endif
4557              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4558    #else /* SUPPORT_UTF */
4559            *class_uchardata++ = c;
4560    #endif /* SUPPORT_UTF */
4561    
4562  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4563    #ifdef COMPILE_PCRE8
4564          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4565    #else
4566            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4567            if (utf && (options & PCRE_CASELESS) != 0)
4568    #endif
4569            {            {
4570            unsigned int othercase;            unsigned int othercase;
4571            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((othercase = UCD_OTHERCASE(c)) != c)
4572              {              {
4573              *class_utf8data++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4574              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4575              }              }
4576            }            }
4577  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
4578    
4579          }          }
4580        else        else
4581  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4582    
4583        /* Handle a single-byte character */        /* Handle a single-byte character */
4584          {          {
4585            class_has_8bitchar = 1;
4586          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4587          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4588            {            {
4589            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c]; /* flip case */
4590            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4591            }            }
         class_charcount++;  
         class_lastchar = c;  
4592          }          }
4593        }        }
4594    
# Line 4066  for (;; ptr++) Line 4609  for (;; ptr++)
4609        goto FAILED;        goto FAILED;
4610        }        }
4611    
4612      /* If class_charcount is 1, we saw precisely one character whose value is      /* If this is the first thing in the branch, there can be no first char
4613      less than 256. As long as there were no characters >= 128 and there was no      setting, whatever the repeat count. Any reqchar setting must remain
4614      use of \p or \P, in other words, no use of any XCLASS features, we can      unchanged after any kind of repeat. */
4615      optimize.  
4616        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4617      In UTF-8 mode, we can optimize the negative case only if there were no      zerofirstchar = firstchar;
4618      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR      zeroreqchar = reqchar;
     operate on single-bytes characters only. This is an historical hangover.  
     Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
     Note that OP_NOT[I] does not support multibyte characters. In the positive  
     case, it can cause firstbyte to be set. Otherwise, there can be no first  
     char if this item is first, whatever repeat count may follow. In the case  
     of reqbyte, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF8  
     if (class_charcount == 1 && !class_utf8 &&  
       (!utf8 || !negate_class || class_lastchar < 128))  
 #else  
     if (class_charcount == 1)  
 #endif  
       {  
       zeroreqbyte = reqbyte;  
   
       /* The OP_NOT[I] opcodes work on one-byte characters only. */  
   
       if (negate_class)  
         {  
         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
         zerofirstbyte = firstbyte;  
         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF8  
       if (utf8 && class_lastchar > 127)  
         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqbyte setting must remain unchanged after any kind of  
     repeat. */  
   
     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
     zerofirstbyte = firstbyte;  
     zeroreqbyte = reqbyte;  
4619    
4620      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4621      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
# Line 4135  for (;; ptr++) Line 4625  for (;; ptr++)
4625      be listed) there are no characters < 256, we can omit the bitmap in the      be listed) there are no characters < 256, we can omit the bitmap in the
4626      actual compiled code. */      actual compiled code. */
4627    
4628  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4629      if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))      if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4630    #elif !defined COMPILE_PCRE8
4631        if (xclass && !should_flip_negation)
4632    #endif
4633    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4634        {        {
4635        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4636        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
4637        code += LINK_SIZE;        code += LINK_SIZE;
4638        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT:0;
4639    
4640        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4641        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4642    
4643        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4644          {          {
4645          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4646          memmove(code + 32, code, class_utf8data - code);          memmove(code + (32 / sizeof(pcre_uchar)), code,
4647              IN_UCHARS(class_uchardata - code));
4648          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
4649          code = class_utf8data + 32;          code = class_uchardata + (32 / sizeof(pcre_uchar));
4650          }          }
4651        else code = class_utf8data;        else code = class_uchardata;
4652    
4653        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4654    
4655        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4656        break;   /* End of class handling */        break;   /* End of class handling */
4657        }        }
4658  #endif  #endif
# Line 4169  for (;; ptr++) Line 4664  for (;; ptr++)
4664      negating it if necessary. */      negating it if necessary. */
4665    
4666      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4667      if (negate_class)      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
       {  
       if (lengthptr == NULL)    /* Save time in the pre-compile phase */  
         for (c = 0; c < 32; c++) code[c] = ~classbits[c];  
       }  
     else  
4668        {        {
4669          if (negate_class)
4670            for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4671        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4672        }        }
4673      code += 32;      code += 32 / sizeof(pcre_uchar);
4674        NOT_CHAR:
4675      break;      break;
4676    
4677    
# Line 4215  for (;; ptr++) Line 4708  for (;; ptr++)
4708    
4709      if (repeat_min == 0)      if (repeat_min == 0)
4710        {        {
4711        firstbyte = zerofirstbyte;    /* Adjust for zero repeat */        firstchar = zerofirstchar;    /* Adjust for zero repeat */
4712        reqbyte = zeroreqbyte;        /* Ditto */        reqchar = zeroreqchar;        /* Ditto */
4713        }        }
4714    
4715      /* Remember whether this is a variable length repeat */      /* Remember whether this is a variable length repeat */
# Line 4249  for (;; ptr++) Line 4742  for (;; ptr++)
4742        ptr++;        ptr++;
4743        }        }
4744      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4745    
4746      /* If previous was a recursion call, wrap it in atomic brackets so that      /* If previous was a recursion call, wrap it in atomic brackets so that
4747      previous becomes the atomic group. All recursions were so wrapped in the      previous becomes the atomic group. All recursions were so wrapped in the
4748      past, but it no longer happens for non-repeated recursions. In fact, the      past, but it no longer happens for non-repeated recursions. In fact, the
4749      repeated ones could be re-implemented independently so as not to need this,      repeated ones could be re-implemented independently so as not to need this,
4750      but for the moment we rely on the code for repeating groups. */      but for the moment we rely on the code for repeating groups. */
4751    
4752      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4753        {        {
4754        memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4755        *previous = OP_ONCE;        *previous = OP_ONCE;
4756        PUT(previous, 1, 2 + 2*LINK_SIZE);        PUT(previous, 1, 2 + 2*LINK_SIZE);
4757        previous[2 + 2*LINK_SIZE] = OP_KET;        previous[2 + 2*LINK_SIZE] = OP_KET;
4758        PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);        PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4759        code += 2 + 2 * LINK_SIZE;        code += 2 + 2 * LINK_SIZE;
4760        length_prevgroup = 3 + 3*LINK_SIZE;        length_prevgroup = 3 + 3*LINK_SIZE;
4761    
4762        /* When actually compiling, we need to check whether this was a forward        /* When actually compiling, we need to check whether this was a forward
4763        reference, and if so, adjust the offset. */        reference, and if so, adjust the offset. */
4764    
4765        if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)        if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4766          {          {
4767          int offset = GET(cd->hwm, -LINK_SIZE);          int offset = GET(cd->hwm, -LINK_SIZE);
4768          if (offset == previous + 1 - cd->start_code)          if (offset == previous + 1 - cd->start_code)
4769            PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);            PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4770          }          }
4771        }        }
4772    
4773      /* Now handle repetition for the different types of item. */      /* Now handle repetition for the different types of item. */
4774    
4775      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4776      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
4777      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqchar - it might not be if a sequence such as x{3} is
4778      the first thing in a branch because the x will have gone into firstbyte      the first thing in a branch because the x will have gone into firstchar
4779      instead.  */      instead.  */
4780    
4781      if (*previous == OP_CHAR || *previous == OP_CHARI)      if (*previous == OP_CHAR || *previous == OP_CHARI)
4782        {        {
4783        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4784    
4785        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF characters that take up more than one character. It's
4786        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4787        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus UTF_LENGTH to flag that
4788        length rather than a small character. */        it's a length rather than a small character. */
4789    
4790  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4791        if (utf8 && (code[-1] & 0x80) != 0)        if (utf && NOT_FIRSTCHAR(code[-1]))
4792          {          {
4793          uschar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4794          while((*lastchar & 0xc0) == 0x80) lastchar--;          BACKCHAR(lastchar);
4795          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4796          memcpy(utf8_char, lastchar, c); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4797          c |= 0x80;                      /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4798          }          }
4799        else        else
4800  #endif  #endif /* SUPPORT_UTF */
   
       /* Handle the case of a single byte - either with no UTF8 support, or  
       with UTF-8 disabled, or for a UTF-8 character < 128. */  
4801    
4802          /* Handle the case of a single character - either with no UTF support, or
4803          with UTF disabled, or for a single character UTF character. */
4804          {          {
4805          c = code[-1];          c = code[-1];
4806          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
4807          }          }
4808    
4809        /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
# Line 4321  for (;; ptr++) Line 4813  for (;; ptr++)
4813    
4814        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4815            repeat_max < 0 &&            repeat_max < 0 &&
4816            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4817          {          {
4818          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4819          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4342  for (;; ptr++) Line 4834  for (;; ptr++)
4834        c = previous[1];        c = previous[1];
4835        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4836            repeat_max < 0 &&            repeat_max < 0 &&
4837            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4838          {          {
4839          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4840          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4359  for (;; ptr++) Line 4851  for (;; ptr++)
4851    
4852      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
4853        {        {
4854        uschar *oldcode;        pcre_uchar *oldcode;
4855        int prop_type, prop_value;        int prop_type, prop_value;
4856        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4857        c = *previous;        c = *previous;
4858    
4859        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4860            repeat_max < 0 &&            repeat_max < 0 &&
4861            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4862          {          {
4863          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4864          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4446  for (;; ptr++) Line 4938  for (;; ptr++)
4938          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
4939          Unicode property match, there are two extra bytes that define the          Unicode property match, there are two extra bytes that define the
4940          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
4941          c, with the 0x80 bit as a flag. */          c, with the UTF_LENGTH bit as a flag. */
4942    
4943          if (repeat_max < 0)          if (repeat_max < 0)
4944            {            {
4945  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4946            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4947              {              {
4948              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4949              code += c & 7;              code += c & 7;
4950              }              }
4951            else            else
# Line 4475  for (;; ptr++) Line 4967  for (;; ptr++)
4967    
4968          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
4969            {            {
4970  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4971            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4972              {              {
4973              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4974              code += c & 7;              code += c & 7;
4975              }              }
4976            else            else
# Line 4505  for (;; ptr++) Line 4997  for (;; ptr++)
4997    
4998        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
4999    
5000  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5001        if (utf8 && c >= 128)        if (utf && (c & UTF_LENGTH) != 0)
5002          {          {
5003          memcpy(code, utf8_char, c & 7);          memcpy(code, utf_chars, IN_UCHARS(c & 7));
5004          code += c & 7;          code += c & 7;
5005          }          }
5006        else        else
# Line 4532  for (;; ptr++) Line 5024  for (;; ptr++)
5024    
5025      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
5026               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
5027  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5028               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
5029  #endif  #endif
5030               *previous == OP_REF ||               *previous == OP_REF ||
# Line 4574  for (;; ptr++) Line 5066  for (;; ptr++)
5066      opcodes such as BRA and CBRA, as this is the place where they get converted      opcodes such as BRA and CBRA, as this is the place where they get converted
5067      into the more special varieties such as BRAPOS and SBRA. A test for >=      into the more special varieties such as BRAPOS and SBRA. A test for >=
5068      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5069      ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow      ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5070      repetition of assertions, but now it does, for Perl compatibility. */      repetition of assertions, but now it does, for Perl compatibility. */
5071    
5072      else if (*previous >= OP_ASSERT && *previous <= OP_COND)      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5073        {        {
5074        register int i;        register int i;
5075        int len = (int)(code - previous);        int len = (int)(code - previous);
5076        uschar *bralink = NULL;        pcre_uchar *bralink = NULL;
5077        uschar *brazeroptr = NULL;        pcre_uchar *brazeroptr = NULL;
5078    
5079        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5080        we just ignore the repeat. */        we just ignore the repeat. */
5081    
5082        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5083          goto END_REPEAT;          goto END_REPEAT;
5084    
5085        /* There is no sense in actually repeating assertions. The only potential        /* There is no sense in actually repeating assertions. The only potential
5086        use of repetition is in cases when the assertion is optional. Therefore,        use of repetition is in cases when the assertion is optional. Therefore,
5087        if the minimum is greater than zero, just ignore the repeat. If the        if the minimum is greater than zero, just ignore the repeat. If the
5088        maximum is not zero or one, set it to 1. */        maximum is not zero or one, set it to 1. */
5089    
5090        if (*previous < OP_ONCE)    /* Assertion */        if (*previous < OP_ONCE)    /* Assertion */
5091          {          {
5092          if (repeat_min > 0) goto END_REPEAT;          if (repeat_min > 0) goto END_REPEAT;
5093          if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;          if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5094          }          }
5095    
5096        /* The case of a zero minimum is special because of the need to stick        /* The case of a zero minimum is special because of the need to stick
5097        OP_BRAZERO in front of it, and because the group appears once in the        OP_BRAZERO in front of it, and because the group appears once in the
# Line 4635  for (;; ptr++) Line 5127  for (;; ptr++)
5127          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5128            {            {
5129            *code = OP_END;            *code = OP_END;
5130            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf, cd, save_hwm);
5131            memmove(previous+1, previous, len);            memmove(previous + 1, previous, IN_UCHARS(len));
5132            code++;            code++;
5133            if (repeat_max == 0)            if (repeat_max == 0)
5134              {              {
# Line 4659  for (;; ptr++) Line 5151  for (;; ptr++)
5151            {            {
5152            int offset;            int offset;
5153            *code = OP_END;            *code = OP_END;
5154            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);            adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5155            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5156            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
5157            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
5158            *previous++ = OP_BRA;            *previous++ = OP_BRA;
# Line 4706  for (;; ptr++) Line 5198  for (;; ptr++)
5198              *lengthptr += delta;              *lengthptr += delta;
5199              }              }
5200    
5201            /* This is compiling for real */            /* This is compiling for real. If there is a set first byte for
5202              the group, and we have not yet set a "required byte", set it. Make
5203              sure there is enough workspace for copying forward references before
5204              doing the copy. */
5205    
5206            else            else
5207              {              {
5208              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5209    
5210              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5211                {                {
5212                uschar *hc;                pcre_uchar *hc;
5213                uschar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5214                memcpy(code, previous, len);                memcpy(code, previous, IN_UCHARS(len));
5215    
5216                  while (cd->hwm > cd->start_workspace + cd->workspace_size -
5217                         WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5218                    {
5219                    int save_offset = save_hwm - cd->start_workspace;
5220                    int this_offset = this_hwm - cd->start_workspace;
5221                    *errorcodeptr = expand_workspace(cd);
5222                    if (*errorcodeptr != 0) goto FAILED;
5223                    save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5224                    this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5225                    }
5226    
5227                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5228                  {                  {
5229                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 4765  for (;; ptr++) Line 5273  for (;; ptr++)
5273    
5274          else for (i = repeat_max - 1; i >= 0; i--)          else for (i = repeat_max - 1; i >= 0; i--)
5275            {            {
5276            uschar *hc;            pcre_uchar *hc;
5277            uschar *this_hwm = cd->hwm;            pcre_uchar *this_hwm = cd->hwm;
5278    
5279            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
5280    
# Line 4782  for (;; ptr++) Line 5290  for (;; ptr++)
5290              PUTINC(code, 0, offset);              PUTINC(code, 0, offset);
5291              }              }
5292    
5293            memcpy(code, previous, len);            memcpy(code, previous, IN_UCHARS(len));
5294    
5295              /* Ensure there is enough workspace for forward references before
5296              copying them. */
5297    
5298              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5299                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5300                {
5301                int save_offset = save_hwm - cd->start_workspace;
5302                int this_offset = this_hwm - cd->start_workspace;
5303                *errorcodeptr = expand_workspace(cd);
5304                if (*errorcodeptr != 0) goto FAILED;
5305                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5306                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5307                }
5308    
5309            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5310              {              {
5311              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 4799  for (;; ptr++) Line 5322  for (;; ptr++)
5322            {            {
5323            int oldlinkoffset;            int oldlinkoffset;
5324            int offset = (int)(code - bralink + 1);            int offset = (int)(code - bralink + 1);
5325            uschar *bra = code - offset;            pcre_uchar *bra = code - offset;
5326            oldlinkoffset = GET(bra, 1);            oldlinkoffset = GET(bra, 1);
5327            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5328            *code++ = OP_KET;            *code++ = OP_KET;
# Line 4814  for (;; ptr++) Line 5337  for (;; ptr++)
5337        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5338        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
5339    
5340        Otherwise, if the quantifier was possessive, we convert the BRA code to        Otherwise, when we are doing the actual compile phase, check to see
5341        the POS form, and the KET code to KETRPOS. (It turns out to be convenient        whether this group is one that could match an empty string. If so,
5342        at runtime to detect this kind of subpattern at both the start and at the        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5343        end.) The use of special opcodes makes it possible to reduce greatly the        that runtime checking can be done. [This check is also applied to ONCE
5344        stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO,        groups at runtime, but in a different way.]
5345        convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that  
5346        the default action below, of wrapping everything inside atomic brackets,        Then, if the quantifier was possessive and the bracket is not a
5347        does not happen.        conditional, we convert the BRA code to the POS form, and the KET code to
5348          KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5349        Then, when we are doing the actual compile phase, check to see whether        subpattern at both the start and at the end.) The use of special opcodes
5350        this group is one that could match an empty string. If so, convert the        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5351        initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so that runtime        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5352        checking can be done. [This check is also applied to ONCE groups at  
5353        runtime, but in a different way.] */        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5354          flag so that the default action below, of wrapping everything inside
5355          atomic brackets, does not happen. When the minimum is greater than 1,
5356          there will be earlier copies of the group, and so we still have to wrap
5357          the whole thing. */
5358    
5359        else        else
5360          {          {
5361          uschar *ketcode = code - 1 - LINK_SIZE;          pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5362          uschar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5363    
5364          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;          /* Convert possessive ONCE brackets to non-capturing */
5365          if (*bracode == OP_ONCE)  
5366            if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5367                possessive_quantifier) *bracode = OP_BRA;
5368    
5369            /* For non-possessive ONCE brackets, all we need to do is to
5370            set the KET. */
5371    
5372            if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5373            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5374    
5375            /* Handle non-ONCE brackets and possessive ONCEs (which have been
5376            converted to non-capturing above). */
5377    
5378          else          else
5379            {            {
5380            if (possessive_quantifier)            /* In the compile phase, check for empty string matching. */
             {  
             *bracode += 1;                   /* Switch to xxxPOS opcodes */  
             *ketcode = OP_KETRPOS;  
             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;  
             possessive_quantifier = FALSE;  
             }  
           else *ketcode = OP_KETRMAX + repeat_type;  
5381    
5382            if (lengthptr == NULL)            if (lengthptr == NULL)
5383              {              {
5384              uschar *scode = bracode;              pcre_uchar *scode = bracode;
5385              do              do
5386                {                {
5387                if (could_be_empty_branch(scode, ketcode, utf8, cd))                if (could_be_empty_branch(scode, ketcode, utf, cd))
5388                  {                  {
5389                  *bracode += OP_SBRA - OP_BRA;                  *bracode += OP_SBRA - OP_BRA;
5390                  break;                  break;
# Line 4862  for (;; ptr++) Line 5393  for (;; ptr++)
5393                }                }
5394              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5395              }              }
5396    
5397              /* Handle possessive quantifiers. */
5398    
5399              if (possessive_quantifier)
5400                {
5401                /* For COND brackets, we wrap the whole thing in a possessively
5402                repeated non-capturing bracket, because we have not invented POS
5403                versions of the COND opcodes. Because we are moving code along, we
5404                must ensure that any pending recursive references are updated. */
5405    
5406                if (*bracode == OP_COND || *bracode == OP_SCOND)
5407                  {
5408                  int nlen = (int)(code - bracode);
5409                  *code = OP_END;
5410                  adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5411                  memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5412                  code += 1 + LINK_SIZE;
5413                  nlen += 1 + LINK_SIZE;
5414                  *bracode = OP_BRAPOS;
5415                  *code++ = OP_KETRPOS;
5416                  PUTINC(code, 0, nlen);
5417                  PUT(bracode, 1, nlen);
5418                  }
5419    
5420                /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5421    
5422                else
5423                  {
5424                  *bracode += 1;              /* Switch to xxxPOS opcodes */
5425                  *ketcode = OP_KETRPOS;
5426                  }
5427    
5428                /* If the minimum is zero, mark it as possessive, then unset the
5429                possessive flag when the minimum is 0 or 1. */
5430    
5431                if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5432                if (repeat_min < 2) possessive_quantifier = FALSE;
5433                }
5434    
5435              /* Non-possessive quantifier */
5436    
5437              else *ketcode = OP_KETRMAX + repeat_type;
5438            }            }
5439          }          }
5440        }        }
# Line 4886  for (;; ptr++) Line 5459  for (;; ptr++)
5459      there are special alternative opcodes for this case. For anything else, we      there are special alternative opcodes for this case. For anything else, we
5460      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5461      notation is just syntactic sugar, taken from Sun's Java package, but the      notation is just syntactic sugar, taken from Sun's Java package, but the
5462      special opcodes can optimize it.      special opcodes can optimize it.
5463    
5464      Possessively repeated subpatterns have already been handled in the code      Some (but not all) possessively repeated subpatterns have already been
5465      just above, so possessive_quantifier is always FALSE for them at this      completely handled in the code just above. For them, possessive_quantifier
5466      stage.      is always FALSE at this stage.
5467    
5468      Note that the repeated item starts at tempcode, not at previous, which      Note that the repeated item starts at tempcode, not at previous, which
5469      might be the first part of a string whose (former) last char we repeated.      might be the first part of a string whose (former) last char we repeated.
5470    
# Line 4904  for (;; ptr++) Line 5477  for (;; ptr++)
5477        int len;        int len;
5478    
5479        if (*tempcode == OP_TYPEEXACT)        if (*tempcode == OP_TYPEEXACT)
5480          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
5481            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);            ((tempcode[1 + IMM2_SIZE] == OP_PROP
5482              || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5483    
5484        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5485          {          {
5486          tempcode += _pcre_OP_lengths[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
5487  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5488          if (utf8 && tempcode[-1] >= 0xc0)          if (utf && HAS_EXTRALEN(tempcode[-1]))
5489            tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];            tempcode += GET_EXTRALEN(tempcode[-1]);
5490  #endif  #endif
5491          }          }
5492    
# Line 4949  for (;; ptr++) Line 5523  for (;; ptr++)
5523    
5524          default:          default:
5525          *code = OP_END;          *code = OP_END;
5526          adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5527          memmove(tempcode + 1+LINK_SIZE, tempcode, len);          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5528          code += 1 + LINK_SIZE;          code += 1 + LINK_SIZE;
5529          len += 1 + LINK_SIZE;          len += 1 + LINK_SIZE;
5530          tempcode[0] = OP_ONCE;          tempcode[0] = OP_ONCE;
# Line 4962  for (;; ptr++) Line 5536  for (;; ptr++)
5536        }        }
5537    
5538      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
5539      "follows varying string" flag for subsequently encountered reqbytes if      "follows varying string" flag for subsequently encountered reqchars if
5540      it isn't already set and we have just passed a varying length item. */      it isn't already set and we have just passed a varying length item. */
5541    
5542      END_REPEAT:      END_REPEAT:
# Line 4985  for (;; ptr++) Line 5559  for (;; ptr++)
5559    
5560      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
5561    
5562      if (*(++ptr) == CHAR_ASTERISK &&      ptr++;
5563           ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5564             || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5565        {        {
5566        int i, namelen;        int i, namelen;
5567        int arglen = 0;        int arglen = 0;
5568        const char *vn = verbnames;        const char *vn = verbnames;
5569        const uschar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5570        const uschar *arg = NULL;        const pcre_uchar *arg = NULL;
5571        previous = NULL;        previous = NULL;
5572        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        ptr++;
5573          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5574        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5575    
5576          /* It appears that Perl allows any characters whatsoever, other than
5577          a closing parenthesis, to appear in arguments, so we no longer insist on
5578          letters, digits, and underscores. */
5579    
5580        if (*ptr == CHAR_COLON)        if (*ptr == CHAR_COLON)
5581          {          {
5582          arg = ++ptr;          arg = ++ptr;
5583          while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
           || *ptr == '_') ptr++;  
5584          arglen = (int)(ptr - arg);          arglen = (int)(ptr - arg);
5585          }          }
5586    
# Line 5016  for (;; ptr++) Line 5595  for (;; ptr++)
5595        for (i = 0; i < verbcount; i++)        for (i = 0; i < verbcount; i++)
5596          {          {
5597          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
5598              strncmp((char *)name, vn, namelen) == 0)              STRNCMP_UC_C8(name, vn, namelen) == 0)
5599            {            {
5600            /* Check for open captures before ACCEPT and convert it to            /* Check for open captures before ACCEPT and convert it to
5601            ASSERT_ACCEPT if in an assertion. */            ASSERT_ACCEPT if in an assertion. */
5602    
5603            if (verbs[i].op == OP_ACCEPT)            if (verbs[i].op == OP_ACCEPT)
# Line 5028  for (;; ptr++) Line 5607  for (;; ptr++)
5607                {                {
5608                *errorcodeptr = ERR59;                *errorcodeptr = ERR59;
5609                goto FAILED;                goto FAILED;
5610                }                }
5611              cd->had_accept = TRUE;              cd->had_accept = TRUE;
5612              for (oc = cd->open_caps; oc != NULL; oc = oc->next)              for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5613                {                {
# Line 5036  for (;; ptr++) Line 5615  for (;; ptr++)
5615                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
5616                }                }
5617              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5618    
5619                /* Do not set firstchar after *ACCEPT */
5620                if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5621              }              }
5622    
5623            /* Handle other cases with/without an argument */            /* Handle other cases with/without an argument */
# Line 5048  for (;; ptr++) Line 5630  for (;; ptr++)
5630                goto FAILED;                goto FAILED;
5631                }                }
5632              *code = verbs[i].op;              *code = verbs[i].op;
5633              if (*code++ == OP_THEN)              if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
               {  
               PUT(code, 0, code - bcptr->current_branch - 1);  
               code += LINK_SIZE;  
               }  
5634              }              }
5635    
5636            else            else
# Line 5063  for (;; ptr++) Line 5641  for (;; ptr++)
5641                goto FAILED;                goto FAILED;
5642                }                }
5643              *code = verbs[i].op_arg;              *code = verbs[i].op_arg;
5644              if (*code++ == OP_THEN_ARG)              if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
               {  
               PUT(code, 0, code - bcptr->current_branch - 1);  
               code += LINK_SIZE;  
               }  
5645              *code++ = arglen;              *code++ = arglen;
5646              memcpy(code, arg, arglen);              memcpy(code, arg, IN_UCHARS(arglen));
5647              code += arglen;              code += arglen;
5648              *code++ = 0;              *code++ = 0;
5649              }              }
# Line 5092  for (;; ptr++) Line 5666  for (;; ptr++)
5666        {        {
5667        int i, set, unset, namelen;        int i, set, unset, namelen;
5668        int *optset;        int *optset;
5669        const uschar *name;        const pcre_uchar *name;
5670        uschar *slot;        pcre_uchar *slot;
5671    
5672        switch (*(++ptr))        switch (*(++ptr))
5673          {          {
# Line 5146  for (;; ptr++) Line 5720  for (;; ptr++)
5720            break;            break;
5721    
5722          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Most other conditions use OP_CREF (a couple change to OP_RREF
5723          below), and all need to skip 3 bytes at the start of the group. */          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5724    
5725          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
5726          skipbytes = 3;          skipbytes = 1+IMM2_SIZE;
5727          refsign = -1;          refsign = -1;
5728    
5729          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
# Line 5182  for (;; ptr++) Line 5756  for (;; ptr++)
5756    
5757          /* We now expect to read a name; anything else is an error */          /* We now expect to read a name; anything else is an error */
5758    
5759          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5760            {            {
5761            ptr += 1;  /* To get the right offset */            ptr += 1;  /* To get the right offset */
5762            *errorcodeptr = ERR28;            *errorcodeptr = ERR28;
# Line 5193  for (;; ptr++) Line 5767  for (;; ptr++)
5767    
5768          recno = 0;          recno = 0;
5769          name = ++ptr;          name = ++ptr;
5770          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5771            {            {
5772            if (recno >= 0)            if (recno >= 0)
5773              recno = ((digitab[*ptr] & ctype_digit) != 0)?              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
               recno * 10 + *ptr - CHAR_0 : -1;  
5774            ptr++;            ptr++;
5775            }            }
5776          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
# Line 5245  for (;; ptr++) Line 5818  for (;; ptr++)
5818          slot = cd->name_table;          slot = cd->name_table;
5819          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
5820            {            {
5821            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5822            slot += cd->name_entry_size;            slot += cd->name_entry_size;
5823            }            }
5824    
# Line 5261  for (;; ptr++) Line 5834  for (;; ptr++)
5834          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
5835    
5836          else if ((i = find_parens(cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
5837                          (options & PCRE_EXTENDED) != 0, utf8)) > 0)                          (options & PCRE_EXTENDED) != 0, utf)) > 0)
5838            {            {
5839            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
5840            code[1+LINK_SIZE]++;            code[1+LINK_SIZE]++;
# Line 5287  for (;; ptr++) Line 5860  for (;; ptr++)
5860            recno = 0;            recno = 0;
5861            for (i = 1; i < namelen; i++)            for (i = 1; i < namelen; i++)
5862              {              {
5863              if ((digitab[name[i]] & ctype_digit) == 0)              if (!IS_DIGIT(name[i]))
5864                {                {
5865                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
5866                goto FAILED;                goto FAILED;
# Line 5302  for (;; ptr++) Line 5875  for (;; ptr++)
5875          /* Similarly, check for the (?(DEFINE) "condition", which is always          /* Similarly, check for the (?(DEFINE) "condition", which is always
5876          false. */          false. */
5877    
5878          else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)          else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5879            {            {
5880            code[1+LINK_SIZE] = OP_DEF;            code[1+LINK_SIZE] = OP_DEF;
5881            skipbytes = 1;            skipbytes = 1;
# Line 5329  for (;; ptr++) Line 5902  for (;; ptr++)
5902          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5903          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5904          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
5905          cd->assert_depth += 1;          cd->assert_depth += 1;
5906          ptr++;          ptr++;
5907          break;          break;
5908    
# Line 5344  for (;; ptr++) Line 5917  for (;; ptr++)
5917            continue;            continue;
5918            }            }
5919          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
5920          cd->assert_depth += 1;          cd->assert_depth += 1;
5921          break;          break;
5922    
5923    
# Line 5354  for (;; ptr++) Line 5927  for (;; ptr++)
5927            {            {
5928            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5929            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
5930            cd->assert_depth += 1;            cd->assert_depth += 1;
5931            ptr += 2;            ptr += 2;
5932            break;            break;
5933    
5934            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5935            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
5936            cd->assert_depth += 1;            cd->assert_depth += 1;
5937            ptr += 2;            ptr += 2;
5938            break;            break;
5939    
5940            default:                /* Could be name define, else bad */            default:                /* Could be name define, else bad */
5941            if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;            if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5942                goto DEFINE_NAME;
5943            ptr++;                  /* Correct offset for error */            ptr++;                  /* Correct offset for error */
5944            *errorcodeptr = ERR24;            *errorcodeptr = ERR24;
5945            goto FAILED;            goto FAILED;
# Line 5382  for (;; ptr++) Line 5956  for (;; ptr++)
5956    
5957          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5958          case CHAR_C:                 /* Callout - may be followed by digits; */          case CHAR_C:                 /* Callout - may be followed by digits; */
5959          previous_callout = code;  /* Save for later completion */          previous_callout = code;     /* Save for later completion */
5960          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1;    /* Skip one item before completing */
5961          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
5962            {            {
5963            int n = 0;            int n = 0;
5964            while ((digitab[*(++ptr)] & ctype_digit) != 0)            ptr++;
5965              n = n * 10 + *ptr - CHAR_0;            while(IS_DIGIT(*ptr))
5966                n = n * 10 + *ptr++ - CHAR_0;
5967            if (*ptr != CHAR_RIGHT_PARENTHESIS)            if (*ptr != CHAR_RIGHT_PARENTHESIS)
5968              {              {
5969              *errorcodeptr = ERR39;              *errorcodeptr = ERR39;
# Line 5433  for (;; ptr++) Line 6008  for (;; ptr++)
6008              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6009            name = ++ptr;            name = ++ptr;
6010    
6011            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6012            namelen = (int)(ptr - name);            namelen = (int)(ptr - name);
6013    
6014            /* In the pre-compile phase, just do a syntax check. */            /* In the pre-compile phase, just do a syntax check. */
# Line 5450  for (;; ptr++) Line 6025  for (;; ptr++)
6025                *errorcodeptr = ERR49;                *errorcodeptr = ERR49;
6026                goto FAILED;                goto FAILED;
6027                }                }
6028              if (namelen + 3 > cd->name_entry_size)              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6029                {                {
6030                cd->name_entry_size = namelen + 3;                cd->name_entry_size = namelen + IMM2_SIZE + 1;
6031                if (namelen > MAX_NAME_SIZE)                if (namelen > MAX_NAME_SIZE)
6032                  {                  {
6033                  *errorcodeptr = ERR48;                  *errorcodeptr = ERR48;
# Line 5481  for (;; ptr++) Line 6056  for (;; ptr++)
6056    
6057              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
6058                {                {
6059                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
6060                if (crc == 0)                if (crc == 0)
6061                  {                  {
6062                  if (slot[2+namelen] == 0)                  if (slot[IMM2_SIZE+namelen] == 0)
6063                    {                    {
6064                    if (GET2(slot, 0) != cd->bracount + 1 &&                    if (GET2(slot, 0) != cd->bracount + 1 &&
6065                        (options & PCRE_DUPNAMES) == 0)                        (options & PCRE_DUPNAMES) == 0)
# Line 5505  for (;; ptr++) Line 6080  for (;; ptr++)
6080                if (crc < 0) &n