/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 773 by ph10, Wed Nov 30 18:10:27 2011 UTC | revision 982 by ph10, Wed Jun 20 15:15:27 2012 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2011 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
# Line 88  so this number is very generous. Line 92  so this number is very generous.
92  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
93  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
94  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95  is 4 there is plenty of room for most patterns. However, the memory can get  is 4 there is plenty of room for most patterns. However, the memory can get
96  filled up by repetitions of forward references, for example patterns like  filled up by repetitions of forward references, for example patterns like
97  /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so  /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98  that the workspace is expanded using malloc() in this situation. The value  that the workspace is expanded using malloc() in this situation. The value
99  below is therefore a minimum, and we put a maximum on it for safety. The  below is therefore a minimum, and we put a maximum on it for safety. The
100  minimum is now also defined in terms of LINK_SIZE so that the use of malloc()  minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101  kicks in at the same number of forward references in all cases. */  kicks in at the same number of forward references in all cases. */
102    
103  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
# Line 104  overrun before it actually does run off Line 108  overrun before it actually does run off
108    
109  #define WORK_SIZE_SAFETY_MARGIN (100)  #define WORK_SIZE_SAFETY_MARGIN (100)
110    
111    /* Private flags added to firstchar and reqchar. */
112    
113    #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
114    #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
115    
116    /* Repeated character flags. */
117    
118    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
119    
120  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
121  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 238  static const char posix_names[] = Line 250  static const char posix_names[] =
250    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
251    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
252    
253  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
254    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
255    
256  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 273  substitutes must be in the order of the Line 285  substitutes must be in the order of the
285  both positive and negative cases. NULL means no substitute. */  both positive and negative cases. NULL means no substitute. */
286    
287  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
288  static const uschar *substitutes[] = {  static const pcre_uchar string_PNd[]  = {
289    (uschar *)"\\P{Nd}",    /* \D */    CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
290    (uschar *)"\\p{Nd}",    /* \d */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
291    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */  static const pcre_uchar string_pNd[]  = {
292    (uschar *)"\\p{Xsp}",   /* \s */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
293    (uschar *)"\\P{Xwd}",   /* \W */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294    (uschar *)"\\p{Xwd}"    /* \w */  static const pcre_uchar string_PXsp[] = {
295      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
296      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297    static const pcre_uchar string_pXsp[] = {
298      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300    static const pcre_uchar string_PXwd[] = {
301      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
302      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303    static const pcre_uchar string_pXwd[] = {
304      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    
307    static const pcre_uchar *substitutes[] = {
308      string_PNd,           /* \D */
309      string_pNd,           /* \d */
310      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
311      string_pXsp,          /* \s */
312      string_PXwd,          /* \W */
313      string_pXwd           /* \w */
314  };  };
315    
316  static const uschar *posix_substitutes[] = {  static const pcre_uchar string_pL[] =   {
317    (uschar *)"\\p{L}",     /* alpha */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
318    (uschar *)"\\p{Ll}",    /* lower */    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319    (uschar *)"\\p{Lu}",    /* upper */  static const pcre_uchar string_pLl[] =  {
320    (uschar *)"\\p{Xan}",   /* alnum */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321    NULL,                   /* ascii */    CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322    (uschar *)"\\h",        /* blank */  static const pcre_uchar string_pLu[] =  {
323    NULL,                   /* cntrl */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324    (uschar *)"\\p{Nd}",    /* digit */    CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325    NULL,                   /* graph */  static const pcre_uchar string_pXan[] = {
326    NULL,                   /* print */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327    NULL,                   /* punct */    CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */  static const pcre_uchar string_h[] =    {
329    (uschar *)"\\p{Xwd}",   /* word */    CHAR_BACKSLASH, CHAR_h, '\0' };
330    NULL,                   /* xdigit */  static const pcre_uchar string_pXps[] = {
331      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333    static const pcre_uchar string_PL[] =   {
334      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
335      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336    static const pcre_uchar string_PLl[] =  {
337      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339    static const pcre_uchar string_PLu[] =  {
340      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342    static const pcre_uchar string_PXan[] = {
343      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345    static const pcre_uchar string_H[] =    {
346      CHAR_BACKSLASH, CHAR_H, '\0' };
347    static const pcre_uchar string_PXps[] = {
348      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
349      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350    
351    static const pcre_uchar *posix_substitutes[] = {
352      string_pL,            /* alpha */
353      string_pLl,           /* lower */
354      string_pLu,           /* upper */
355      string_pXan,          /* alnum */
356      NULL,                 /* ascii */
357      string_h,             /* blank */
358      NULL,                 /* cntrl */
359      string_pNd,           /* digit */
360      NULL,                 /* graph */
361      NULL,                 /* print */
362      NULL,                 /* punct */
363      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
364      string_pXwd,          /* word */
365      NULL,                 /* xdigit */
366    /* Negated cases */    /* Negated cases */
367    (uschar *)"\\P{L}",     /* ^alpha */    string_PL,            /* ^alpha */
368    (uschar *)"\\P{Ll}",    /* ^lower */    string_PLl,           /* ^lower */
369    (uschar *)"\\P{Lu}",    /* ^upper */    string_PLu,           /* ^upper */
370    (uschar *)"\\P{Xan}",   /* ^alnum */    string_PXan,          /* ^alnum */
371    NULL,                   /* ^ascii */    NULL,                 /* ^ascii */
372    (uschar *)"\\H",        /* ^blank */    string_H,             /* ^blank */
373    NULL,                   /* ^cntrl */    NULL,                 /* ^cntrl */
374    (uschar *)"\\P{Nd}",    /* ^digit */    string_PNd,           /* ^digit */
375    NULL,                   /* ^graph */    NULL,                 /* ^graph */
376    NULL,                   /* ^print */    NULL,                 /* ^print */
377    NULL,                   /* ^punct */    NULL,                 /* ^punct */
378    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
379    (uschar *)"\\P{Xwd}",   /* ^word */    string_PXwd,          /* ^word */
380    NULL                    /* ^xdigit */    NULL                  /* ^xdigit */
381  };  };
382  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
383  #endif  #endif
384    
385  #define STRING(a)  # a  #define STRING(a)  # a
# Line 372  static const char error_texts[] = Line 438  static const char error_texts[] =
438    /* 30 */    /* 30 */
439    "unknown POSIX class name\0"    "unknown POSIX class name\0"
440    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
441    "this version of PCRE is not compiled with PCRE_UTF8 support\0"    "this version of PCRE is compiled without UTF support\0"
442    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
443    "character value in \\x{...} sequence is too large\0"    "character value in \\x{...} sequence is too large\0"
444    /* 35 */    /* 35 */
# Line 395  static const char error_texts[] = Line 461  static const char error_texts[] =
461    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
462    /* 50 */    /* 50 */
463    "repeated subpattern is too long\0"    /** DEAD **/    "repeated subpattern is too long\0"    /** DEAD **/
464    "octal value is greater than \\377 (not in UTF-8 mode)\0"    "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
465    "internal error: overran compiling workspace\0"    "internal error: overran compiling workspace\0"
466    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
467    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
# Line 414  static const char error_texts[] = Line 480  static const char error_texts[] =
480    /* 65 */    /* 65 */
481    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
482    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
483    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with Unicode property support\0"
484    "\\c must be followed by an ASCII character\0"    "\\c must be followed by an ASCII character\0"
485    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486    /* 70 */    /* 70 */
487    "internal error: unknown opcode in find_fixedlength()\0"    "internal error: unknown opcode in find_fixedlength()\0"
488    "\\N is not supported in a class\0"    "\\N is not supported in a class\0"
489    "too many forward references\0"    "too many forward references\0"
490      "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
491      "invalid UTF-16 string\0"
492      /* 75 */
493      "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
494      "character value in \\u.... sequence is too large\0"
495    ;    ;
496    
497  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 439  For convenience, we use the same bit def Line 510  For convenience, we use the same bit def
510    
511  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
512    
513    /* Using a simple comparison for decimal numbers rather than a memory read
514    is much faster, and the resulting code is simpler (the compiler turns it
515    into a subtraction and unsigned comparison). */
516    
517    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
518    
519  #ifndef EBCDIC  #ifndef EBCDIC
520    
521  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
522  UTF-8 mode. */  UTF-8 mode. */
523    
524  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
525    {    {
526    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
527    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 483  static const unsigned char digitab[] = Line 560  static const unsigned char digitab[] =
560    
561  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
562    
563  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
564    {    {
565    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
566    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 518  static const unsigned char digitab[] = Line 595  static const unsigned char digitab[] =
595    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
596    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
597    
598  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
599    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
600    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
601    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 557  static const unsigned char ebcdic_charta Line 634  static const unsigned char ebcdic_charta
634  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
635    
636  static BOOL  static BOOL
637    compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
638      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
639    
640    
# Line 592  return s; Line 669  return s;
669  *           Expand the workspace                 *  *           Expand the workspace                 *
670  *************************************************/  *************************************************/
671    
672  /* This function is called during the second compiling phase, if the number of  /* This function is called during the second compiling phase, if the number of
673  forward references fills the existing workspace, which is originally a block on  forward references fills the existing workspace, which is originally a block on
674  the stack. A larger block is obtained from malloc() unless the ultimate limit  the stack. A larger block is obtained from malloc() unless the ultimate limit
675  has been reached or the increase will be rather small.  has been reached or the increase will be rather small.
676    
677  Argument: pointer to the compile data block  Argument: pointer to the compile data block
# Line 604  Returns:  0 if all went well, else an er Line 681  Returns:  0 if all went well, else an er
681  static int  static int
682  expand_workspace(compile_data *cd)  expand_workspace(compile_data *cd)
683  {  {
684  uschar *newspace;  pcre_uchar *newspace;
685  int newsize = cd->workspace_size * 2;  int newsize = cd->workspace_size * 2;
686    
687  if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;  if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
# Line 612  if (cd->workspace_size >= COMPILE_WORK_S Line 689  if (cd->workspace_size >= COMPILE_WORK_S
689      newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)      newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
690   return ERR72;   return ERR72;
691    
692  newspace = (pcre_malloc)(newsize);  newspace = (PUBL(malloc))(IN_UCHARS(newsize));
693  if (newspace == NULL) return ERR21;  if (newspace == NULL) return ERR21;
694    memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
695  memcpy(newspace, cd->start_workspace, cd->workspace_size);  cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
696  cd->hwm = (uschar *)newspace + (cd->hwm - cd->start_workspace);  if (cd->workspace_size > COMPILE_WORK_SIZE)
697  if (cd->workspace_size > COMPILE_WORK_SIZE)    (PUBL(free))((void *)cd->start_workspace);
   (pcre_free)((void *)cd->start_workspace);  
698  cd->start_workspace = newspace;  cd->start_workspace = newspace;
699  cd->workspace_size = newsize;  cd->workspace_size = newsize;
700  return 0;  return 0;
# Line 642  Returns:    TRUE or FALSE Line 718  Returns:    TRUE or FALSE
718  */  */
719    
720  static BOOL  static BOOL
721  is_counted_repeat(const uschar *p)  is_counted_repeat(const pcre_uchar *p)
722  {  {
723  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
724  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
725    while (IS_DIGIT(*p)) p++;
726  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
727    
728  if (*p++ != CHAR_COMMA) return FALSE;  if (*p++ != CHAR_COMMA) return FALSE;
729  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
730    
731  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
732  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
733    while (IS_DIGIT(*p)) p++;
734    
735  return (*p == CHAR_RIGHT_CURLY_BRACKET);  return (*p == CHAR_RIGHT_CURLY_BRACKET);
736  }  }
# Line 684  Returns:         zero or positive => a d Line 762  Returns:         zero or positive => a d
762  */  */
763    
764  static int  static int
765  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
766    int options, BOOL isclass)    int options, BOOL isclass)
767  {  {
768  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
769  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
770  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
771    pcre_int32 c;
772    int i;
773    
774  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
775  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
# Line 703  in a table. A non-zero result is somethi Line 783  in a table. A non-zero result is somethi
783  Otherwise further processing may be required. */  Otherwise further processing may be required. */
784    
785  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
786  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
787    else if (c < CHAR_0 || c > CHAR_z) {}
788  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
789    
790  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
791  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
792    else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
793  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
794  #endif  #endif
795    
# Line 715  else if ((i = escapes[c - 0x48]) != 0) Line 797  else if ((i = escapes[c - 0x48]) != 0)
797    
798  else  else
799    {    {
800    const uschar *oldptr;    const pcre_uchar *oldptr;
801    BOOL braced, negated;    BOOL braced, negated;
802    
803    switch (c)    switch (c)
# Line 733  else Line 815  else
815        {        {
816        /* In JavaScript, \u must be followed by four hexadecimal numbers.        /* In JavaScript, \u must be followed by four hexadecimal numbers.
817        Otherwise it is a lowercase u letter. */        Otherwise it is a lowercase u letter. */
818        if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
819             && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
820            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
821            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
822          {          {
823          c = 0;          c = 0;
824          for (i = 0; i < 4; ++i)          for (i = 0; i < 4; ++i)
# Line 748  else Line 832  else
832            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
833  #endif  #endif
834            }            }
835    
836    #ifdef COMPILE_PCRE8
837            if (c > (utf ? 0x10ffff : 0xff))
838    #else
839    #ifdef COMPILE_PCRE16
840            if (c > (utf ? 0x10ffff : 0xffff))
841    #endif
842    #endif
843              {
844              *errorcodeptr = ERR76;
845              }
846            else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
847          }          }
848        }        }
849      else      else
# Line 788  else Line 884  else
884    
885      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
886        {        {
887        const uschar *p;        const pcre_uchar *p;
888        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
889          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
890        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
891          {          {
892          c = -ESC_k;          c = -ESC_k;
# Line 808  else Line 904  else
904        }        }
905      else negated = FALSE;      else negated = FALSE;
906    
907        /* The integer range is limited by the machine's int representation. */
908      c = 0;      c = 0;
909      while ((digitab[ptr[1]] & ctype_digit) != 0)      while (IS_DIGIT(ptr[1]))
910          {
911          if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
912            {
913            c = -1;
914            break;
915            }
916        c = c * 10 + *(++ptr) - CHAR_0;        c = c * 10 + *(++ptr) - CHAR_0;
917          }
918      if (c < 0)   /* Integer overflow */      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
919        {        {
920          while (IS_DIGIT(ptr[1]))
921            ptr++;
922        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
923        break;        break;
924        }        }
# Line 861  else Line 966  else
966      if (!isclass)      if (!isclass)
967        {        {
968        oldptr = ptr;        oldptr = ptr;
969          /* The integer range is limited by the machine's int representation. */
970        c -= CHAR_0;        c -= CHAR_0;
971        while ((digitab[ptr[1]] & ctype_digit) != 0)        while (IS_DIGIT(ptr[1]))
972            {
973            if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
974              {
975              c = -1;
976              break;
977              }
978          c = c * 10 + *(++ptr) - CHAR_0;          c = c * 10 + *(++ptr) - CHAR_0;
979        if (c < 0)    /* Integer overflow */          }
980          if (((unsigned int)c) > INT_MAX) /* Integer overflow */
981          {          {
982            while (IS_DIGIT(ptr[1]))
983              ptr++;
984          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
985          break;          break;
986          }          }
# Line 891  else Line 1006  else
1006      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1007      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
1008      significant 8 bits of octal numbers (I think this is what early Perls used      significant 8 bits of octal numbers (I think this is what early Perls used
1009      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1010      than 3 octal digits. */      but no more than 3 octal digits. */
1011    
1012      case CHAR_0:      case CHAR_0:
1013      c -= CHAR_0;      c -= CHAR_0;
1014      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1015          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
1016      if (!utf8 && c > 255) *errorcodeptr = ERR51;  #ifdef COMPILE_PCRE8
1017        if (!utf && c > 0xff) *errorcodeptr = ERR51;
1018    #endif
1019      break;      break;
1020    
1021      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
1022      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1023      treated as a data character. */      If not, { is treated as a data character. */
1024    
1025      case CHAR_x:      case CHAR_x:
1026      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1027        {        {
1028        /* In JavaScript, \x must be followed by two hexadecimal numbers.        /* In JavaScript, \x must be followed by two hexadecimal numbers.
1029        Otherwise it is a lowercase x letter. */        Otherwise it is a lowercase x letter. */
1030        if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1031            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1032          {          {
1033          c = 0;          c = 0;
1034          for (i = 0; i < 2; ++i)          for (i = 0; i < 2; ++i)
# Line 930  else Line 1048  else
1048    
1049      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1050        {        {
1051        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
       int count = 0;  
1052    
1053        c = 0;        c = 0;
1054        while ((digitab[*pt] & ctype_xdigit) != 0)        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1055          {          {
1056          register int cc = *pt++;          register int cc = *pt++;
1057          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
         count++;  
1058    
1059  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1060          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
# Line 947  else Line 1063  else
1063          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1064          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1065  #endif  #endif
1066    
1067    #ifdef COMPILE_PCRE8
1068            if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1069    #else
1070    #ifdef COMPILE_PCRE16
1071            if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1072    #endif
1073    #endif
1074            }
1075    
1076          if (c < 0)
1077            {
1078            while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1079            *errorcodeptr = ERR34;
1080          }          }
1081    
1082        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1083          {          {
1084          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1085          ptr = pt;          ptr = pt;
1086          break;          break;
1087          }          }
# Line 963  else Line 1093  else
1093      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1094    
1095      c = 0;      c = 0;
1096      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1097        {        {
1098        int cc;                                  /* Some compilers don't like */        int cc;                                  /* Some compilers don't like */
1099        cc = *(++ptr);                           /* ++ in initializers */        cc = *(++ptr);                           /* ++ in initializers */
# Line 1061  Returns:         type value from ucp_typ Line 1191  Returns:         type value from ucp_typ
1191  */  */
1192    
1193  static int  static int
1194  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1195  {  {
1196  int c, i, bot, top;  int c, i, bot, top;
1197  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1198  char name[32];  pcre_uchar name[32];
1199    
1200  c = *(++ptr);  c = *(++ptr);
1201  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 1082  if (c == CHAR_LEFT_CURLY_BRACKET) Line 1212  if (c == CHAR_LEFT_CURLY_BRACKET)
1212      *negptr = TRUE;      *negptr = TRUE;
1213      ptr++;      ptr++;
1214      }      }
1215    for (i = 0; i < (int)sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1216      {      {
1217      c = *(++ptr);      c = *(++ptr);
1218      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 1106  else Line 1236  else
1236  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1237    
1238  bot = 0;  bot = 0;
1239  top = _pcre_utt_size;  top = PRIV(utt_size);
1240    
1241  while (bot < top)  while (bot < top)
1242    {    {
1243    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1244    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1245    if (c == 0)    if (c == 0)
1246      {      {
1247      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1248      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1249      }      }
1250    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
1251    }    }
# Line 1153  Returns:         pointer to '}' on succe Line 1283  Returns:         pointer to '}' on succe
1283                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1284  */  */
1285    
1286  static const uschar *  static const pcre_uchar *
1287  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1288  {  {
1289  int min = 0;  int min = 0;
1290  int max = -1;  int max = -1;
# Line 1162  int max = -1; Line 1292  int max = -1;
1292  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1293  an integer overflow. */  an integer overflow. */
1294    
1295  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1296  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1297    {    {
1298    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 1177  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1307  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1307    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1308      {      {
1309      max = 0;      max = 0;
1310      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1311      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1312        {        {
1313        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 1232  Arguments: Line 1362  Arguments:
1362    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1363    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1364    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1365    utf8         TRUE if we are in UTF-8 mode    utf          TRUE if we are in UTF-8 / UTF-16 mode
1366    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1367    
1368  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1369  */  */
1370    
1371  static int  static int
1372  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1373    BOOL xmode, BOOL utf8, int *count)    BOOL xmode, BOOL utf, int *count)
1374  {  {
1375  uschar *ptr = *ptrptr;  pcre_uchar *ptr = *ptrptr;
1376  int start_count = *count;  int start_count = *count;
1377  int hwm_count = start_count;  int hwm_count = start_count;
1378  BOOL dup_parens = FALSE;  BOOL dup_parens = FALSE;
# Line 1309  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1439  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1439          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1440        {        {
1441        int term;        int term;
1442        const uschar *thisname;        const pcre_uchar *thisname;
1443        *count += 1;        *count += 1;
1444        if (name == NULL && *count == lorn) return *count;        if (name == NULL && *count == lorn) return *count;
1445        term = *ptr++;        term = *ptr++;
# Line 1317  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1447  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1447        thisname = ptr;        thisname = ptr;
1448        while (*ptr != term) ptr++;        while (*ptr != term) ptr++;
1449        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == ptr - thisname &&
1450            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            STRNCMP_UC_UC(name, thisname, lorn) == 0)
1451          return *count;          return *count;
1452        term++;        term++;
1453        }        }
# Line 1360  for (; ptr < cd->end_pattern; ptr++) Line 1490  for (; ptr < cd->end_pattern; ptr++)
1490          {          {
1491          if (ptr[2] == CHAR_E)          if (ptr[2] == CHAR_E)
1492            ptr+= 2;            ptr+= 2;
1493          else if (strncmp((const char *)ptr+2,          else if (STRNCMP_UC_C8(ptr + 2,
1494                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1495            ptr += 4;            ptr += 4;
1496          else          else
# Line 1408  for (; ptr < cd->end_pattern; ptr++) Line 1538  for (; ptr < cd->end_pattern; ptr++)
1538        {        {
1539        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1540        ptr++;        ptr++;
1541  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1542        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;        if (utf) FORWARDCHAR(ptr);
1543  #endif  #endif
1544        }        }
1545      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
# Line 1420  for (; ptr < cd->end_pattern; ptr++) Line 1550  for (; ptr < cd->end_pattern; ptr++)
1550    
1551    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1552      {      {
1553      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1554      if (rc > 0) return rc;      if (rc > 0) return rc;
1555      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1556      }      }
# Line 1466  Arguments: Line 1596  Arguments:
1596    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1597    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1598    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1599    utf8         TRUE if we are in UTF-8 mode    utf          TRUE if we are in UTF-8 / UTF-16 mode
1600    
1601  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1602  */  */
1603    
1604  static int  static int
1605  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,  find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1606    BOOL utf8)    BOOL utf)
1607  {  {
1608  uschar *ptr = (uschar *)cd->start_pattern;  pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1609  int count = 0;  int count = 0;
1610  int rc;  int rc;
1611    
# Line 1486  matching closing parens. That is why we Line 1616  matching closing parens. That is why we
1616    
1617  for (;;)  for (;;)
1618    {    {
1619    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1620    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1621    }    }
1622    
# Line 1513  Arguments: Line 1643  Arguments:
1643  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1644  */  */
1645    
1646  static const uschar*  static const pcre_uchar*
1647  first_significant_code(const uschar *code, BOOL skipassert)  first_significant_code(const pcre_uchar *code, BOOL skipassert)
1648  {  {
1649  for (;;)  for (;;)
1650    {    {
# Line 1525  for (;;) Line 1655  for (;;)
1655      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1656      if (!skipassert) return code;      if (!skipassert) return code;
1657      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1658      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1659      break;      break;
1660    
1661      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1539  for (;;) Line 1669  for (;;)
1669      case OP_RREF:      case OP_RREF:
1670      case OP_NRREF:      case OP_NRREF:
1671      case OP_DEF:      case OP_DEF:
1672      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1673      break;      break;
1674    
1675      default:      default:
# Line 1569  and doing the check at the end; a flag s Line 1699  and doing the check at the end; a flag s
1699    
1700  Arguments:  Arguments:
1701    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1702    utf8     TRUE in UTF-8 mode    utf      TRUE in UTF-8 / UTF-16 mode
1703    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1704    cd       the "compile data" structure    cd       the "compile data" structure
1705    
# Line 1581  Returns:   the fixed length, Line 1711  Returns:   the fixed length,
1711  */  */
1712    
1713  static int  static int
1714  find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1715  {  {
1716  int length = -1;  int length = -1;
1717    
1718  register int branchlength = 0;  register int branchlength = 0;
1719  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1720    
1721  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1722  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1594  branch, check the length against that of Line 1724  branch, check the length against that of
1724  for (;;)  for (;;)
1725    {    {
1726    int d;    int d;
1727    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1728    register int op = *cc;    register int op = *cc;
1729    
1730    switch (op)    switch (op)
1731      {      {
1732      /* We only need to continue for OP_CBRA (normal capturing bracket) and      /* We only need to continue for OP_CBRA (normal capturing bracket) and
# Line 1608  for (;;) Line 1739  for (;;)
1739      case OP_ONCE:      case OP_ONCE:
1740      case OP_ONCE_NC:      case OP_ONCE_NC:
1741      case OP_COND:      case OP_COND:
1742      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1743      if (d < 0) return d;      if (d < 0) return d;
1744      branchlength += d;      branchlength += d;
1745      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1639  for (;;) Line 1770  for (;;)
1770    
1771      case OP_RECURSE:      case OP_RECURSE:
1772      if (!atend) return -3;      if (!atend) return -3;
1773      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1774      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1775      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1776      d = find_fixedlength(cs + 2, utf8, atend, cd);      d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1777      if (d < 0) return d;      if (d < 0) return d;
1778      branchlength += d;      branchlength += d;
1779      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1655  for (;;) Line 1786  for (;;)
1786      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1787      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1788      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1789      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1790        break;
1791    
1792      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1793    
# Line 1663  for (;;) Line 1795  for (;;)
1795      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
1796      case OP_SKIP_ARG:      case OP_SKIP_ARG:
1797      case OP_THEN_ARG:      case OP_THEN_ARG:
1798      cc += cc[1] + _pcre_OP_lengths[*cc];      cc += cc[1] + PRIV(OP_lengths)[*cc];
1799      break;      break;
1800    
1801      case OP_CALLOUT:      case OP_CALLOUT:
# Line 1690  for (;;) Line 1822  for (;;)
1822      case OP_SOM:      case OP_SOM:
1823      case OP_THEN:      case OP_THEN:
1824      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1825      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1826      break;      break;
1827    
1828      /* Handle literal characters */      /* Handle literal characters */
# Line 1701  for (;;) Line 1833  for (;;)
1833      case OP_NOTI:      case OP_NOTI:
1834      branchlength++;      branchlength++;
1835      cc += 2;      cc += 2;
1836  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1837      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1838  #endif  #endif
1839      break;      break;
1840    
# Line 1714  for (;;) Line 1846  for (;;)
1846      case OP_NOTEXACT:      case OP_NOTEXACT:
1847      case OP_NOTEXACTI:      case OP_NOTEXACTI:
1848      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1849      cc += 4;      cc += 2 + IMM2_SIZE;
1850  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1851      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1852  #endif  #endif
1853      break;      break;
1854    
1855      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1856      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1857      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1858      cc += 4;      cc += 1 + IMM2_SIZE + 1;
1859      break;      break;
1860    
1861      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1749  for (;;) Line 1881  for (;;)
1881      cc++;      cc++;
1882      break;      break;
1883    
1884      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1885      otherwise \C is coded as OP_ALLANY. */      otherwise \C is coded as OP_ALLANY. */
1886    
1887      case OP_ANYBYTE:      case OP_ANYBYTE:
# Line 1757  for (;;) Line 1889  for (;;)
1889    
1890      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1891    
1892  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1893      case OP_XCLASS:      case OP_XCLASS:
1894      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1895      /* Fall through */      /* Fall through */
1896  #endif  #endif
1897    
1898      case OP_CLASS:      case OP_CLASS:
1899      case OP_NCLASS:      case OP_NCLASS:
1900      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1901    
1902      switch (*cc)      switch (*cc)
1903        {        {
# Line 1779  for (;;) Line 1911  for (;;)
1911    
1912        case OP_CRRANGE:        case OP_CRRANGE:
1913        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1914        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1915        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
1916        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1917        break;        break;
1918    
1919        default:        default:
# Line 1896  length. Line 2028  length.
2028    
2029  Arguments:  Arguments:
2030    code        points to start of expression    code        points to start of expression
2031    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2032    number      the required bracket number or negative to find a lookbehind    number      the required bracket number or negative to find a lookbehind
2033    
2034  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2035  */  */
2036    
2037  const uschar *  const pcre_uchar *
2038  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2039  {  {
2040  for (;;)  for (;;)
2041    {    {
# Line 1921  for (;;) Line 2053  for (;;)
2053    
2054    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
2055      {      {
2056      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
2057      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2058      }      }
2059    
2060    /* Handle capturing bracket */    /* Handle capturing bracket */
# Line 1931  for (;;) Line 2063  for (;;)
2063             c == OP_CBRAPOS || c == OP_SCBRAPOS)             c == OP_CBRAPOS || c == OP_SCBRAPOS)
2064      {      {
2065      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
2066      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2067      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2068      }      }
2069    
2070    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
# Line 1960  for (;;) Line 2092  for (;;)
2092        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2093        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2094        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2095        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2096            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2097        break;        break;
2098    
2099        case OP_MARK:        case OP_MARK:
# Line 1976  for (;;) Line 2109  for (;;)
2109    
2110      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2111    
2112      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2113    
2114    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2115    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2116    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2117    
2118  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2119      if (utf8) switch(c)      if (utf) switch(c)
2120        {        {
2121        case OP_CHAR:        case OP_CHAR:
2122        case OP_CHARI:        case OP_CHARI:
# Line 2013  for (;;) Line 2146  for (;;)
2146        case OP_MINQUERYI:        case OP_MINQUERYI:
2147        case OP_POSQUERY:        case OP_POSQUERY:
2148        case OP_POSQUERYI:        case OP_POSQUERYI:
2149        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2150        break;        break;
2151        }        }
2152  #else  #else
2153      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2154  #endif  #endif
2155      }      }
2156    }    }
# Line 2034  instance of OP_RECURSE. Line 2167  instance of OP_RECURSE.
2167    
2168  Arguments:  Arguments:
2169    code        points to start of expression    code        points to start of expression
2170    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2171    
2172  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2173  */  */
2174    
2175  static const uschar *  static const pcre_uchar *
2176  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2177  {  {
2178  for (;;)  for (;;)
2179    {    {
# Line 2079  for (;;) Line 2212  for (;;)
2212        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2213        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2214        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2215        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2216            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2217        break;        break;
2218    
2219        case OP_MARK:        case OP_MARK:
# Line 2095  for (;;) Line 2229  for (;;)
2229    
2230      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2231    
2232      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2233    
2234      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2235      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
2236      to arrange to skip the extra bytes. */      to arrange to skip the extra bytes. */
2237    
2238  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2239      if (utf8) switch(c)      if (utf) switch(c)
2240        {        {
2241        case OP_CHAR:        case OP_CHAR:
2242        case OP_CHARI:        case OP_CHARI:
2243          case OP_NOT:
2244          case OP_NOTI:
2245        case OP_EXACT:        case OP_EXACT:
2246        case OP_EXACTI:        case OP_EXACTI:
2247          case OP_NOTEXACT:
2248          case OP_NOTEXACTI:
2249        case OP_UPTO:        case OP_UPTO:
2250        case OP_UPTOI:        case OP_UPTOI:
2251          case OP_NOTUPTO:
2252          case OP_NOTUPTOI:
2253        case OP_MINUPTO:        case OP_MINUPTO:
2254        case OP_MINUPTOI:        case OP_MINUPTOI:
2255          case OP_NOTMINUPTO:
2256          case OP_NOTMINUPTOI:
2257        case OP_POSUPTO:        case OP_POSUPTO:
2258        case OP_POSUPTOI:        case OP_POSUPTOI:
2259          case OP_NOTPOSUPTO:
2260          case OP_NOTPOSUPTOI:
2261        case OP_STAR:        case OP_STAR:
2262        case OP_STARI:        case OP_STARI:
2263          case OP_NOTSTAR:
2264          case OP_NOTSTARI:
2265        case OP_MINSTAR:        case OP_MINSTAR:
2266        case OP_MINSTARI:        case OP_MINSTARI:
2267          case OP_NOTMINSTAR:
2268          case OP_NOTMINSTARI:
2269        case OP_POSSTAR:        case OP_POSSTAR:
2270        case OP_POSSTARI:        case OP_POSSTARI:
2271          case OP_NOTPOSSTAR:
2272          case OP_NOTPOSSTARI:
2273        case OP_PLUS:        case OP_PLUS:
2274        case OP_PLUSI:        case OP_PLUSI:
2275          case OP_NOTPLUS:
2276          case OP_NOTPLUSI:
2277        case OP_MINPLUS:        case OP_MINPLUS:
2278        case OP_MINPLUSI:        case OP_MINPLUSI:
2279          case OP_NOTMINPLUS:
2280          case OP_NOTMINPLUSI:
2281        case OP_POSPLUS:        case OP_POSPLUS:
2282        case OP_POSPLUSI:        case OP_POSPLUSI:
2283          case OP_NOTPOSPLUS:
2284          case OP_NOTPOSPLUSI:
2285        case OP_QUERY:        case OP_QUERY:
2286        case OP_QUERYI:        case OP_QUERYI:
2287          case OP_NOTQUERY:
2288          case OP_NOTQUERYI:
2289        case OP_MINQUERY:        case OP_MINQUERY:
2290        case OP_MINQUERYI:        case OP_MINQUERYI:
2291          case OP_NOTMINQUERY:
2292          case OP_NOTMINQUERYI:
2293        case OP_POSQUERY:        case OP_POSQUERY:
2294        case OP_POSQUERYI:        case OP_POSQUERYI:
2295        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_NOTPOSQUERY:
2296          case OP_NOTPOSQUERYI:
2297          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2298        break;        break;
2299        }        }
2300  #else  #else
2301      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2302  #endif  #endif
2303      }      }
2304    }    }
# Line 2159  bracket whose current branch will alread Line 2321  bracket whose current branch will alread
2321  Arguments:  Arguments:
2322    code        points to start of search    code        points to start of search
2323    endcode     points to where to stop    endcode     points to where to stop
2324    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2325    cd          contains pointers to tables etc.    cd          contains pointers to tables etc.
2326    
2327  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2328  */  */
2329    
2330  static BOOL  static BOOL
2331  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2332    compile_data *cd)    BOOL utf, compile_data *cd)
2333  {  {
2334  register int c;  register int c;
2335  for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2336       code < endcode;       code < endcode;
2337       code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2338    {    {
2339    const uschar *ccode;    const pcre_uchar *ccode;
2340    
2341    c = *code;    c = *code;
2342    
# Line 2197  for (code = first_significant_code(code Line 2359  for (code = first_significant_code(code
2359    
2360    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2361      {      {
2362      const uschar *scode;      const pcre_uchar *scode;
2363      BOOL empty_branch;      BOOL empty_branch;
2364    
2365      /* Test for forward reference */      /* Test for forward reference */
# Line 2215  for (code = first_significant_code(code Line 2377  for (code = first_significant_code(code
2377    
2378      do      do
2379        {        {
2380        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf, cd))
2381          {          {
2382          empty_branch = TRUE;          empty_branch = TRUE;
2383          break;          break;
# Line 2233  for (code = first_significant_code(code Line 2395  for (code = first_significant_code(code
2395    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2396        c == OP_BRAPOSZERO)        c == OP_BRAPOSZERO)
2397      {      {
2398      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2399      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
2400      c = *code;      c = *code;
2401      continue;      continue;
# Line 2271  for (code = first_significant_code(code Line 2433  for (code = first_significant_code(code
2433        empty_branch = FALSE;        empty_branch = FALSE;
2434        do        do
2435          {          {
2436          if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2437            empty_branch = TRUE;            empty_branch = TRUE;
2438          code += GET(code, 1);          code += GET(code, 1);
2439          }          }
# Line 2289  for (code = first_significant_code(code Line 2451  for (code = first_significant_code(code
2451      {      {
2452      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2453      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2454      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2455      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2456      here. */      here. */
2457    
2458  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2459      case OP_XCLASS:      case OP_XCLASS:
2460      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2461      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 2301  for (code = first_significant_code(code Line 2463  for (code = first_significant_code(code
2463    
2464      case OP_CLASS:      case OP_CLASS:
2465      case OP_NCLASS:      case OP_NCLASS:
2466      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2467    
2468  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2469      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2470  #endif  #endif
2471    
# Line 2376  for (code = first_significant_code(code Line 2538  for (code = first_significant_code(code
2538      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2539      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2540      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2541      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP
2542          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2543      break;      break;
2544    
2545      /* End of branch */      /* End of branch */
# Line 2391  for (code = first_significant_code(code Line 2554  for (code = first_significant_code(code
2554      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2555      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
2556    
2557  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2558      case OP_STAR:      case OP_STAR:
2559      case OP_STARI:      case OP_STARI:
2560      case OP_MINSTAR:      case OP_MINSTAR:
# Line 2404  for (code = first_significant_code(code Line 2567  for (code = first_significant_code(code
2567      case OP_MINQUERYI:      case OP_MINQUERYI:
2568      case OP_POSQUERY:      case OP_POSQUERY:
2569      case OP_POSQUERYI:      case OP_POSQUERYI:
2570      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2571      break;      break;
2572    
2573      case OP_UPTO:      case OP_UPTO:
# Line 2413  for (code = first_significant_code(code Line 2576  for (code = first_significant_code(code
2576      case OP_MINUPTOI:      case OP_MINUPTOI:
2577      case OP_POSUPTO:      case OP_POSUPTO:
2578      case OP_POSUPTOI:      case OP_POSUPTOI:
2579      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2580      break;      break;
2581  #endif  #endif
2582    
# Line 2457  Arguments: Line 2620  Arguments:
2620    code        points to start of the recursion    code        points to start of the recursion
2621    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2622    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2623    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2624    cd          pointers to tables etc    cd          pointers to tables etc
2625    
2626  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2627  */  */
2628    
2629  static BOOL  static BOOL
2630  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2631    BOOL utf8, compile_data *cd)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2632  {  {
2633  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2634    {    {
2635    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2636      return FALSE;      return FALSE;
2637    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2638    }    }
# Line 2521  Returns:   TRUE or FALSE Line 2684  Returns:   TRUE or FALSE
2684  */  */
2685    
2686  static BOOL  static BOOL
2687  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2688  {  {
2689  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2690  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
# Line 2565  Returns:     a value representing the na Line 2728  Returns:     a value representing the na
2728  */  */
2729    
2730  static int  static int
2731  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2732  {  {
2733  const char *pn = posix_names;  const char *pn = posix_names;
2734  register int yield = 0;  register int yield = 0;
2735  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2736    {    {
2737    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2738      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2739    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2740    yield++;    yield++;
2741    }    }
# Line 2604  value in the reference (which is a group Line 2767  value in the reference (which is a group
2767  Arguments:  Arguments:
2768    group      points to the start of the group    group      points to the start of the group
2769    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
2770    utf8       TRUE in UTF-8 mode    utf        TRUE in UTF-8 / UTF-16 mode
2771    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
2772    save_hwm   the hwm forward reference pointer at the start of the group    save_hwm   the hwm forward reference pointer at the start of the group
2773    
# Line 2612  Returns:     nothing Line 2775  Returns:     nothing
2775  */  */
2776    
2777  static void  static void
2778  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2779    uschar *save_hwm)    pcre_uchar *save_hwm)
2780  {  {
2781  uschar *ptr = group;  pcre_uchar *ptr = group;
2782    
2783  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2784    {    {
2785    int offset;    int offset;
2786    uschar *hc;    pcre_uchar *hc;
2787    
2788    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2789    reference. */    reference. */
# Line 2665  Arguments: Line 2828  Arguments:
2828  Returns:         new code pointer  Returns:         new code pointer
2829  */  */
2830    
2831  static uschar *  static pcre_uchar *
2832  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2833  {  {
2834  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2835  *code++ = 255;  *code++ = 255;
2836  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2837  PUT(code, LINK_SIZE, 0);                       /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2838  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2839  }  }
2840    
2841    
# Line 2694  Returns:             nothing Line 2857  Returns:             nothing
2857  */  */
2858    
2859  static void  static void
2860  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2861  {  {
2862  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2863  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
# Line 2777  switch(ptype) Line 2940  switch(ptype)
2940            prop->chartype == ucp_Lt) == negated;            prop->chartype == ucp_Lt) == negated;
2941    
2942    case PT_GC:    case PT_GC:
2943    return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2944    
2945    case PT_PC:    case PT_PC:
2946    return (pdata == prop->chartype) == negated;    return (pdata == prop->chartype) == negated;
# Line 2788  switch(ptype) Line 2951  switch(ptype)
2951    /* These are specials */    /* These are specials */
2952    
2953    case PT_ALNUM:    case PT_ALNUM:
2954    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2955            _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2956    
2957    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2958    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2959            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2960            == negated;            == negated;
2961    
2962    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2963    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2964            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2965            c == CHAR_FF || c == CHAR_CR)            c == CHAR_FF || c == CHAR_CR)
2966            == negated;            == negated;
2967    
2968    case PT_WORD:    case PT_WORD:
2969    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2970            _pcre_ucp_gentype[prop->chartype] == ucp_N ||            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2971            c == CHAR_UNDERSCORE) == negated;            c == CHAR_UNDERSCORE) == negated;
2972    }    }
2973  return FALSE;  return FALSE;
# Line 2823  sense to automatically possessify the re Line 2986  sense to automatically possessify the re
2986    
2987  Arguments:  Arguments:
2988    previous      pointer to the repeated opcode    previous      pointer to the repeated opcode
2989    utf8          TRUE in UTF-8 mode    utf           TRUE in UTF-8 / UTF-16 mode
2990    ptr           next character in pattern    ptr           next character in pattern
2991    options       options bits    options       options bits
2992    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 2832  Returns:        TRUE if possessifying is Line 2995  Returns:        TRUE if possessifying is
2995  */  */
2996    
2997  static BOOL  static BOOL
2998  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2999    int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
3000  {  {
3001  int c, next;  pcre_int32 c, next;
3002  int op_code = *previous++;  int op_code = *previous++;
3003    
3004  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2844  if ((options & PCRE_EXTENDED) != 0) Line 3007  if ((options & PCRE_EXTENDED) != 0)
3007    {    {
3008    for (;;)    for (;;)
3009      {      {
3010      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3011      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3012        {        {
3013        ptr++;        ptr++;
# Line 2852  if ((options & PCRE_EXTENDED) != 0) Line 3015  if ((options & PCRE_EXTENDED) != 0)
3015          {          {
3016          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3017          ptr++;          ptr++;
3018  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3019          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3020  #endif  #endif
3021          }          }
3022        }        }
# Line 2871  if (*ptr == CHAR_BACKSLASH) Line 3034  if (*ptr == CHAR_BACKSLASH)
3034    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
3035    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
3036    }    }
3037    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
3038    {    {
3039  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3040    if (utf8) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
3041  #endif  #endif
3042    next = *ptr++;    next = *ptr++;
3043    }    }
   
3044  else return FALSE;  else return FALSE;
3045    
3046  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2888  if ((options & PCRE_EXTENDED) != 0) Line 3049  if ((options & PCRE_EXTENDED) != 0)
3049    {    {
3050    for (;;)    for (;;)
3051      {      {
3052      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3053      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3054        {        {
3055        ptr++;        ptr++;
# Line 2896  if ((options & PCRE_EXTENDED) != 0) Line 3057  if ((options & PCRE_EXTENDED) != 0)
3057          {          {
3058          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3059          ptr++;          ptr++;
3060  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3061          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3062  #endif  #endif
3063          }          }
3064        }        }
# Line 2908  if ((options & PCRE_EXTENDED) != 0) Line 3069  if ((options & PCRE_EXTENDED) != 0)
3069  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
3070    
3071  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3072    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3073      return FALSE;      return FALSE;
3074    
3075  /* Now compare the next item with the previous opcode. First, handle cases when  /* Now compare the next item with the previous opcode. First, handle cases when
# Line 2917  the next item is a character. */ Line 3078  the next item is a character. */
3078  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3079    {    {
3080    case OP_CHAR:    case OP_CHAR:
3081  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3082    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3083  #else  #else
3084    c = *previous;    c = *previous;
# Line 2929  if (next >= 0) switch(op_code) Line 3090  if (next >= 0) switch(op_code)
3090    high-valued characters. */    high-valued characters. */
3091    
3092    case OP_CHARI:    case OP_CHARI:
3093  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3094    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3095  #else  #else
3096    c = *previous;    c = *previous;
3097  #endif  #endif
3098    if (c == next) return FALSE;    if (c == next) return FALSE;
3099  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3100    if (utf8)    if (utf)
3101      {      {
3102      unsigned int othercase;      unsigned int othercase;
3103      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2948  if (next >= 0) switch(op_code) Line 3109  if (next >= 0) switch(op_code)
3109      return (unsigned int)c != othercase;      return (unsigned int)c != othercase;
3110      }      }
3111    else    else
3112  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3113    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
   
   /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These  
   opcodes are not used for multi-byte characters, because they are coded using  
   an XCLASS instead. */  
3114    
3115    case OP_NOT:    case OP_NOT:
3116    return (c = *previous) == next;  #ifdef SUPPORT_UTF
3117      GETCHARTEST(c, previous);
3118    #else
3119      c = *previous;
3120    #endif
3121      return c == next;
3122    
3123    case OP_NOTI:    case OP_NOTI:
3124    if ((c = *previous) == next) return TRUE;  #ifdef SUPPORT_UTF
3125  #ifdef SUPPORT_UTF8    GETCHARTEST(c, previous);
3126    if (utf8)  #else
3127      c = *previous;
3128    #endif
3129      if (c == next) return TRUE;
3130    #ifdef SUPPORT_UTF
3131      if (utf)
3132      {      {
3133      unsigned int othercase;      unsigned int othercase;
3134      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
3135  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3136      othercase = UCD_OTHERCASE(next);      othercase = UCD_OTHERCASE((unsigned int)next);
3137  #else  #else
3138      othercase = NOTACHAR;      othercase = NOTACHAR;
3139  #endif  #endif
3140      return (unsigned int)c == othercase;      return (unsigned int)c == othercase;
3141      }      }
3142    else    else
3143  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3144    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3145    
3146    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3147    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3148    
3149    case OP_DIGIT:    case OP_DIGIT:
3150    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3151    
3152    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3153    return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3154    
3155    case OP_WHITESPACE:    case OP_WHITESPACE:
3156    return next > 127 || (cd->ctypes[next] & ctype_space) == 0;    return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3157    
3158    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3159    return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3160    
3161    case OP_WORDCHAR:    case OP_WORDCHAR:
3162    return next > 127 || (cd->ctypes[next] & ctype_word) == 0;    return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3163    
3164    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3165    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3166    
3167    case OP_HSPACE:    case OP_HSPACE:
3168    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
# Line 3065  switch(op_code) Line 3232  switch(op_code)
3232    {    {
3233    case OP_CHAR:    case OP_CHAR:
3234    case OP_CHARI:    case OP_CHARI:
3235  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3236    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3237  #else  #else
3238    c = *previous;    c = *previous;
# Line 3073  switch(op_code) Line 3240  switch(op_code)
3240    switch(-next)    switch(-next)
3241      {      {
3242      case ESC_d:      case ESC_d:
3243      return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;      return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3244    
3245      case ESC_D:      case ESC_D:
3246      return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3247    
3248      case ESC_s:      case ESC_s:
3249      return c > 127 || (cd->ctypes[c] & ctype_space) == 0;      return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3250    
3251      case ESC_S:      case ESC_S:
3252      return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3253    
3254      case ESC_w:      case ESC_w:
3255      return c > 127 || (cd->ctypes[c] & ctype_word) == 0;      return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3256    
3257      case ESC_W:      case ESC_W:
3258      return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3259    
3260      case ESC_h:      case ESC_h:
3261      case ESC_H:      case ESC_H:
# Line 3170  switch(op_code) Line 3337  switch(op_code)
3337        to the original \d etc. At this point, ptr will point to a zero byte. */        to the original \d etc. At this point, ptr will point to a zero byte. */
3338    
3339        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3340          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3341            return FALSE;            return FALSE;
3342    
3343        /* Do the property check. */        /* Do the property check. */
# Line 3197  switch(op_code) Line 3364  switch(op_code)
3364    return next == -ESC_d;    return next == -ESC_d;
3365    
3366    case OP_WHITESPACE:    case OP_WHITESPACE:
3367    return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;    return next == -ESC_S || next == -ESC_d || next == -ESC_w;
3368    
3369    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3370    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
3371    
3372    case OP_HSPACE:    case OP_HSPACE:
3373    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
# Line 3248  Arguments: Line 3415  Arguments:
3415    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3416    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3417    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3418    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3419    reqbyteptr     set to the last literal character required, else < 0    reqcharptr     set to the last literal character required, else < 0
3420    bcptr          points to current branch chain    bcptr          points to current branch chain
3421    cond_depth     conditional nesting depth    cond_depth     conditional nesting depth
3422    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
# Line 3261  Returns:         TRUE on success Line 3428  Returns:         TRUE on success
3428  */  */
3429    
3430  static BOOL  static BOOL
3431  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3432    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3433    int cond_depth, compile_data *cd, int *lengthptr)    pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3434      compile_data *cd, int *lengthptr)
3435  {  {
3436  int repeat_type, op_type;  int repeat_type, op_type;
3437  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3438  int bravalue = 0;  int bravalue = 0;
3439  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3440  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
3441  int zeroreqbyte, zerofirstbyte;  pcre_int32 zeroreqchar, zerofirstchar;
3442  int req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
3443  int options = *optionsptr;               /* May change dynamically */  int options = *optionsptr;               /* May change dynamically */
3444  int after_manual_callout = 0;  int after_manual_callout = 0;
3445  int length_prevgroup = 0;  int length_prevgroup = 0;
3446  register int c;  register int c;
3447  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3448  uschar *last_code = code;  pcre_uchar *last_code = code;
3449  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3450  uschar *tempcode;  pcre_uchar *tempcode;
3451  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3452  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3453  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3454  const uschar *tempptr;  const pcre_uchar *tempptr;
3455  const uschar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
3456  uschar *previous = NULL;  pcre_uchar *previous = NULL;
3457  uschar *previous_callout = NULL;  pcre_uchar *previous_callout = NULL;
3458  uschar *save_hwm = NULL;  pcre_uchar *save_hwm = NULL;
3459  uschar classbits[32];  pcre_uint8 classbits[32];
3460    
3461  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3462  must not do this for other options (e.g. PCRE_EXTENDED) because they may change  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3463  dynamically as we process the pattern. */  dynamically as we process the pattern. */
3464    
3465  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3466  BOOL class_utf8;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3467  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
3468  uschar *class_utf8data;  pcre_uchar utf_chars[6];
 uschar *class_utf8data_base;  
 uschar utf8_char[6];  
3469  #else  #else
3470  BOOL utf8 = FALSE;  BOOL utf = FALSE;
3471    #endif
3472    
3473    /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3474    
3475    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3476    BOOL xclass;
3477    pcre_uchar *class_uchardata;
3478    pcre_uchar *class_uchardata_base;
3479  #endif  #endif
3480    
3481  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 3315  greedy_non_default = greedy_default ^ 1; Line 3489  greedy_non_default = greedy_default ^ 1;
3489    
3490  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3491  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3492  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3493  find one.  find one.
3494    
3495  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3496  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3497  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3498  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3499    
3500  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3501    
3502  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3503  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3504  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS flag leaves the lower 28 bits empty. It is added into the
3505  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3506    value. This is used only for ASCII characters. */
3507    
3508  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3509    
3510  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3511    
# Line 3342  for (;; ptr++) Line 3517  for (;; ptr++)
3517    BOOL is_quantifier;    BOOL is_quantifier;
3518    BOOL is_recurse;    BOOL is_recurse;
3519    BOOL reset_bracount;    BOOL reset_bracount;
3520    int class_charcount;    int class_has_8bitchar;
3521    int class_lastchar;    int class_single_char;
3522    int newoptions;    int newoptions;
3523    int recno;    int recno;
3524    int refsign;    int refsign;
3525    int skipbytes;    int skipbytes;
3526    int subreqbyte;    int subreqchar;
3527    int subfirstbyte;    int subfirstchar;
3528    int terminator;    int terminator;
3529    int mclength;    int mclength;
3530    int tempbracount;    int tempbracount;
3531    uschar mcbuffer[8];    pcre_uchar mcbuffer[8];
3532    
3533    /* Get next byte in the pattern */    /* Get next character in the pattern */
3534    
3535    c = *ptr;    c = *ptr;
3536    
# Line 3377  for (;; ptr++) Line 3552  for (;; ptr++)
3552  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3553      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3554  #endif  #endif
3555      if (code > cd->start_workspace + cd->workspace_size -      if (code > cd->start_workspace + cd->workspace_size -
3556          WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */          WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3557        {        {
3558        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
# Line 3401  for (;; ptr++) Line 3576  for (;; ptr++)
3576        }        }
3577    
3578      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3579      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3580        c));        (int)(code - last_code), c, c));
3581    
3582      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3583      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 3412  for (;; ptr++) Line 3587  for (;; ptr++)
3587        {        {
3588        if (previous > orig_code)        if (previous > orig_code)
3589          {          {
3590          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3591          code -= previous - orig_code;          code -= previous - orig_code;
3592          previous = orig_code;          previous = orig_code;
3593          }          }
# Line 3428  for (;; ptr++) Line 3603  for (;; ptr++)
3603    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3604    reference list. */    reference list. */
3605    
3606    else if (cd->hwm > cd->start_workspace + cd->workspace_size -    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3607             WORK_SIZE_SAFETY_MARGIN)             WORK_SIZE_SAFETY_MARGIN)
3608      {      {
3609      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
# Line 3481  for (;; ptr++) Line 3656  for (;; ptr++)
3656    
3657    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3658      {      {
3659      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3660      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3661        {        {
3662        ptr++;        ptr++;
# Line 3489  for (;; ptr++) Line 3664  for (;; ptr++)
3664          {          {
3665          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3666          ptr++;          ptr++;
3667  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3668          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3669  #endif  #endif
3670          }          }
3671        if (*ptr != 0) continue;        if (*ptr != 0) continue;
# Line 3514  for (;; ptr++) Line 3689  for (;; ptr++)
3689      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3690      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3691      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
3692      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3693      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
3694      *codeptr = code;      *codeptr = code;
3695      *ptrptr = ptr;      *ptrptr = ptr;
3696      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 3539  for (;; ptr++) Line 3714  for (;; ptr++)
3714      previous = NULL;      previous = NULL;
3715      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3716        {        {
3717        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3718        *code++ = OP_CIRCM;        *code++ = OP_CIRCM;
3719        }        }
3720      else *code++ = OP_CIRC;      else *code++ = OP_CIRC;
# Line 3551  for (;; ptr++) Line 3726  for (;; ptr++)
3726      break;      break;
3727    
3728      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3729      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3730    
3731      case CHAR_DOT:      case CHAR_DOT:
3732      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3733      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
3734      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
3735      previous = code;      previous = code;
3736      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3737      break;      break;
# Line 3611  for (;; ptr++) Line 3786  for (;; ptr++)
3786          {          {
3787          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
3788            ptr++;            ptr++;
3789          else if (strncmp((const char *)ptr+1,          else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
                           STR_Q STR_BACKSLASH STR_E, 3) == 0)  
3790            ptr += 3;            ptr += 3;
3791          else          else
3792            break;            break;
# Line 3631  for (;; ptr++) Line 3805  for (;; ptr++)
3805          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3806        {        {
3807        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3808        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3809        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
3810        break;        break;
3811        }        }
3812    
# Line 3642  for (;; ptr++) Line 3816  for (;; ptr++)
3816    
3817      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3818    
3819      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3820      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3821      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3822        a single character. */
3823    
3824      class_charcount = 0;      class_has_8bitchar = 0;
3825      class_lastchar = -1;      class_single_char = 0;
3826    
3827      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3828      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
3829      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3830      */      */
3831    
3832      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3833    
3834  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3835      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;                           /* No chars >= 256 */
3836      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3837      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3838  #endif  #endif
3839    
3840      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3668  for (;; ptr++) Line 3843  for (;; ptr++)
3843    
3844      if (c != 0) do      if (c != 0) do
3845        {        {
3846        const uschar *oldptr;        const pcre_uchar *oldptr;
3847    
3848  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3849        if (utf8 && c > 127)        if (utf && HAS_EXTRALEN(c))
3850          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3851          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3852          }          }
3853    #endif
3854    
3855        /* In the pre-compile phase, accumulate the length of any UTF-8 extra  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3856          /* In the pre-compile phase, accumulate the length of any extra
3857        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
3858        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
3859        (which is on the stack). */        (which is on the stack). */
3860    
3861        if (lengthptr != NULL)        if (lengthptr != NULL)
3862          {          {
3863          *lengthptr += class_utf8data - class_utf8data_base;          *lengthptr += class_uchardata - class_uchardata_base;
3864          class_utf8data = class_utf8data_base;          class_uchardata = class_uchardata_base;
3865          }          }
   
3866  #endif  #endif
3867    
3868        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 3714  for (;; ptr++) Line 3890  for (;; ptr++)
3890          {          {
3891          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3892          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3893          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3894          uschar pbits[32];          pcre_uint8 pbits[32];
3895    
3896          if (ptr[1] != CHAR_COLON)          if (ptr[1] != CHAR_COLON)
3897            {            {
# Line 3770  for (;; ptr++) Line 3946  for (;; ptr++)
3946          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3947    
3948          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3949            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3950    
3951          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3952    
# Line 3801  for (;; ptr++) Line 3977  for (;; ptr++)
3977            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3978    
3979          ptr = tempptr + 1;          ptr = tempptr + 1;
3980          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3981            class_has_8bitchar = 1;
3982            /* Every class contains at least two characters. */
3983            class_single_char = 2;
3984          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3985          }          }
3986    
3987        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3988        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3989        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3990        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3991        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3992        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3993          as literal characters (by default), or are faulted if
3994        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3995    
3996        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3822  for (;; ptr++) Line 4002  for (;; ptr++)
4002          else if (-c == ESC_N)            /* \N is not supported in a class */          else if (-c == ESC_N)            /* \N is not supported in a class */
4003            {            {
4004            *errorcodeptr = ERR71;            *errorcodeptr = ERR71;
4005            goto FAILED;            goto FAILED;
4006            }            }
4007          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
4008            {            {
4009            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3837  for (;; ptr++) Line 4017  for (;; ptr++)
4017    
4018          if (c < 0)          if (c < 0)
4019            {            {
4020            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
4021            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
4022              class_has_8bitchar++;
4023              /* Every class contains at least two characters. */
4024              class_single_char += 2;
4025    
4026            switch (-c)            switch (-c)
4027              {              {
# Line 3851  for (;; ptr++) Line 4034  for (;; ptr++)
4034              case ESC_SU:              case ESC_SU:
4035              nestptr = ptr;              nestptr = ptr;
4036              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
4037              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
4038              continue;              continue;
4039  #endif  #endif
4040              case ESC_d:              case ESC_d:
# Line 3892  for (;; ptr++) Line 4075  for (;; ptr++)
4075              SETBIT(classbits, 0x09); /* VT */              SETBIT(classbits, 0x09); /* VT */
4076              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
4077              SETBIT(classbits, 0xa0); /* NSBP */              SETBIT(classbits, 0xa0); /* NSBP */
4078  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4079              if (utf8)              xclass = TRUE;
4080                *class_uchardata++ = XCL_SINGLE;
4081                *class_uchardata++ = 0x1680;
4082                *class_uchardata++ = XCL_SINGLE;
4083                *class_uchardata++ = 0x180e;
4084                *class_uchardata++ = XCL_RANGE;
4085                *class_uchardata++ = 0x2000;
4086                *class_uchardata++ = 0x200a;
4087                *class_uchardata++ = XCL_SINGLE;
4088                *class_uchardata++ = 0x202f;
4089                *class_uchardata++ = XCL_SINGLE;
4090                *class_uchardata++ = 0x205f;
4091                *class_uchardata++ = XCL_SINGLE;
4092                *class_uchardata++ = 0x3000;
4093    #elif defined SUPPORT_UTF
4094                if (utf)
4095                {                {
4096                class_utf8 = TRUE;                xclass = TRUE;
4097                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4098                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4099                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4100                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4101                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4102                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4103                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4104                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4105                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4106                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4107                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4108                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4109                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4110                }                }
4111  #endif  #endif
4112              continue;              continue;
# Line 3926  for (;; ptr++) Line 4124  for (;; ptr++)
4124                  }                  }
4125                classbits[c] |= x;                classbits[c] |= x;
4126                }                }
4127    #ifndef COMPILE_PCRE8
4128  #ifdef SUPPORT_UTF8              xclass = TRUE;
4129              if (utf8)              *class_uchardata++ = XCL_RANGE;
4130                *class_uchardata++ = 0x0100;
4131                *class_uchardata++ = 0x167f;
4132                *class_uchardata++ = XCL_RANGE;
4133                *class_uchardata++ = 0x1681;
4134                *class_uchardata++ = 0x180d;
4135                *class_uchardata++ = XCL_RANGE;
4136                *class_uchardata++ = 0x180f;
4137                *class_uchardata++ = 0x1fff;
4138                *class_uchardata++ = XCL_RANGE;
4139                *class_uchardata++ = 0x200b;
4140                *class_uchardata++ = 0x202e;
4141                *class_uchardata++ = XCL_RANGE;
4142                *class_uchardata++ = 0x2030;
4143                *class_uchardata++ = 0x205e;
4144                *class_uchardata++ = XCL_RANGE;
4145                *class_uchardata++ = 0x2060;
4146                *class_uchardata++ = 0x2fff;
4147                *class_uchardata++ = XCL_RANGE;
4148                *class_uchardata++ = 0x3001;
4149    #ifdef SUPPORT_UTF
4150                if (utf)
4151                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4152                else
4153    #endif
4154                  *class_uchardata++ = 0xffff;
4155    #elif defined SUPPORT_UTF
4156                if (utf)
4157                {                {
4158                class_utf8 = TRUE;                xclass = TRUE;
4159                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4160                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4161                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4162                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4163                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4164                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4165                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4166                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4167                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4168                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4169                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4170                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4171                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4172                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4173                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4174                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4175                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4176                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4177                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4178                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4179                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4180                }                }
4181  #endif  #endif
4182              continue;              continue;
# Line 3962  for (;; ptr++) Line 4187  for (;; ptr++)
4187              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, 0x0c); /* FF */
4188              SETBIT(classbits, 0x0d); /* CR */              SETBIT(classbits, 0x0d); /* CR */
4189              SETBIT(classbits, 0x85); /* NEL */              SETBIT(classbits, 0x85); /* NEL */
4190  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4191              if (utf8)              xclass = TRUE;
4192                *class_uchardata++ = XCL_RANGE;
4193                *class_uchardata++ = 0x2028;
4194                *class_uchardata++ = 0x2029;
4195    #elif defined SUPPORT_UTF
4196                if (utf)
4197                {                {
4198                class_utf8 = TRUE;                xclass = TRUE;
4199                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4200                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4201                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4202                }                }
4203  #endif  #endif
4204              continue;              continue;
# Line 3990  for (;; ptr++) Line 4220  for (;; ptr++)
4220                classbits[c] |= x;                classbits[c] |= x;
4221                }                }
4222    
4223  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4224              if (utf8)              xclass = TRUE;
4225                *class_uchardata++ = XCL_RANGE;
4226                *class_uchardata++ = 0x0100;
4227                *class_uchardata++ = 0x2027;
4228                *class_uchardata++ = XCL_RANGE;
4229                *class_uchardata++ = 0x202a;
4230    #ifdef SUPPORT_UTF
4231                if (utf)
4232                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4233                else
4234    #endif
4235                  *class_uchardata++ = 0xffff;
4236    #elif defined SUPPORT_UTF
4237                if (utf)
4238                {                {
4239                class_utf8 = TRUE;                xclass = TRUE;
4240                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4241                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4242                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4243                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4244                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4245                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4246                }                }
4247  #endif  #endif
4248              continue;              continue;
# Line 4012  for (;; ptr++) Line 4255  for (;; ptr++)
4255                int pdata;                int pdata;
4256                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4257                if (ptype < 0) goto FAILED;                if (ptype < 0) goto FAILED;
4258                class_utf8 = TRUE;                xclass = TRUE;
4259                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_uchardata++ = ((-c == ESC_p) != negated)?
4260                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4261                *class_utf8data++ = ptype;                *class_uchardata++ = ptype;
4262                *class_utf8data++ = pdata;                *class_uchardata++ = pdata;
4263                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4264                continue;                continue;
4265                }                }
4266  #endif  #endif
# Line 4031  for (;; ptr++) Line 4274  for (;; ptr++)
4274                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4275                goto FAILED;                goto FAILED;
4276                }                }
4277              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4278              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4279                c = *ptr;                /* Get the final character and fall through */
4280              break;              break;
4281              }              }
4282            }            }
4283    
4284          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4285          greater than 256 in UTF-8 mode. */          greater than 256. */
4286    
4287          }   /* End of backslash handling */          }   /* End of backslash handling */
4288    
# Line 4086  for (;; ptr++) Line 4330  for (;; ptr++)
4330            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4331            }            }
4332    
4333  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4334          if (utf8)          if (utf)
4335            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4336            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4337            }            }
# Line 4131  for (;; ptr++) Line 4375  for (;; ptr++)
4375    
4376          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4377    
4378            /* Since we found a character range, single character optimizations
4379            cannot be done anymore. */
4380            class_single_char = 2;
4381    
4382          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4383          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4384          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4385          available. */          available. */
4386    
4387  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4388          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4389    #elif defined  SUPPORT_UTF
4390            if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4391    #elif !(defined COMPILE_PCRE8)
4392            if (d > 255)
4393    #endif
4394    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4395            {            {
4396            class_utf8 = TRUE;            xclass = TRUE;
4397    
4398            /* With UCP support, we can find the other case equivalents of            /* With UCP support, we can find the other case equivalents of
4399            the relevant characters. There may be several ranges. Optimize how            the relevant characters. There may be several ranges. Optimize how
4400            they fit with the basic range. */            they fit with the basic range. */
4401    
4402  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4403    #ifndef COMPILE_PCRE8
4404              if (utf && (options & PCRE_CASELESS) != 0)
4405    #else
4406            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4407    #endif
4408              {              {
4409              unsigned int occ, ocd;              unsigned int occ, ocd;
4410              unsigned int cc = c;              unsigned int cc = c;
# Line 4172  for (;; ptr++) Line 4430  for (;; ptr++)
4430    
4431                if (occ == ocd)                if (occ == ocd)
4432                  {                  {
4433                  *class_utf8data++ = XCL_SINGLE;                  *class_uchardata++ = XCL_SINGLE;
4434                  }                  }
4435                else                else
4436                  {                  {
4437                  *class_utf8data++ = XCL_RANGE;                  *class_uchardata++ = XCL_RANGE;
4438                  class_utf8data += _pcre_ord2utf8(occ, class_utf8data);                  class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4439                  }                  }
4440                class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);                class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4441                }                }
4442              }              }
4443  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 4187  for (;; ptr++) Line 4445  for (;; ptr++)
4445            /* Now record the original range, possibly modified for UCP caseless            /* Now record the original range, possibly modified for UCP caseless
4446            overlapping ranges. */            overlapping ranges. */
4447    
4448            *class_utf8data++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4449            class_utf8data += _pcre_ord2utf8(c, class_utf8data);  #ifdef SUPPORT_UTF
4450            class_utf8data += _pcre_ord2utf8(d, class_utf8data);  #ifndef COMPILE_PCRE8
4451              if (utf)
4452                {
4453                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4454                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4455                }
4456              else
4457                {
4458                *class_uchardata++ = c;
4459                *class_uchardata++ = d;
4460                }
4461    #else
4462              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4463              class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4464    #endif
4465    #else /* SUPPORT_UTF */
4466              *class_uchardata++ = c;
4467              *class_uchardata++ = d;
4468    #endif /* SUPPORT_UTF */
4469    
4470            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4471            caseless matching for UTF-8 characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
4472            for the smaller ones. */            for the smaller ones. As for 16 bit characters without UTF, we
4473              can still use the bit map for the part of the range up to 255. */
4474    
4475  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4476            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4477  #else            if (utf)
4478            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  #endif
4479                continue;    /* With next character in the class */
4480    #endif  /* SUPPORT_UCP */
4481    
4482    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4483              if (utf)
4484                {
4485                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4486                /* Adjust upper limit and fall through to set up the map */
4487                d = 127;
4488                }
4489              else
4490                {
4491                if (c > 255) continue;
4492                /* Adjust upper limit and fall through to set up the map */
4493                d = 255;
4494                }
4495    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4496              if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4497            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
   
4498            d = 127;            d = 127;
4499    #else
4500  #endif  /* SUPPORT_UCP */            if (c > 255) continue;
4501              /* Adjust upper limit and fall through to set up the map */
4502              d = 255;
4503    #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4504            }            }
4505  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4506    
4507          /* We use the bit map for all cases when not in UTF-8 mode; else          /* We use the bit map for 8 bit mode, or when the characters fall
4508          ranges that lie entirely within 0-127 when there is UCP support; else          partially or entirely to [0-255] ([0-127] for UCP) ranges. */
         for partial ranges without UCP support. */  
4509    
4510          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4511    
4512          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4513    
# Line 4222  for (;; ptr++) Line 4516  for (;; ptr++)
4516            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4517            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4518              {              {
4519              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c]; /* flip case */
4520              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
4521              }              }
4522            }            }
# Line 4236  for (;; ptr++) Line 4530  for (;; ptr++)
4530    
4531        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4532    
4533        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4534    
4535          if (class_single_char < 2) class_single_char++;
4536    
4537          /* If class_single_char is 1, we saw precisely one character. As long as
4538          there was no use of \p or \P, in other words, no use of any XCLASS
4539          features, we can optimize.
4540    
4541  #ifdef SUPPORT_UTF8        The optimization throws away the bit map. We turn the item into a
4542        if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))        1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4543          In the positive case, it can cause firstchar to be set. Otherwise, there
4544          can be no first char if this item is first, whatever repeat count may
4545          follow. In the case of reqchar, save the previous value for reinstating. */
4546    
4547          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4548          {          {
4549          class_utf8 = TRUE;          ptr++;
4550          *class_utf8data++ = XCL_SINGLE;          zeroreqchar = reqchar;
4551          class_utf8data += _pcre_ord2utf8(c, class_utf8data);  
4552            if (negate_class)
4553              {
4554              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4555              zerofirstchar = firstchar;
4556              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4557    #ifdef SUPPORT_UTF
4558              if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4559                code += PRIV(ord2utf)(c, code);
4560              else
4561    #endif
4562                *code++ = c;
4563              goto NOT_CHAR;
4564              }
4565    
4566            /* For a single, positive character, get the value into mcbuffer, and
4567            then we can handle this with the normal one-character code. */
4568    
4569    #ifdef SUPPORT_UTF
4570            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4571              mclength = PRIV(ord2utf)(c, mcbuffer);
4572            else
4573    #endif
4574              {
4575              mcbuffer[0] = c;
4576              mclength = 1;
4577              }
4578            goto ONE_CHAR;
4579            }       /* End of 1-char optimization */
4580    
4581          /* Handle a character that cannot go in the bit map. */
4582    
4583    #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4584          if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4585    #elif defined SUPPORT_UTF
4586          if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4587    #elif !(defined COMPILE_PCRE8)
4588          if (c > 255)
4589    #endif
4590    
4591    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4592            {
4593            xclass = TRUE;
4594            *class_uchardata++ = XCL_SINGLE;
4595    #ifdef SUPPORT_UTF
4596    #ifndef COMPILE_PCRE8
4597            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4598            if (!utf)
4599              *class_uchardata++ = c;
4600            else
4601    #endif
4602              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4603    #else /* SUPPORT_UTF */
4604            *class_uchardata++ = c;
4605    #endif /* SUPPORT_UTF */
4606    
4607  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4608    #ifdef COMPILE_PCRE8
4609          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4610    #else
4611            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4612            if (utf && (options & PCRE_CASELESS) != 0)
4613    #endif
4614            {            {
4615            unsigned int othercase;            unsigned int othercase;
4616            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((int)(othercase = UCD_OTHERCASE(c)) != c)
4617              {              {
4618              *class_utf8data++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4619              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4620              }              }
4621            }            }
4622  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
4623    
4624          }          }
4625        else        else
4626  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
4627    
4628        /* Handle a single-byte character */        /* Handle a single-byte character */
4629          {          {
4630            class_has_8bitchar = 1;
4631          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4632          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4633            {            {
4634            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c]; /* flip case */
4635            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4636            }            }
         class_charcount++;  
         class_lastchar = c;  
4637          }          }
4638        }        }
4639    
# Line 4291  for (;; ptr++) Line 4654  for (;; ptr++)
4654        goto FAILED;        goto FAILED;
4655        }        }
4656    
4657      /* If class_charcount is 1, we saw precisely one character whose value is      /* If this is the first thing in the branch, there can be no first char
4658      less than 256. As long as there were no characters >= 128 and there was no      setting, whatever the repeat count. Any reqchar setting must remain
4659      use of \p or \P, in other words, no use of any XCLASS features, we can      unchanged after any kind of repeat. */
4660      optimize.  
4661        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4662      In UTF-8 mode, we can optimize the negative case only if there were no      zerofirstchar = firstchar;
4663      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR      zeroreqchar = reqchar;
     operate on single-bytes characters only. This is an historical hangover.  
     Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
     Note that OP_NOT[I] does not support multibyte characters. In the positive  
     case, it can cause firstbyte to be set. Otherwise, there can be no first  
     char if this item is first, whatever repeat count may follow. In the case  
     of reqbyte, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF8  
     if (class_charcount == 1 && !class_utf8 &&  
       (!utf8 || !negate_class || class_lastchar < 128))  
 #else  
     if (class_charcount == 1)  
 #endif  
       {  
       zeroreqbyte = reqbyte;  
   
       /* The OP_NOT[I] opcodes work on one-byte characters only. */  
   
       if (negate_class)  
         {  
         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
         zerofirstbyte = firstbyte;  
         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF8  
       if (utf8 && class_lastchar > 127)  
         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqbyte setting must remain unchanged after any kind of  
     repeat. */  
   
     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
     zerofirstbyte = firstbyte;  
     zeroreqbyte = reqbyte;  
4664    
4665      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4666      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
# Line 4360  for (;; ptr++) Line 4670  for (;; ptr++)
4670      be listed) there are no characters < 256, we can omit the bitmap in the      be listed) there are no characters < 256, we can omit the bitmap in the
4671      actual compiled code. */      actual compiled code. */
4672    
4673  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4674      if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))      if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4675    #elif !defined COMPILE_PCRE8
4676        if (xclass && !should_flip_negation)
4677    #endif
4678    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4679        {        {
4680        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4681        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
4682        code += LINK_SIZE;        code += LINK_SIZE;
4683        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT:0;
4684    
4685        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4686        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4687    
4688        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4689          {          {
4690          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4691          memmove(code + 32, code, class_utf8data - code);          memmove(code + (32 / sizeof(pcre_uchar)), code,
4692              IN_UCHARS(class_uchardata - code));
4693          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
4694          code = class_utf8data + 32;          code = class_uchardata + (32 / sizeof(pcre_uchar));
4695          }          }
4696        else code = class_utf8data;        else code = class_uchardata;
4697    
4698        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4699    
4700        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4701        break;   /* End of class handling */        break;   /* End of class handling */
4702        }        }
4703  #endif  #endif
# Line 4394  for (;; ptr++) Line 4709  for (;; ptr++)
4709      negating it if necessary. */      negating it if necessary. */
4710    
4711      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4712      if (negate_class)      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
       {  
       if (lengthptr == NULL)    /* Save time in the pre-compile phase */  
         for (c = 0; c < 32; c++) code[c] = ~classbits[c];  
       }  
     else  
4713        {        {
4714          if (negate_class)
4715            for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4716        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4717        }        }
4718      code += 32;      code += 32 / sizeof(pcre_uchar);
4719        NOT_CHAR:
4720      break;      break;
4721    
4722    
# Line 4440  for (;; ptr++) Line 4753  for (;; ptr++)
4753    
4754      if (repeat_min == 0)      if (repeat_min == 0)
4755        {        {
4756        firstbyte = zerofirstbyte;    /* Adjust for zero repeat */        firstchar = zerofirstchar;    /* Adjust for zero repeat */
4757        reqbyte = zeroreqbyte;        /* Ditto */        reqchar = zeroreqchar;        /* Ditto */
4758        }        }
4759    
4760      /* Remember whether this is a variable length repeat */      /* Remember whether this is a variable length repeat */
# Line 4480  for (;; ptr++) Line 4793  for (;; ptr++)
4793      past, but it no longer happens for non-repeated recursions. In fact, the      past, but it no longer happens for non-repeated recursions. In fact, the
4794      repeated ones could be re-implemented independently so as not to need this,      repeated ones could be re-implemented independently so as not to need this,
4795      but for the moment we rely on the code for repeating groups. */      but for the moment we rely on the code for repeating groups. */
4796    
4797      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4798        {        {
4799        memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4800        *previous = OP_ONCE;        *previous = OP_ONCE;
4801        PUT(previous, 1, 2 + 2*LINK_SIZE);        PUT(previous, 1, 2 + 2*LINK_SIZE);
4802        previous[2 + 2*LINK_SIZE] = OP_KET;        previous[2 + 2*LINK_SIZE] = OP_KET;
# Line 4504  for (;; ptr++) Line 4817  for (;; ptr++)
4817    
4818      /* Now handle repetition for the different types of item. */      /* Now handle repetition for the different types of item. */
4819    
4820      /* If previous was a character match, abolish the item and generate a      /* If previous was a character or negated character match, abolish the item
4821      repeat item instead. If a char item has a minumum of more than one, ensure      and generate a repeat item instead. If a char item has a minimum of more
4822      that it is set in reqbyte - it might not be if a sequence such as x{3} is      than one, ensure that it is set in reqchar - it might not be if a sequence
4823      the first thing in a branch because the x will have gone into firstbyte      such as x{3} is the first thing in a branch because the x will have gone
4824      instead.  */      into firstchar instead.  */
4825    
4826      if (*previous == OP_CHAR || *previous == OP_CHARI)      if (*previous == OP_CHAR || *previous == OP_CHARI
4827            || *previous == OP_NOT || *previous == OP_NOTI)
4828        {        {
4829        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;        switch (*previous)
4830            {
4831            default: /* Make compiler happy. */
4832            case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
4833            case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4834            case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
4835            case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
4836            }
4837    
4838        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF characters that take up more than one character. It's
4839        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4840        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus UTF_LENGTH to flag that
4841        length rather than a small character. */        it's a length rather than a small character. */
4842    
4843  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4844        if (utf8 && (code[-1] & 0x80) != 0)        if (utf && NOT_FIRSTCHAR(code[-1]))
4845          {          {
4846          uschar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4847          while((*lastchar & 0xc0) == 0x80) lastchar--;          BACKCHAR(lastchar);
4848          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4849          memcpy(utf8_char, lastchar, c); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4850          c |= 0x80;                      /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4851          }          }
4852        else        else
4853  #endif  #endif /* SUPPORT_UTF */
   
       /* Handle the case of a single byte - either with no UTF8 support, or  
       with UTF-8 disabled, or for a UTF-8 character < 128. */  
4854    
4855          /* Handle the case of a single character - either with no UTF support, or
4856          with UTF disabled, or for a single character UTF character. */
4857          {          {
4858          c = code[-1];          c = code[-1];
4859          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (*previous <= OP_CHARI && repeat_min > 1)
4860              reqchar = c | req_caseopt | cd->req_varyopt;
4861          }          }
4862    
4863        /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
# Line 4546  for (;; ptr++) Line 4867  for (;; ptr++)
4867    
4868        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4869            repeat_max < 0 &&            repeat_max < 0 &&
4870            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4871          {          {
4872          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4873          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4555  for (;; ptr++) Line 4876  for (;; ptr++)
4876        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4877        }        }
4878    
     /* If previous was a single negated character ([^a] or similar), we use  
     one of the special opcodes, replacing it. The code is shared with single-  
     character repeats by setting opt_type to add a suitable offset into  
     repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI  
     are currently used only for single-byte chars. */  
   
     else if (*previous == OP_NOT || *previous == OP_NOTI)  
       {  
       op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;  
       c = previous[1];  
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf8, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
       goto OUTPUT_SINGLE_REPEAT;  
       }  
   
4879      /* If previous was a character type match (\d or similar), abolish it and      /* If previous was a character type match (\d or similar), abolish it and
4880      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
4881      repeats by setting op_type to add a suitable offset into repeat_type. Note      repeats by setting op_type to add a suitable offset into repeat_type. Note
# Line 4584  for (;; ptr++) Line 4885  for (;; ptr++)
4885    
4886      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
4887        {        {
4888        uschar *oldcode;        pcre_uchar *oldcode;
4889        int prop_type, prop_value;        int prop_type, prop_value;
4890        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4891        c = *previous;        c = *previous;
4892    
4893        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4894            repeat_max < 0 &&            repeat_max < 0 &&
4895            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4896          {          {
4897          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4898          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4671  for (;; ptr++) Line 4972  for (;; ptr++)
4972          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
4973          Unicode property match, there are two extra bytes that define the          Unicode property match, there are two extra bytes that define the
4974          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
4975          c, with the 0x80 bit as a flag. */          c, with the UTF_LENGTH bit as a flag. */
4976    
4977          if (repeat_max < 0)          if (repeat_max < 0)
4978            {            {
4979  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4980            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4981              {              {
4982              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4983              code += c & 7;              code += c & 7;
4984              }              }
4985            else            else
# Line 4700  for (;; ptr++) Line 5001  for (;; ptr++)
5001    
5002          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
5003            {            {
5004  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5005            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
5006              {              {
5007              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
5008              code += c & 7;              code += c & 7;
5009              }              }
5010            else            else
# Line 4730  for (;; ptr++) Line 5031  for (;; ptr++)
5031    
5032        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
5033    
5034  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5035        if (utf8 && c >= 128)        if (utf && (c & UTF_LENGTH) != 0)
5036          {          {
5037          memcpy(code, utf8_char, c & 7);          memcpy(code, utf_chars, IN_UCHARS(c & 7));
5038          code += c & 7;          code += c & 7;
5039          }          }
5040        else        else
# Line 4757  for (;; ptr++) Line 5058  for (;; ptr++)
5058    
5059      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
5060               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
5061  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5062               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
5063  #endif  #endif
5064               *previous == OP_REF ||               *previous == OP_REF ||
# Line 4806  for (;; ptr++) Line 5107  for (;; ptr++)
5107        {        {
5108        register int i;        register int i;
5109        int len = (int)(code - previous);        int len = (int)(code - previous);
5110        uschar *bralink = NULL;        pcre_uchar *bralink = NULL;
5111        uschar *brazeroptr = NULL;        pcre_uchar *brazeroptr = NULL;
5112    
5113        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5114        we just ignore the repeat. */        we just ignore the repeat. */
# Line 4860  for (;; ptr++) Line 5161  for (;; ptr++)
5161          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5162            {            {
5163            *code = OP_END;            *code = OP_END;
5164            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf, cd, save_hwm);
5165            memmove(previous+1, previous, len);            memmove(previous + 1, previous, IN_UCHARS(len));
5166            code++;            code++;
5167            if (repeat_max == 0)            if (repeat_max == 0)
5168              {              {
# Line 4884  for (;; ptr++) Line 5185  for (;; ptr++)
5185            {            {
5186            int offset;            int offset;
5187            *code = OP_END;            *code = OP_END;
5188            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);            adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5189            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5190            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
5191            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
5192            *previous++ = OP_BRA;            *previous++ = OP_BRA;
# Line 4932  for (;; ptr++) Line 5233  for (;; ptr++)
5233              }              }
5234    
5235            /* This is compiling for real. If there is a set first byte for            /* This is compiling for real. If there is a set first byte for
5236            the group, and we have not yet set a "required byte", set it. Make            the group, and we have not yet set a "required byte", set it. Make
5237            sure there is enough workspace for copying forward references before            sure there is enough workspace for copying forward references before
5238            doing the copy. */            doing the copy. */
5239    
5240            else            else
5241              {              {
5242              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5243    
5244              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5245                {                {
5246                uschar *hc;                pcre_uchar *hc;
5247                uschar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5248                memcpy(code, previous, len);                memcpy(code, previous, IN_UCHARS(len));
5249    
5250                while (cd->hwm > cd->start_workspace + cd->workspace_size -                while (cd->hwm > cd->start_workspace + cd->workspace_size -
5251                       WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))                       WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5252                  {                  {
# Line 4953  for (;; ptr++) Line 5254  for (;; ptr++)
5254                  int this_offset = this_hwm - cd->start_workspace;                  int this_offset = this_hwm - cd->start_workspace;
5255                  *errorcodeptr = expand_workspace(cd);                  *errorcodeptr = expand_workspace(cd);
5256                  if (*errorcodeptr != 0) goto FAILED;                  if (*errorcodeptr != 0) goto FAILED;
5257                  save_hwm = (uschar *)cd->start_workspace + save_offset;                  save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5258                  this_hwm = (uschar *)cd->start_workspace + this_offset;                  this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5259                  }                  }
5260    
5261                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5262                  {                  {
5263                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 4986  for (;; ptr++) Line 5287  for (;; ptr++)
5287          add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some          add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
5288          paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is          paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5289          a 64-bit integer type when available, otherwise double. */          a 64-bit integer type when available, otherwise double. */
5290    
5291          if (lengthptr != NULL && repeat_max > 0)          if (lengthptr != NULL && repeat_max > 0)
5292            {            {
5293            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
# Line 5006  for (;; ptr++) Line 5307  for (;; ptr++)
5307    
5308          else for (i = repeat_max - 1; i >= 0; i--)          else for (i = repeat_max - 1; i >= 0; i--)
5309            {            {
5310            uschar *hc;            pcre_uchar *hc;
5311            uschar *this_hwm = cd->hwm;            pcre_uchar *this_hwm = cd->hwm;
5312    
5313            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
5314    
# Line 5023  for (;; ptr++) Line 5324  for (;; ptr++)
5324              PUTINC(code, 0, offset);              PUTINC(code, 0, offset);
5325              }              }
5326    
5327            memcpy(code, previous, len);            memcpy(code, previous, IN_UCHARS(len));
5328    
5329            /* Ensure there is enough workspace for forward references before            /* Ensure there is enough workspace for forward references before
5330            copying them. */            copying them. */
5331    
5332            while (cd->hwm > cd->start_workspace + cd->workspace_size -            while (cd->hwm > cd->start_workspace + cd->workspace_size -
5333                   WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))                   WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5334              {              {
# Line 5035  for (;; ptr++) Line 5336  for (;; ptr++)
5336              int this_offset = this_hwm - cd->start_workspace;              int this_offset = this_hwm - cd->start_workspace;
5337              *errorcodeptr = expand_workspace(cd);              *errorcodeptr = expand_workspace(cd);
5338              if (*errorcodeptr != 0) goto FAILED;              if (*errorcodeptr != 0) goto FAILED;
5339              save_hwm = (uschar *)cd->start_workspace + save_offset;              save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5340              this_hwm = (uschar *)cd->start_workspace + this_offset;              this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5341              }              }
5342    
5343            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5344              {              {
5345              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 5055  for (;; ptr++) Line 5356  for (;; ptr++)
5356            {            {
5357            int oldlinkoffset;            int oldlinkoffset;
5358            int offset = (int)(code - bralink + 1);            int offset = (int)(code - bralink + 1);
5359            uschar *bra = code - offset;            pcre_uchar *bra = code - offset;
5360            oldlinkoffset = GET(bra, 1);            oldlinkoffset = GET(bra, 1);
5361            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5362            *code++ = OP_KET;            *code++ = OP_KET;
# Line 5069  for (;; ptr++) Line 5370  for (;; ptr++)
5370        ONCE brackets can be converted into non-capturing brackets, as the        ONCE brackets can be converted into non-capturing brackets, as the
5371        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5372        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
5373    
5374        Otherwise, when we are doing the actual compile phase, check to see        Otherwise, when we are doing the actual compile phase, check to see
5375        whether this group is one that could match an empty string. If so,        whether this group is one that could match an empty string. If so,
5376        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5377        that runtime checking can be done. [This check is also applied to ONCE        that runtime checking can be done. [This check is also applied to ONCE
5378        groups at runtime, but in a different way.]        groups at runtime, but in a different way.]
5379    
5380        Then, if the quantifier was possessive and the bracket is not a        Then, if the quantifier was possessive and the bracket is not a
5381        conditional, we convert the BRA code to the POS form, and the KET code to        conditional, we convert the BRA code to the POS form, and the KET code to
5382        KETRPOS. (It turns out to be convenient at runtime to detect this kind of        KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5383        subpattern at both the start and at the end.) The use of special opcodes        subpattern at both the start and at the end.) The use of special opcodes
5384        makes it possible to reduce greatly the stack usage in pcre_exec(). If        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5385        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5386    
5387        Then, if the minimum number of matches is 1 or 0, cancel the possessive        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5388        flag so that the default action below, of wrapping everything inside        flag so that the default action below, of wrapping everything inside
5389        atomic brackets, does not happen. When the minimum is greater than 1,        atomic brackets, does not happen. When the minimum is greater than 1,
5390        there will be earlier copies of the group, and so we still have to wrap        there will be earlier copies of the group, and so we still have to wrap
5391        the whole thing. */        the whole thing. */
5392    
5393        else        else
5394          {          {
5395          uschar *ketcode = code - 1 - LINK_SIZE;          pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5396          uschar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5397    
5398          /* Convert possessive ONCE brackets to non-capturing */          /* Convert possessive ONCE brackets to non-capturing */
5399    
5400          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5401              possessive_quantifier) *bracode = OP_BRA;              possessive_quantifier) *bracode = OP_BRA;
5402    
5403          /* For non-possessive ONCE brackets, all we need to do is to          /* For non-possessive ONCE brackets, all we need to do is to
5404          set the KET. */          set the KET. */
5405    
5406          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5407            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5408    
5409          /* Handle non-ONCE brackets and possessive ONCEs (which have been          /* Handle non-ONCE brackets and possessive ONCEs (which have been
5410          converted to non-capturing above). */          converted to non-capturing above). */
5411    
5412          else          else
5413            {            {
5414            /* In the compile phase, check for empty string matching. */            /* In the compile phase, check for empty string matching. */
5415    
5416            if (lengthptr == NULL)            if (lengthptr == NULL)
5417              {              {
5418              uschar *scode = bracode;              pcre_uchar *scode = bracode;
5419              do              do
5420                {                {
5421                if (could_be_empty_branch(scode, ketcode, utf8, cd))                if (could_be_empty_branch(scode, ketcode, utf, cd))
5422                  {                  {
5423                  *bracode += OP_SBRA - OP_BRA;                  *bracode += OP_SBRA - OP_BRA;
5424                  break;                  break;
# Line 5126  for (;; ptr++) Line 5427  for (;; ptr++)
5427                }                }
5428              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5429              }              }
5430    
5431            /* Handle possessive quantifiers. */            /* Handle possessive quantifiers. */
5432    
5433            if (possessive_quantifier)            if (possessive_quantifier)
# Line 5135  for (;; ptr++) Line 5436  for (;; ptr++)
5436              repeated non-capturing bracket, because we have not invented POS              repeated non-capturing bracket, because we have not invented POS
5437              versions of the COND opcodes. Because we are moving code along, we              versions of the COND opcodes. Because we are moving code along, we
5438              must ensure that any pending recursive references are updated. */              must ensure that any pending recursive references are updated. */
5439    
5440              if (*bracode == OP_COND || *bracode == OP_SCOND)              if (*bracode == OP_COND || *bracode == OP_SCOND)
5441                {                {
5442                int nlen = (int)(code - bracode);                int nlen = (int)(code - bracode);
5443                *code = OP_END;                *code = OP_END;
5444                adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm);                adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5445                memmove(bracode + 1+LINK_SIZE, bracode, nlen);                memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5446                code += 1 + LINK_SIZE;                code += 1 + LINK_SIZE;
5447                nlen += 1 + LINK_SIZE;                nlen += 1 + LINK_SIZE;
5448                *bracode = OP_BRAPOS;                *bracode = OP_BRAPOS;
5449                *code++ = OP_KETRPOS;                *code++ = OP_KETRPOS;
5450                PUTINC(code, 0, nlen);                PUTINC(code, 0, nlen);
5451                PUT(bracode, 1, nlen);                PUT(bracode, 1, nlen);
5452                }                }
5453    
5454              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5455    
5456              else              else
5457                {                {
5458                *bracode += 1;              /* Switch to xxxPOS opcodes */                *bracode += 1;              /* Switch to xxxPOS opcodes */
5459                *ketcode = OP_KETRPOS;                *ketcode = OP_KETRPOS;
5460                }                }
5461    
5462              /* If the minimum is zero, mark it as possessive, then unset the              /* If the minimum is zero, mark it as possessive, then unset the
5463              possessive flag when the minimum is 0 or 1. */              possessive flag when the minimum is 0 or 1. */
5464    
5465              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5466              if (repeat_min < 2) possessive_quantifier = FALSE;              if (repeat_min < 2) possessive_quantifier = FALSE;
5467              }              }
5468    
5469            /* Non-possessive quantifier */            /* Non-possessive quantifier */
5470    
5471            else *ketcode = OP_KETRMAX + repeat_type;            else *ketcode = OP_KETRMAX + repeat_type;
5472            }            }
5473          }          }
# Line 5210  for (;; ptr++) Line 5511  for (;; ptr++)
5511        int len;        int len;
5512    
5513        if (*tempcode == OP_TYPEEXACT)        if (*tempcode == OP_TYPEEXACT)
5514          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
5515            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);            ((tempcode[1 + IMM2_SIZE] == OP_PROP
5516              || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5517    
5518        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5519          {          {
5520          tempcode += _pcre_OP_lengths[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
5521  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5522          if (utf8 && tempcode[-1] >= 0xc0)          if (utf && HAS_EXTRALEN(tempcode[-1]))
5523            tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];            tempcode += GET_EXTRALEN(tempcode[-1]);
5524  #endif  #endif
5525          }          }
5526    
# Line 5255  for (;; ptr++) Line 5557  for (;; ptr++)
5557    
5558          default:          default:
5559          *code = OP_END;          *code = OP_END;
5560          adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5561          memmove(tempcode + 1+LINK_SIZE, tempcode, len);          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5562          code += 1 + LINK_SIZE;          code += 1 + LINK_SIZE;
5563          len += 1 + LINK_SIZE;          len += 1 + LINK_SIZE;
5564          tempcode[0] = OP_ONCE;          tempcode[0] = OP_ONCE;
# Line 5268  for (;; ptr++) Line 5570  for (;; ptr++)
5570        }        }
5571    
5572      /* In all case we no longer have a previous item. We also set the      /* In all case we no longer have a previous item. We also set the
5573      "follows varying string" flag for subsequently encountered reqbytes if      "follows varying string" flag for subsequently encountered reqchars if
5574      it isn't already set and we have just passed a varying length item. */      it isn't already set and we have just passed a varying length item. */
5575    
5576      END_REPEAT:      END_REPEAT:
# Line 5291  for (;; ptr++) Line 5593  for (;; ptr++)
5593    
5594      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
5595    
5596      if (*(++ptr) == CHAR_ASTERISK &&      ptr++;
5597           ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5598             || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5599        {        {
5600        int i, namelen;        int i, namelen;
5601        int arglen = 0;        int arglen = 0;
5602        const char *vn = verbnames;        const char *vn = verbnames;
5603        const uschar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5604        const uschar *arg = NULL;        const pcre_uchar *arg = NULL;
5605        previous = NULL;        previous = NULL;
5606        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        ptr++;
5607          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5608        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5609    
5610        /* It appears that Perl allows any characters whatsoever, other than        /* It appears that Perl allows any characters whatsoever, other than
# Line 5312  for (;; ptr++) Line 5616  for (;; ptr++)
5616          arg = ++ptr;          arg = ++ptr;
5617          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5618          arglen = (int)(ptr - arg);          arglen = (int)(ptr - arg);
5619            if (arglen > (int)MAX_MARK)
5620              {
5621              *errorcodeptr = ERR75;
5622              goto FAILED;
5623              }
5624          }          }
5625    
5626        if (*ptr != CHAR_RIGHT_PARENTHESIS)        if (*ptr != CHAR_RIGHT_PARENTHESIS)
# Line 5325  for (;; ptr++) Line 5634  for (;; ptr++)
5634        for (i = 0; i < verbcount; i++)        for (i = 0; i < verbcount; i++)
5635          {          {
5636          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
5637              strncmp((char *)name, vn, namelen) == 0)              STRNCMP_UC_C8(name, vn, namelen) == 0)
5638            {            {
5639            /* Check for open captures before ACCEPT and convert it to            /* Check for open captures before ACCEPT and convert it to
5640            ASSERT_ACCEPT if in an assertion. */            ASSERT_ACCEPT if in an assertion. */
# Line 5346  for (;; ptr++) Line 5655  for (;; ptr++)
5655                }                }
5656              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5657    
5658              /* Do not set firstbyte after *ACCEPT */              /* Do not set firstchar after *ACCEPT */
5659              if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5660              }              }
5661    
5662            /* Handle other cases with/without an argument */            /* Handle other cases with/without an argument */
# Line 5373  for (;; ptr++) Line 5682  for (;; ptr++)
5682              *code = verbs[i].op_arg;              *code = verbs[i].op_arg;
5683              if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;              if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
5684              *code++ = arglen;              *code++ = arglen;
5685              memcpy(code, arg, arglen);              memcpy(code, arg, IN_UCHARS(arglen));
5686              code += arglen;              code += arglen;
5687              *code++ = 0;              *code++ = 0;
5688              }              }
# Line 5396  for (;; ptr++) Line 5705  for (;; ptr++)
5705        {        {
5706        int i, set, unset, namelen;        int i, set, unset, namelen;
5707        int *optset;        int *optset;
5708        const uschar *name;        const pcre_uchar *name;
5709        uschar *slot;        pcre_uchar *slot;
5710    
5711        switch (*(++ptr))        switch (*(++ptr))
5712          {          {
# Line 5450  for (;; ptr++) Line 5759  for (;; ptr++)
5759            break;            break;
5760    
5761          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Most other conditions use OP_CREF (a couple change to OP_RREF
5762          below), and all need to skip 3 bytes at the start of the group. */          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5763    
5764          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
5765          skipbytes = 3;          skipbytes = 1+IMM2_SIZE;
5766          refsign = -1;          refsign = -1;
5767    
5768          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
# Line 5486  for (;; ptr++) Line 5795  for (;; ptr++)
5795    
5796          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
5797    
5798          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5799            {            {
5800            ptr += 1;  /* To get the right offset */            ptr += 1;  /* To get the right offset */
5801            *errorcodeptr = ERR28;            *errorcodeptr = ERR28;
# Line 5497  for (;; ptr++) Line 5806  for (;; ptr++)
5806    
5807          recno = 0;          recno = 0;
5808          name = ++ptr;          name = ++ptr;
5809          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5810            {            {
5811            if (recno >= 0)            if (recno >= 0)
5812              recno = ((digitab[*ptr] & ctype_digit) != 0)?              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
               recno * 10 + *ptr - CHAR_0 : -1;  
5813            ptr++;            ptr++;
5814            }            }
5815          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
# Line 5549  for (;; ptr++) Line 5857  for (;; ptr++)
5857          slot = cd->name_table;          slot = cd->name_table;
5858          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
5859            {            {
5860            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5861            slot += cd->name_entry_size;            slot += cd->name_entry_size;
5862            }            }
5863    
# Line 5565  for (;; ptr++) Line 5873  for (;; ptr++)
5873          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
5874    
5875          else if ((i = find_parens(cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
5876                          (options & PCRE_EXTENDED) != 0, utf8)) > 0)                          (options & PCRE_EXTENDED) != 0, utf)) > 0)
5877            {            {
5878            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
5879            code[1+LINK_SIZE]++;            code[1+LINK_SIZE]++;
# Line 5591  for (;; ptr++) Line 5899  for (;; ptr++)
5899            recno = 0;            recno = 0;
5900            for (i = 1; i < namelen; i++)            for (i = 1; i < namelen; i++)
5901              {              {
5902              if ((digitab[name[i]] & ctype_digit) == 0)              if (!IS_DIGIT(name[i]))
5903                {                {
5904                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
5905                goto FAILED;                goto FAILED;
# Line 5606  for (;; ptr++) Line 5914  for (;; ptr++)
5914          /* Similarly, check for the (?(DEFINE) "condition", which is always          /* Similarly, check for the (?(DEFINE) "condition", which is always
5915          false. */          false. */
5916    
5917          else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)          else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5918            {            {
5919            code[1+LINK_SIZE] = OP_DEF;            code[1+LINK_SIZE] = OP_DEF;
5920            skipbytes = 1;            skipbytes = 1;
# Line 5669  for (;; ptr++) Line 5977  for (;; ptr++)
5977            break;            break;
5978    
5979            default:                /* Could be name define, else bad */            default:                /* Could be name define, else bad */
5980            if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;            if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5981                goto DEFINE_NAME;
5982            ptr++;                  /* Correct offset for error */            ptr++;                  /* Correct offset for error */
5983            *errorcodeptr = ERR24;            *errorcodeptr = ERR24;
5984            goto FAILED;            goto FAILED;
# Line 5691  for (;; ptr++) Line 6000  for (;; ptr++)
6000          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
6001            {            {
6002            int n = 0;            int n = 0;
6003            while ((digitab[*(++ptr)] & ctype_digit) != 0)            ptr++;
6004              n = n * 10 + *ptr - CHAR_0;            while(IS_DIGIT(*ptr))
6005                n = n * 10 + *ptr++ - CHAR_0;
6006            if (*ptr != CHAR_RIGHT_PARENTHESIS)            if (*ptr != CHAR_RIGHT_PARENTHESIS)
6007              {              {
6008              *errorcodeptr = ERR39;              *errorcodeptr = ERR39;
# Line 5737  for (;; ptr++) Line 6047  for (;; ptr++)
6047              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6048            name = ++ptr;            name = ++ptr;
6049    
6050            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6051            namelen = (int)(ptr - name);            namelen = (int)(ptr - name);
6052    
6053            /* In the pre-compile phase, just do a syntax check. */            /* In the pre-compile phase, just do a syntax check. */
# Line 5754  for (;; ptr++) Line 6064  for (;; ptr++)
6064                *errorcodeptr = ERR49;                *errorcodeptr = ERR49;
6065                goto FAILED;                goto FAILED;
6066                }                }
6067              if (namelen + 3 > cd->name_entry_size)              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6068                {                {
6069                cd->name_entry_size = namelen + 3;                cd->name_entry_size = namelen + IMM2_SIZE + 1;
6070                if (namelen > MAX_NAME_SIZE)                if (namelen > MAX_NAME_SIZE)
6071                  {                  {
6072                  *errorcodeptr = ERR48;                  *errorcodeptr = ERR48;
# Line 5785  for (;; ptr++) Line 6095  for (;; ptr++)
6095    
6096              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
6097                {                {
6098                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
6099                if (crc == 0)                if (crc == 0)
6100                  {                  {
6101                  if (slot[2+namelen] == 0)                  if (slot[IMM2_SIZE+namelen] == 0)
6102                    {                    {
6103                    if (GET2(slot, 0) != cd->bracount + 1 &&                    if (GET2(slot, 0) != cd->bracount + 1 &&
6104                        (options & PCRE_DUPNAMES) == 0)                        (options & PCRE_DUPNAMES) == 0)
# Line 5809  for (;; ptr++) Line 6119  for (;; ptr++)
6119                if (crc < 0)                if (crc < 0)
6120                  {                  {
6121                  memmove(slot + cd->name_entry_size, slot,                  memmove(slot + cd->name_entry_size, slot,
6122                    (cd->names_found - i) * cd->name_entry_size);                    IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
6123                  break;                  break;
6124                  }                  }
6125    
# Line 5823  for (;; ptr++) Line 6133  for (;; ptr++)
6133    
6134              if (!dupname)              if (!dupname)
6135                {                {
6136                uschar *cslot = cd->name_table;                pcre_uchar *cslot = cd->name_table;
6137                for (i = 0; i < cd->names_found; i++)                for (i = 0; i < cd->names_found; i++)
6138                  {                  {
6139                  if (cslot != slot)                  if (cslot != slot)
# Line 5840  for (;; ptr++) Line 6150  for (;; ptr++)
6150                }                }
6151    
6152              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
6153              memcpy(slot + 2, name, namelen);              memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
6154              slot[2+namelen] = 0;              slot[IMM2_SIZE + namelen] = 0;
6155              }              }
6156            }            }
6157    
# Line 5867  for (;; ptr++) Line 6177  for (;; ptr++)
6177    
6178          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
6179          name = ++ptr;          name = ++ptr;
6180          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6181          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6182    
6183          /* In the pre-compile phase, do a syntax check. We used to just set          /* In the pre-compile phase, do a syntax check. We used to just set
# Line 5879  for (;; ptr++) Line 6189  for (;; ptr++)
6189    
6190          if (lengthptr != NULL)          if (lengthptr != NULL)
6191            {            {
6192            const uschar *temp;            const pcre_uchar *temp;
6193    
6194            if (namelen == 0)            if (namelen == 0)
6195              {              {
# Line 5909  for (;; ptr++) Line 6219  for (;; ptr++)
6219            temp = cd->end_pattern;            temp = cd->end_pattern;
6220            cd->end_pattern = ptr;            cd->end_pattern = ptr;
6221            recno = find_parens(cd, name, namelen,            recno = find_parens(cd, name, namelen,
6222              (options & PCRE_EXTENDED) != 0, utf8);              (options & PCRE_EXTENDED) != 0, utf);
6223            cd->end_pattern = temp;            cd->end_pattern = temp;
6224            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
6225            }            }
# Line 5924  for (;; ptr++) Line 6234  for (;; ptr++)
6234            slot = cd->name_table;            slot = cd->name_table;
6235            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
6236              {              {
6237              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6238                  slot[2+namelen] == 0)                  slot[IMM2_SIZE+namelen] == 0)
6239                break;                break;
6240              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6241              }              }
# Line 5936  for (;; ptr++) Line 6246  for (;; ptr++)
6246              }              }
6247            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
6248                      find_parens(cd, name, namelen,                      find_parens(cd, name, namelen,
6249                        (options & PCRE_EXTENDED) != 0, utf8)) <= 0)                        (options & PCRE_EXTENDED) != 0, utf)) <= 0)
6250              {              {
6251              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
6252              goto FAILED;              goto FAILED;
# Line 5961  for (;; ptr++) Line 6271  for (;; ptr++)
6271          case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:          case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6272          case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:          case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6273            {            {
6274            const uschar *called;            const pcre_uchar *called;
6275            terminator = CHAR_RIGHT_PARENTHESIS;            terminator = CHAR_RIGHT_PARENTHESIS;
6276    
6277            /* Come here from the \g<...> and \g'...' code (Oniguruma            /* Come here from the \g<...> and \g'...' code (Oniguruma
# Line 5975  for (;; ptr++) Line 6285  for (;; ptr++)
6285            if ((refsign = *ptr) == CHAR_PLUS)            if ((refsign = *ptr) == CHAR_PLUS)
6286              {              {
6287              ptr++;              ptr++;
6288              if ((digitab[*ptr] & ctype_digit) == 0)              if (!IS_DIGIT(*ptr))
6289                {                {
6290                *errorcodeptr = ERR63;                *errorcodeptr = ERR63;
6291                goto FAILED;                goto FAILED;
# Line 5983  for (;; ptr++) Line 6293  for (;; ptr++)
6293              }              }
6294            else if (refsign == CHAR_MINUS)            else if (refsign == CHAR_MINUS)
6295              {              {
6296              if ((digitab[ptr[1]] & ctype_digit) == 0)              if (!IS_DIGIT(ptr[1]))
6297                goto OTHER_CHAR_AFTER_QUERY;                goto OTHER_CHAR_AFTER_QUERY;
6298              ptr++;              ptr++;
6299              }              }
6300    
6301            recno = 0;            recno = 0;
6302            while((digitab[*ptr] & ctype_digit) != 0)            while(IS_DIGIT(*ptr))
6303              recno = recno * 10 + *ptr++ - CHAR_0;              recno = recno * 10 + *ptr++ - CHAR_0;
6304    
6305            if (*ptr != terminator)            if (*ptr != terminator)
# Line 6040  for (;; ptr++) Line 6350  for (;; ptr++)
6350              {              {
6351              *code = OP_END;              *code = OP_END;
6352              if (recno != 0)              if (recno != 0)
6353                called = _pcre_find_bracket(cd->start_code, utf8, recno);                called = PRIV(find_bracket)(cd->start_code, utf, recno);
6354    
6355              /* Forward reference */              /* Forward reference */
6356    
6357              if (called == NULL)              if (called == NULL)
6358                {                {
6359                if (find_parens(cd, NULL, recno,                if (find_parens(cd, NULL, recno,
6360                      (options & PCRE_EXTENDED) != 0, utf8) < 0)                      (options & PCRE_EXTENDED) != 0, utf) < 0)
6361                  {                  {
6362                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
6363                  goto FAILED;                  goto FAILED;
# Line 6056  for (;; ptr++) Line 6366  for (;; ptr++)
6366                /* Fudge the value of "called" so that when it is inserted as an                /* Fudge the value of "called" so that when it is inserted as an
6367                offset below, what is actually inserted is the reference number                offset below, what is actually inserted is the reference number
6368                of the group. Then remember the forward reference. */                of the group. Then remember the forward reference. */
6369    
6370                called = cd->start_code + recno;                called = cd->start_code + recno;
6371                if (cd->hwm >= cd->start_workspace + cd->workspace_size -                if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6372                    WORK_SIZE_SAFETY_MARGIN)                    WORK_SIZE_SAFETY_MARGIN)
6373                  {                  {
6374                  *errorcodeptr = expand_workspace(cd);                  *errorcodeptr = expand_workspace(cd);
6375                  if (*errorcodeptr != 0) goto FAILED;                  if (*errorcodeptr != 0) goto FAILED;
6376                  }                  }
6377                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6378                }                }
6379    
# Line 6077  for (;; ptr++) Line 6387  for (;; ptr++)
6387              conditional subpatterns will be picked up then. */              conditional subpatterns will be picked up then. */
6388    
6389              else if (GET(called, 1) == 0 && cond_depth <= 0 &&              else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6390                       could_be_empty(called, code, bcptr, utf8, cd))                       could_be_empty(called, code, bcptr, utf, cd))
6391                {                {
6392                *errorcodeptr = ERR40;                *errorcodeptr = ERR40;
6393                goto FAILED;                goto FAILED;
# Line 6085  for (;; ptr++) Line 6395  for (;; ptr++)
6395              }              }
6396    
6397            /* Insert the recursion/subroutine item. It does not have a set first            /* Insert the recursion/subroutine item. It does not have a set first
6398            byte (relevant if it is repeated, because it will then be wrapped            character (relevant if it is repeated, because it will then be
6399            with ONCE brackets). */            wrapped with ONCE brackets). */
6400    
6401            *code = OP_RECURSE;            *code = OP_RECURSE;
6402            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
6403            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
6404            groupsetfirstbyte = FALSE;            groupsetfirstchar = FALSE;
6405            }            }
6406    
6407          /* Can't determine a first byte now */          /* Can't determine a first character now */
6408    
6409          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6410          continue;          continue;
6411    
6412    
# Line 6153  for (;; ptr++) Line 6463  for (;; ptr++)
6463          both phases.          both phases.
6464    
6465          If we are not at the pattern start, reset the greedy defaults and the          If we are not at the pattern start, reset the greedy defaults and the
6466          case value for firstbyte and reqbyte. */          case value for firstchar and reqchar. */
6467    
6468          if (*ptr == CHAR_RIGHT_PARENTHESIS)          if (*ptr == CHAR_RIGHT_PARENTHESIS)
6469            {            {
# Line 6166  for (;; ptr++) Line 6476  for (;; ptr++)
6476              {              {
6477              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6478              greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
6479              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6480              }              }
6481    
6482            /* Change options at this level, and pass them back for use            /* Change options at this level, and pass them back for use
# Line 6203  for (;; ptr++) Line 6513  for (;; ptr++)
6513        NUMBERED_GROUP:        NUMBERED_GROUP:
6514        cd->bracount += 1;        cd->bracount += 1;
6515        PUT2(code, 1+LINK_SIZE, cd->bracount);        PUT2(code, 1+LINK_SIZE, cd->bracount);
6516        skipbytes = 2;        skipbytes = IMM2_SIZE;
6517        }        }
6518    
6519      /* Process nested bracketed regex. Assertions used not to be repeatable,      /* Process nested bracketed regex. Assertions used not to be repeatable,
# Line 6229  for (;; ptr++) Line 6539  for (;; ptr++)
6539           skipbytes,                       /* Skip over bracket number */           skipbytes,                       /* Skip over bracket number */
6540           cond_depth +           cond_depth +
6541             ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */             ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
6542           &subfirstbyte,                   /* For possible first char */           &subfirstchar,                   /* For possible first char */
6543           &subreqbyte,                     /* For possible last char */           &subreqchar,                     /* For possible last char */
6544           bcptr,                           /* Current branch chain */           bcptr,                           /* Current branch chain */
6545           cd,                              /* Tables block */           cd,                              /* Tables block */
6546           (lengthptr == NULL)? NULL :      /* Actual compile phase */           (lengthptr == NULL)? NULL :      /* Actual compile phase */
# Line 6258  for (;; ptr++) Line 6568  for (;; ptr++)
6568    
6569      if (bravalue == OP_COND && lengthptr == NULL)      if (bravalue == OP_COND && lengthptr == NULL)
6570        {        {
6571        uschar *tc = code;        pcre_uchar *tc = code;
6572        int condcount = 0;        int condcount = 0;
6573    
6574        do {        do {
# Line 6281  for (;; ptr++) Line 6591  for (;; ptr++)
6591          }          }
6592    
6593        /* A "normal" conditional group. If there is just one branch, we must not        /* A "normal" conditional group. If there is just one branch, we must not
6594        make use of its firstbyte or reqbyte, because this is equivalent to an        make use of its firstchar or reqchar, because this is equivalent to an
6595        empty second branch. */        empty second branch. */
6596