/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 640 by ph10, Mon Jul 25 10:50:28 2011 UTC | revision 1041 by ph10, Sun Sep 16 10:16:27 2012 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2011 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
# Line 88  so this number is very generous. Line 92  so this number is very generous.
92  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
93  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
94  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
96    filled up by repetitions of forward references, for example patterns like
97    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98    that the workspace is expanded using malloc() in this situation. The value
99    below is therefore a minimum, and we put a maximum on it for safety. The
100    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101    kicks in at the same number of forward references in all cases. */
102    
103  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
104    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
105    
106  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
107  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
108    
109  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
110    
111    /* Private flags added to firstchar and reqchar. */
112    
113    #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
114    #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
115    
116    /* Repeated character flags. */
117    
118    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
119    
120  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
121  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 231  static const char posix_names[] = Line 250  static const char posix_names[] =
250    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
251    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
252    
253  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
254    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
255    
256  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
# Line 266  substitutes must be in the order of the Line 285  substitutes must be in the order of the
285  both positive and negative cases. NULL means no substitute. */  both positive and negative cases. NULL means no substitute. */
286    
287  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
288  static const uschar *substitutes[] = {  static const pcre_uchar string_PNd[]  = {
289    (uschar *)"\\P{Nd}",    /* \D */    CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
290    (uschar *)"\\p{Nd}",    /* \d */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
291    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */  static const pcre_uchar string_pNd[]  = {
292    (uschar *)"\\p{Xsp}",   /* \s */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
293    (uschar *)"\\P{Xwd}",   /* \W */    CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294    (uschar *)"\\p{Xwd}"    /* \w */  static const pcre_uchar string_PXsp[] = {
295      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
296      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297    static const pcre_uchar string_pXsp[] = {
298      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300    static const pcre_uchar string_PXwd[] = {
301      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
302      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303    static const pcre_uchar string_pXwd[] = {
304      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    
307    static const pcre_uchar *substitutes[] = {
308      string_PNd,           /* \D */
309      string_pNd,           /* \d */
310      string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
311      string_pXsp,          /* \s */
312      string_PXwd,          /* \W */
313      string_pXwd           /* \w */
314  };  };
315    
316  static const uschar *posix_substitutes[] = {  static const pcre_uchar string_pL[] =   {
317    (uschar *)"\\p{L}",     /* alpha */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
318    (uschar *)"\\p{Ll}",    /* lower */    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319    (uschar *)"\\p{Lu}",    /* upper */  static const pcre_uchar string_pLl[] =  {
320    (uschar *)"\\p{Xan}",   /* alnum */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321    NULL,                   /* ascii */    CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322    (uschar *)"\\h",        /* blank */  static const pcre_uchar string_pLu[] =  {
323    NULL,                   /* cntrl */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324    (uschar *)"\\p{Nd}",    /* digit */    CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325    NULL,                   /* graph */  static const pcre_uchar string_pXan[] = {
326    NULL,                   /* print */    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327    NULL,                   /* punct */    CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */  static const pcre_uchar string_h[] =    {
329    (uschar *)"\\p{Xwd}",   /* word */    CHAR_BACKSLASH, CHAR_h, '\0' };
330    NULL,                   /* xdigit */  static const pcre_uchar string_pXps[] = {
331      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333    static const pcre_uchar string_PL[] =   {
334      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
335      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336    static const pcre_uchar string_PLl[] =  {
337      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339    static const pcre_uchar string_PLu[] =  {
340      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342    static const pcre_uchar string_PXan[] = {
343      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345    static const pcre_uchar string_H[] =    {
346      CHAR_BACKSLASH, CHAR_H, '\0' };
347    static const pcre_uchar string_PXps[] = {
348      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
349      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350    
351    static const pcre_uchar *posix_substitutes[] = {
352      string_pL,            /* alpha */
353      string_pLl,           /* lower */
354      string_pLu,           /* upper */
355      string_pXan,          /* alnum */
356      NULL,                 /* ascii */
357      string_h,             /* blank */
358      NULL,                 /* cntrl */
359      string_pNd,           /* digit */
360      NULL,                 /* graph */
361      NULL,                 /* print */
362      NULL,                 /* punct */
363      string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
364      string_pXwd,          /* word */
365      NULL,                 /* xdigit */
366    /* Negated cases */    /* Negated cases */
367    (uschar *)"\\P{L}",     /* ^alpha */    string_PL,            /* ^alpha */
368    (uschar *)"\\P{Ll}",    /* ^lower */    string_PLl,           /* ^lower */
369    (uschar *)"\\P{Lu}",    /* ^upper */    string_PLu,           /* ^upper */
370    (uschar *)"\\P{Xan}",   /* ^alnum */    string_PXan,          /* ^alnum */
371    NULL,                   /* ^ascii */    NULL,                 /* ^ascii */
372    (uschar *)"\\H",        /* ^blank */    string_H,             /* ^blank */
373    NULL,                   /* ^cntrl */    NULL,                 /* ^cntrl */
374    (uschar *)"\\P{Nd}",    /* ^digit */    string_PNd,           /* ^digit */
375    NULL,                   /* ^graph */    NULL,                 /* ^graph */
376    NULL,                   /* ^print */    NULL,                 /* ^print */
377    NULL,                   /* ^punct */    NULL,                 /* ^punct */
378    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
379    (uschar *)"\\P{Xwd}",   /* ^word */    string_PXwd,          /* ^word */
380    NULL                    /* ^xdigit */    NULL                  /* ^xdigit */
381  };  };
382  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
383  #endif  #endif
384    
385  #define STRING(a)  # a  #define STRING(a)  # a
# Line 365  static const char error_texts[] = Line 438  static const char error_texts[] =
438    /* 30 */    /* 30 */
439    "unknown POSIX class name\0"    "unknown POSIX class name\0"
440    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
441    "this version of PCRE is not compiled with PCRE_UTF8 support\0"    "this version of PCRE is compiled without UTF support\0"
442    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
443    "character value in \\x{...} sequence is too large\0"    "character value in \\x{...} sequence is too large\0"
444    /* 35 */    /* 35 */
# Line 388  static const char error_texts[] = Line 461  static const char error_texts[] =
461    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
462    /* 50 */    /* 50 */
463    "repeated subpattern is too long\0"    /** DEAD **/    "repeated subpattern is too long\0"    /** DEAD **/
464    "octal value is greater than \\377 (not in UTF-8 mode)\0"    "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
465    "internal error: overran compiling workspace\0"    "internal error: overran compiling workspace\0"
466    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
467    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
# Line 407  static const char error_texts[] = Line 480  static const char error_texts[] =
480    /* 65 */    /* 65 */
481    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
482    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
483    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with Unicode property support\0"
484    "\\c must be followed by an ASCII character\0"    "\\c must be followed by an ASCII character\0"
485    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486      /* 70 */
487      "internal error: unknown opcode in find_fixedlength()\0"
488      "\\N is not supported in a class\0"
489      "too many forward references\0"
490      "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
491      "invalid UTF-16 string\0"
492      /* 75 */
493      "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
494      "character value in \\u.... sequence is too large\0"
495    ;    ;
496    
497  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 428  For convenience, we use the same bit def Line 510  For convenience, we use the same bit def
510    
511  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
512    
513    /* Using a simple comparison for decimal numbers rather than a memory read
514    is much faster, and the resulting code is simpler (the compiler turns it
515    into a subtraction and unsigned comparison). */
516    
517    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
518    
519  #ifndef EBCDIC  #ifndef EBCDIC
520    
521  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
522  UTF-8 mode. */  UTF-8 mode. */
523    
524  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
525    {    {
526    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
527    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 472  static const unsigned char digitab[] = Line 560  static const unsigned char digitab[] =
560    
561  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
562    
563  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
564    {    {
565    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
566    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 507  static const unsigned char digitab[] = Line 595  static const unsigned char digitab[] =
595    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
596    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
597    
598  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
599    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
600    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
601    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 546  static const unsigned char ebcdic_charta Line 634  static const unsigned char ebcdic_charta
634  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
635    
636  static BOOL  static BOOL
637    compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int *,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
638      int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
639    
640    
641    
# Line 578  return s; Line 666  return s;
666    
667    
668  /*************************************************  /*************************************************
669    *           Expand the workspace                 *
670    *************************************************/
671    
672    /* This function is called during the second compiling phase, if the number of
673    forward references fills the existing workspace, which is originally a block on
674    the stack. A larger block is obtained from malloc() unless the ultimate limit
675    has been reached or the increase will be rather small.
676    
677    Argument: pointer to the compile data block
678    Returns:  0 if all went well, else an error number
679    */
680    
681    static int
682    expand_workspace(compile_data *cd)
683    {
684    pcre_uchar *newspace;
685    int newsize = cd->workspace_size * 2;
686    
687    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
688    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
689        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
690     return ERR72;
691    
692    newspace = (PUBL(malloc))(IN_UCHARS(newsize));
693    if (newspace == NULL) return ERR21;
694    memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
695    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
696    if (cd->workspace_size > COMPILE_WORK_SIZE)
697      (PUBL(free))((void *)cd->start_workspace);
698    cd->start_workspace = newspace;
699    cd->workspace_size = newsize;
700    return 0;
701    }
702    
703    
704    
705    /*************************************************
706  *            Check for counted repeat            *  *            Check for counted repeat            *
707  *************************************************/  *************************************************/
708    
# Line 593  Returns:    TRUE or FALSE Line 718  Returns:    TRUE or FALSE
718  */  */
719    
720  static BOOL  static BOOL
721  is_counted_repeat(const uschar *p)  is_counted_repeat(const pcre_uchar *p)
722  {  {
723  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
724  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
725    while (IS_DIGIT(*p)) p++;
726  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
727    
728  if (*p++ != CHAR_COMMA) return FALSE;  if (*p++ != CHAR_COMMA) return FALSE;
729  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
730    
731  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
732  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
733    while (IS_DIGIT(*p)) p++;
734    
735  return (*p == CHAR_RIGHT_CURLY_BRACKET);  return (*p == CHAR_RIGHT_CURLY_BRACKET);
736  }  }
# Line 635  Returns:         zero or positive => a d Line 762  Returns:         zero or positive => a d
762  */  */
763    
764  static int  static int
765  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
766    int options, BOOL isclass)    int options, BOOL isclass)
767  {  {
768  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
769  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
770  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
771    pcre_int32 c;
772    int i;
773    
774  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
775  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
# Line 654  in a table. A non-zero result is somethi Line 783  in a table. A non-zero result is somethi
783  Otherwise further processing may be required. */  Otherwise further processing may be required. */
784    
785  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
786  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
787    else if (c < CHAR_0 || c > CHAR_z) {}
788  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
789    
790  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
791  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
792    else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
793  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
794  #endif  #endif
795    
# Line 666  else if ((i = escapes[c - 0x48]) != 0) Line 797  else if ((i = escapes[c - 0x48]) != 0)
797    
798  else  else
799    {    {
800    const uschar *oldptr;    const pcre_uchar *oldptr;
801    BOOL braced, negated;    BOOL braced, negated;
802    
803    switch (c)    switch (c)
# Line 676  else Line 807  else
807    
808      case CHAR_l:      case CHAR_l:
809      case CHAR_L:      case CHAR_L:
810        *errorcodeptr = ERR37;
811        break;
812    
813      case CHAR_u:      case CHAR_u:
814        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
815          {
816          /* In JavaScript, \u must be followed by four hexadecimal numbers.
817          Otherwise it is a lowercase u letter. */
818          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
819            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
820            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
821            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
822            {
823            c = 0;
824            for (i = 0; i < 4; ++i)
825              {
826              register int cc = *(++ptr);
827    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
828              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
829              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
830    #else           /* EBCDIC coding */
831              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
832              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
833    #endif
834              }
835    
836    #ifdef COMPILE_PCRE8
837            if (c > (utf ? 0x10ffff : 0xff))
838    #else
839    #ifdef COMPILE_PCRE16
840            if (c > (utf ? 0x10ffff : 0xffff))
841    #endif
842    #endif
843              {
844              *errorcodeptr = ERR76;
845              }
846            else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
847            }
848          }
849        else
850          *errorcodeptr = ERR37;
851        break;
852    
853      case CHAR_U:      case CHAR_U:
854      *errorcodeptr = ERR37;      /* In JavaScript, \U is an uppercase U letter. */
855        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
856      break;      break;
857    
858      /* In a character class, \g is just a literal "g". Outside a character      /* In a character class, \g is just a literal "g". Outside a character
859      class, \g must be followed by one of a number of specific things:      class, \g must be followed by one of a number of specific things:
860    
861      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
# Line 710  else Line 884  else
884    
885      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
886        {        {
887        const uschar *p;        const pcre_uchar *p;
888        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
889          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
890        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
891          {          {
892          c = -ESC_k;          c = -ESC_k;
# Line 730  else Line 904  else
904        }        }
905      else negated = FALSE;      else negated = FALSE;
906    
907        /* The integer range is limited by the machine's int representation. */
908      c = 0;      c = 0;
909      while ((digitab[ptr[1]] & ctype_digit) != 0)      while (IS_DIGIT(ptr[1]))
910          {
911          if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
912            {
913            c = -1;
914            break;
915            }
916        c = c * 10 + *(++ptr) - CHAR_0;        c = c * 10 + *(++ptr) - CHAR_0;
917          }
918      if (c < 0)   /* Integer overflow */      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
919        {        {
920          while (IS_DIGIT(ptr[1]))
921            ptr++;
922        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
923        break;        break;
924        }        }
# Line 783  else Line 966  else
966      if (!isclass)      if (!isclass)
967        {        {
968        oldptr = ptr;        oldptr = ptr;
969          /* The integer range is limited by the machine's int representation. */
970        c -= CHAR_0;        c -= CHAR_0;
971        while ((digitab[ptr[1]] & ctype_digit) != 0)        while (IS_DIGIT(ptr[1]))
972            {
973            if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
974              {
975              c = -1;
976              break;
977              }
978          c = c * 10 + *(++ptr) - CHAR_0;          c = c * 10 + *(++ptr) - CHAR_0;
979        if (c < 0)    /* Integer overflow */          }
980          if (((unsigned int)c) > INT_MAX) /* Integer overflow */
981          {          {
982            while (IS_DIGIT(ptr[1]))
983              ptr++;
984          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
985          break;          break;
986          }          }
# Line 813  else Line 1006  else
1006      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1007      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
1008      significant 8 bits of octal numbers (I think this is what early Perls used      significant 8 bits of octal numbers (I think this is what early Perls used
1009      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1010      than 3 octal digits. */      but no more than 3 octal digits. */
1011    
1012      case CHAR_0:      case CHAR_0:
1013      c -= CHAR_0;      c -= CHAR_0;
1014      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1015          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
1016      if (!utf8 && c > 255) *errorcodeptr = ERR51;  #ifdef COMPILE_PCRE8
1017        if (!utf && c > 0xff) *errorcodeptr = ERR51;
1018    #endif
1019      break;      break;
1020    
1021      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
1022      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1023      treated as a data character. */      If not, { is treated as a data character. */
1024    
1025      case CHAR_x:      case CHAR_x:
1026        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1027          {
1028          /* In JavaScript, \x must be followed by two hexadecimal numbers.
1029          Otherwise it is a lowercase x letter. */
1030          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1031            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1032            {
1033            c = 0;
1034            for (i = 0; i < 2; ++i)
1035              {
1036              register int cc = *(++ptr);
1037    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1038              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1039              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1040    #else           /* EBCDIC coding */
1041              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1042              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1043    #endif
1044              }
1045            }
1046          break;
1047          }
1048    
1049      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1050        {        {
1051        const uschar *pt = ptr + 2;        const pcre_uchar *pt = ptr + 2;
       int count = 0;  
1052    
1053        c = 0;        c = 0;
1054        while ((digitab[*pt] & ctype_xdigit) != 0)        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1055          {          {
1056          register int cc = *pt++;          register int cc = *pt++;
1057          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
         count++;  
1058    
1059  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1060          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
# Line 847  else Line 1063  else
1063          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1064          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1065  #endif  #endif
1066    
1067    #ifdef COMPILE_PCRE8
1068            if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1069    #else
1070    #ifdef COMPILE_PCRE16
1071            if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1072    #endif
1073    #endif
1074            }
1075    
1076          if (c < 0)
1077            {
1078            while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1079            *errorcodeptr = ERR34;
1080          }          }
1081    
1082        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1083          {          {
1084          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1085          ptr = pt;          ptr = pt;
1086          break;          break;
1087          }          }
# Line 863  else Line 1093  else
1093      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1094    
1095      c = 0;      c = 0;
1096      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1097        {        {
1098        int cc;                                  /* Some compilers don't like */        int cc;                                  /* Some compilers don't like */
1099        cc = *(++ptr);                           /* ++ in initializers */        cc = *(++ptr);                           /* ++ in initializers */
# Line 921  else Line 1151  else
1151    }    }
1152    
1153  /* Perl supports \N{name} for character names, as well as plain \N for "not  /* Perl supports \N{name} for character names, as well as plain \N for "not
1154  newline". PCRE does not support \N{name}. However, it does support  newline". PCRE does not support \N{name}. However, it does support
1155  quantification such as \N{2,3}. */  quantification such as \N{2,3}. */
1156    
1157  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
# Line 961  Returns:         type value from ucp_typ Line 1191  Returns:         type value from ucp_typ
1191  */  */
1192    
1193  static int  static int
1194  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1195  {  {
1196  int c, i, bot, top;  int c, i, bot, top;
1197  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
1198  char name[32];  pcre_uchar name[32];
1199    
1200  c = *(++ptr);  c = *(++ptr);
1201  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
# Line 982  if (c == CHAR_LEFT_CURLY_BRACKET) Line 1212  if (c == CHAR_LEFT_CURLY_BRACKET)
1212      *negptr = TRUE;      *negptr = TRUE;
1213      ptr++;      ptr++;
1214      }      }
1215    for (i = 0; i < (int)sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1216      {      {
1217      c = *(++ptr);      c = *(++ptr);
1218      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 1006  else Line 1236  else
1236  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1237    
1238  bot = 0;  bot = 0;
1239  top = _pcre_utt_size;  top = PRIV(utt_size);
1240    
1241  while (bot < top)  while (bot < top)
1242    {    {
1243    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1244    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);    c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1245    if (c == 0)    if (c == 0)
1246      {      {
1247      *dptr = _pcre_utt[i].value;      *dptr = PRIV(utt)[i].value;
1248      return _pcre_utt[i].type;      return PRIV(utt)[i].type;
1249      }      }
1250    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
1251    }    }
# Line 1053  Returns:         pointer to '}' on succe Line 1283  Returns:         pointer to '}' on succe
1283                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1284  */  */
1285    
1286  static const uschar *  static const pcre_uchar *
1287  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1288  {  {
1289  int min = 0;  int min = 0;
1290  int max = -1;  int max = -1;
# Line 1062  int max = -1; Line 1292  int max = -1;
1292  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1293  an integer overflow. */  an integer overflow. */
1294    
1295  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1296  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1297    {    {
1298    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 1077  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1307  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1307    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1308      {      {
1309      max = 0;      max = 0;
1310      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1311      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1312        {        {
1313        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 1132  Arguments: Line 1362  Arguments:
1362    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1363    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1364    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1365    utf8         TRUE if we are in UTF-8 mode    utf          TRUE if we are in UTF-8 / UTF-16 mode
1366    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1367    
1368  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1369  */  */
1370    
1371  static int  static int
1372  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1373    BOOL xmode, BOOL utf8, int *count)    BOOL xmode, BOOL utf, int *count)
1374  {  {
1375  uschar *ptr = *ptrptr;  pcre_uchar *ptr = *ptrptr;
1376  int start_count = *count;  int start_count = *count;
1377  int hwm_count = start_count;  int hwm_count = start_count;
1378  BOOL dup_parens = FALSE;  BOOL dup_parens = FALSE;
# Line 1209  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1439  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1439          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)          ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1440        {        {
1441        int term;        int term;
1442        const uschar *thisname;        const pcre_uchar *thisname;
1443        *count += 1;        *count += 1;
1444        if (name == NULL && *count == lorn) return *count;        if (name == NULL && *count == lorn) return *count;
1445        term = *ptr++;        term = *ptr++;
# Line 1217  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1447  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1447        thisname = ptr;        thisname = ptr;
1448        while (*ptr != term) ptr++;        while (*ptr != term) ptr++;
1449        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == ptr - thisname &&
1450            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            STRNCMP_UC_UC(name, thisname, lorn) == 0)
1451          return *count;          return *count;
1452        term++;        term++;
1453        }        }
# Line 1260  for (; ptr < cd->end_pattern; ptr++) Line 1490  for (; ptr < cd->end_pattern; ptr++)
1490          {          {
1491          if (ptr[2] == CHAR_E)          if (ptr[2] == CHAR_E)
1492            ptr+= 2;            ptr+= 2;
1493          else if (strncmp((const char *)ptr+2,          else if (STRNCMP_UC_C8(ptr + 2,
1494                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1495            ptr += 4;            ptr += 4;
1496          else          else
# Line 1308  for (; ptr < cd->end_pattern; ptr++) Line 1538  for (; ptr < cd->end_pattern; ptr++)
1538        {        {
1539        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1540        ptr++;        ptr++;
1541  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1542        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;        if (utf) FORWARDCHAR(ptr);
1543  #endif  #endif
1544        }        }
1545      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
# Line 1320  for (; ptr < cd->end_pattern; ptr++) Line 1550  for (; ptr < cd->end_pattern; ptr++)
1550    
1551    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1552      {      {
1553      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1554      if (rc > 0) return rc;      if (rc > 0) return rc;
1555      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1556      }      }
# Line 1366  Arguments: Line 1596  Arguments:
1596    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1597    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1598    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1599    utf8         TRUE if we are in UTF-8 mode    utf          TRUE if we are in UTF-8 / UTF-16 mode
1600    
1601  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1602  */  */
1603    
1604  static int  static int
1605  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,  find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1606    BOOL utf8)    BOOL utf)
1607  {  {
1608  uschar *ptr = (uschar *)cd->start_pattern;  pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1609  int count = 0;  int count = 0;
1610  int rc;  int rc;
1611    
# Line 1386  matching closing parens. That is why we Line 1616  matching closing parens. That is why we
1616    
1617  for (;;)  for (;;)
1618    {    {
1619    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1620    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1621    }    }
1622    
# Line 1413  Arguments: Line 1643  Arguments:
1643  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1644  */  */
1645    
1646  static const uschar*  static const pcre_uchar*
1647  first_significant_code(const uschar *code, BOOL skipassert)  first_significant_code(const pcre_uchar *code, BOOL skipassert)
1648  {  {
1649  for (;;)  for (;;)
1650    {    {
# Line 1425  for (;;) Line 1655  for (;;)
1655      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1656      if (!skipassert) return code;      if (!skipassert) return code;
1657      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1658      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1659      break;      break;
1660    
1661      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 1439  for (;;) Line 1669  for (;;)
1669      case OP_RREF:      case OP_RREF:
1670      case OP_NRREF:      case OP_NRREF:
1671      case OP_DEF:      case OP_DEF:
1672      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1673      break;      break;
1674    
1675      default:      default:
# Line 1469  and doing the check at the end; a flag s Line 1699  and doing the check at the end; a flag s
1699    
1700  Arguments:  Arguments:
1701    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1702    utf8     TRUE in UTF-8 mode    utf      TRUE in UTF-8 / UTF-16 mode
1703    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1704    cd       the "compile data" structure    cd       the "compile data" structure
1705    
1706  Returns:   the fixed length,  Returns:   the fixed length,
1707               or -1 if there is no fixed length,               or -1 if there is no fixed length,
1708               or -2 if \C was encountered               or -2 if \C was encountered (in UTF-8 mode only)
1709               or -3 if an OP_RECURSE item was encountered and atend is FALSE               or -3 if an OP_RECURSE item was encountered and atend is FALSE
1710                 or -4 if an unknown opcode was encountered (internal error)
1711  */  */
1712    
1713  static int  static int
1714  find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1715  {  {
1716  int length = -1;  int length = -1;
1717    
1718  register int branchlength = 0;  register int branchlength = 0;
1719  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1720    
1721  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1722  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1493  branch, check the length against that of Line 1724  branch, check the length against that of
1724  for (;;)  for (;;)
1725    {    {
1726    int d;    int d;
1727    uschar *ce, *cs;    pcre_uchar *ce, *cs;
1728    register int op = *cc;    register int op = *cc;
1729    
1730    switch (op)    switch (op)
1731      {      {
1732      /* We only need to continue for OP_CBRA (normal capturing bracket) and      /* We only need to continue for OP_CBRA (normal capturing bracket) and
1733      OP_BRA (normal non-capturing bracket) because the other variants of these      OP_BRA (normal non-capturing bracket) because the other variants of these
1734      opcodes are all concerned with unlimited repeated groups, which of course      opcodes are all concerned with unlimited repeated groups, which of course
1735      are not of fixed length. They will cause a -1 response from the default      are not of fixed length. */
     case of this switch. */  
1736    
1737      case OP_CBRA:      case OP_CBRA:
1738      case OP_BRA:      case OP_BRA:
1739      case OP_ONCE:      case OP_ONCE:
1740        case OP_ONCE_NC:
1741      case OP_COND:      case OP_COND:
1742      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1743      if (d < 0) return d;      if (d < 0) return d;
1744      branchlength += d;      branchlength += d;
1745      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1746      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1747      break;      break;
1748    
1749      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1750      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1751      END it's the end of the outer call. All can be handled by the same code.      an ALT. If it is END it's the end of the outer call. All can be handled by
1752      Note that we must not include the OP_KETRxxx opcodes here, because they      the same code. Note that we must not include the OP_KETRxxx opcodes here,
1753      all imply an unlimited repeat. */      because they all imply an unlimited repeat. */
1754    
1755      case OP_ALT:      case OP_ALT:
1756      case OP_KET:      case OP_KET:
1757      case OP_END:      case OP_END:
1758        case OP_ACCEPT:
1759        case OP_ASSERT_ACCEPT:
1760      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1761        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1762      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1536  for (;;) Line 1770  for (;;)
1770    
1771      case OP_RECURSE:      case OP_RECURSE:
1772      if (!atend) return -3;      if (!atend) return -3;
1773      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1774      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1775      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                    /* Recursion */
1776      d = find_fixedlength(cs + 2, utf8, atend, cd);      d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1777      if (d < 0) return d;      if (d < 0) return d;
1778      branchlength += d;      branchlength += d;
1779      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1552  for (;;) Line 1786  for (;;)
1786      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1787      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1788      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1789      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1790        break;
1791    
1792      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1793    
1794      case OP_REVERSE:      case OP_MARK:
1795      case OP_CREF:      case OP_PRUNE_ARG:
1796      case OP_NCREF:      case OP_SKIP_ARG:
1797      case OP_RREF:      case OP_THEN_ARG:
1798      case OP_NRREF:      cc += cc[1] + PRIV(OP_lengths)[*cc];
1799      case OP_DEF:      break;
1800    
1801      case OP_CALLOUT:      case OP_CALLOUT:
     case OP_SOD:  
     case OP_SOM:  
     case OP_SET_SOM:  
     case OP_EOD:  
     case OP_EODN:  
1802      case OP_CIRC:      case OP_CIRC:
1803      case OP_CIRCM:      case OP_CIRCM:
1804        case OP_CLOSE:
1805        case OP_COMMIT:
1806        case OP_CREF:
1807        case OP_DEF:
1808      case OP_DOLL:      case OP_DOLL:
1809      case OP_DOLLM:      case OP_DOLLM:
1810        case OP_EOD:
1811        case OP_EODN:
1812        case OP_FAIL:
1813        case OP_NCREF:
1814        case OP_NRREF:
1815      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1816        case OP_PRUNE:
1817        case OP_REVERSE:
1818        case OP_RREF:
1819        case OP_SET_SOM:
1820        case OP_SKIP:
1821        case OP_SOD:
1822        case OP_SOM:
1823        case OP_THEN:
1824      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1825      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1826      break;      break;
1827    
1828      /* Handle literal characters */      /* Handle literal characters */
# Line 1585  for (;;) Line 1833  for (;;)
1833      case OP_NOTI:      case OP_NOTI:
1834      branchlength++;      branchlength++;
1835      cc += 2;      cc += 2;
1836  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1837      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1838  #endif  #endif
1839      break;      break;
1840    
# Line 1594  for (;;) Line 1842  for (;;)
1842      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1843    
1844      case OP_EXACT:      case OP_EXACT:
1845        case OP_EXACTI:
1846        case OP_NOTEXACT:
1847        case OP_NOTEXACTI:
1848      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1849      cc += 4;      cc += 2 + IMM2_SIZE;
1850  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1851      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1852  #endif  #endif
1853      break;      break;
1854    
1855      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1856      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1857      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1858      cc += 4;      cc += 1 + IMM2_SIZE + 1;
1859      break;      break;
1860    
1861      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1614  for (;;) Line 1865  for (;;)
1865      cc += 2;      cc += 2;
1866      /* Fall through */      /* Fall through */
1867    
1868        case OP_HSPACE:
1869        case OP_VSPACE:
1870        case OP_NOT_HSPACE:
1871        case OP_NOT_VSPACE:
1872      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1873      case OP_DIGIT:      case OP_DIGIT:
1874      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1626  for (;;) Line 1881  for (;;)
1881      cc++;      cc++;
1882      break;      break;
1883    
1884      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1885        otherwise \C is coded as OP_ALLANY. */
1886    
1887      case OP_ANYBYTE:      case OP_ANYBYTE:
1888      return -2;      return -2;
1889    
1890      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1891    
1892  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1893      case OP_XCLASS:      case OP_XCLASS:
1894      cc += GET(cc, 1) - 33;      cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1895      /* Fall through */      /* Fall through */
1896  #endif  #endif
1897    
1898      case OP_CLASS:      case OP_CLASS:
1899      case OP_NCLASS:      case OP_NCLASS:
1900      cc += 33;      cc += PRIV(OP_lengths)[OP_CLASS];
1901    
1902      switch (*cc)      switch (*cc)
1903        {        {
1904          case OP_CRPLUS:
1905          case OP_CRMINPLUS:
1906        case OP_CRSTAR:        case OP_CRSTAR:
1907        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1908        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1653  for (;;) Line 1911  for (;;)
1911    
1912        case OP_CRRANGE:        case OP_CRRANGE:
1913        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1914        if (GET2(cc,1) != GET2(cc,3)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1915        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
1916        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
1917        break;        break;
1918    
1919        default:        default:
# Line 1665  for (;;) Line 1923  for (;;)
1923    
1924      /* Anything else is variable length */      /* Anything else is variable length */
1925    
1926      default:      case OP_ANYNL:
1927        case OP_BRAMINZERO:
1928        case OP_BRAPOS:
1929        case OP_BRAPOSZERO:
1930        case OP_BRAZERO:
1931        case OP_CBRAPOS:
1932        case OP_EXTUNI:
1933        case OP_KETRMAX:
1934        case OP_KETRMIN:
1935        case OP_KETRPOS:
1936        case OP_MINPLUS:
1937        case OP_MINPLUSI:
1938        case OP_MINQUERY:
1939        case OP_MINQUERYI:
1940        case OP_MINSTAR:
1941        case OP_MINSTARI:
1942        case OP_MINUPTO:
1943        case OP_MINUPTOI:
1944        case OP_NOTMINPLUS:
1945        case OP_NOTMINPLUSI:
1946        case OP_NOTMINQUERY:
1947        case OP_NOTMINQUERYI:
1948        case OP_NOTMINSTAR:
1949        case OP_NOTMINSTARI:
1950        case OP_NOTMINUPTO:
1951        case OP_NOTMINUPTOI:
1952        case OP_NOTPLUS:
1953        case OP_NOTPLUSI:
1954        case OP_NOTPOSPLUS:
1955        case OP_NOTPOSPLUSI:
1956        case OP_NOTPOSQUERY:
1957        case OP_NOTPOSQUERYI:
1958        case OP_NOTPOSSTAR:
1959        case OP_NOTPOSSTARI:
1960        case OP_NOTPOSUPTO:
1961        case OP_NOTPOSUPTOI:
1962        case OP_NOTQUERY:
1963        case OP_NOTQUERYI:
1964        case OP_NOTSTAR:
1965        case OP_NOTSTARI:
1966        case OP_NOTUPTO:
1967        case OP_NOTUPTOI:
1968        case OP_PLUS:
1969        case OP_PLUSI:
1970        case OP_POSPLUS:
1971        case OP_POSPLUSI:
1972        case OP_POSQUERY:
1973        case OP_POSQUERYI:
1974        case OP_POSSTAR:
1975        case OP_POSSTARI:
1976        case OP_POSUPTO:
1977        case OP_POSUPTOI:
1978        case OP_QUERY:
1979        case OP_QUERYI:
1980        case OP_REF:
1981        case OP_REFI:
1982        case OP_SBRA:
1983        case OP_SBRAPOS:
1984        case OP_SCBRA:
1985        case OP_SCBRAPOS:
1986        case OP_SCOND:
1987        case OP_SKIPZERO:
1988        case OP_STAR:
1989        case OP_STARI:
1990        case OP_TYPEMINPLUS:
1991        case OP_TYPEMINQUERY:
1992        case OP_TYPEMINSTAR:
1993        case OP_TYPEMINUPTO:
1994        case OP_TYPEPLUS:
1995        case OP_TYPEPOSPLUS:
1996        case OP_TYPEPOSQUERY:
1997        case OP_TYPEPOSSTAR:
1998        case OP_TYPEPOSUPTO:
1999        case OP_TYPEQUERY:
2000        case OP_TYPESTAR:
2001        case OP_TYPEUPTO:
2002        case OP_UPTO:
2003        case OP_UPTOI:
2004      return -1;      return -1;
2005    
2006        /* Catch unrecognized opcodes so that when new ones are added they
2007        are not forgotten, as has happened in the past. */
2008    
2009        default:
2010        return -4;
2011      }      }
2012    }    }
2013  /* Control never gets here */  /* Control never gets here */
# Line 1687  length. Line 2028  length.
2028    
2029  Arguments:  Arguments:
2030    code        points to start of expression    code        points to start of expression
2031    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2032    number      the required bracket number or negative to find a lookbehind    number      the required bracket number or negative to find a lookbehind
2033    
2034  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2035  */  */
2036    
2037  const uschar *  const pcre_uchar *
2038  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2039  {  {
2040  for (;;)  for (;;)
2041    {    {
# Line 1712  for (;;) Line 2053  for (;;)
2053    
2054    else if (c == OP_REVERSE)    else if (c == OP_REVERSE)
2055      {      {
2056      if (number < 0) return (uschar *)code;      if (number < 0) return (pcre_uchar *)code;
2057      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2058      }      }
2059    
2060    /* Handle capturing bracket */    /* Handle capturing bracket */
# Line 1722  for (;;) Line 2063  for (;;)
2063             c == OP_CBRAPOS || c == OP_SCBRAPOS)             c == OP_CBRAPOS || c == OP_SCBRAPOS)
2064      {      {
2065      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
2066      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2067      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2068      }      }
2069    
2070    /* Otherwise, we can get the item's length from the table, except that for    /* Otherwise, we can get the item's length from the table, except that for
# Line 1751  for (;;) Line 2092  for (;;)
2092        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2093        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2094        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
2095        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2096            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2097        break;        break;
2098    
2099        case OP_MARK:        case OP_MARK:
# Line 1761  for (;;) Line 2103  for (;;)
2103        break;        break;
2104    
2105        case OP_THEN_ARG:        case OP_THEN_ARG:
2106        code += code[1+LINK_SIZE];        code += code[1];
2107        break;        break;
2108        }        }
2109    
2110      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2111    
2112      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2113    
2114    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2115    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2116    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2117    
2118  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2119      if (utf8) switch(c)      if (utf) switch(c)
2120        {        {
2121        case OP_CHAR:        case OP_CHAR:
2122        case OP_CHARI:        case OP_CHARI:
# Line 1804  for (;;) Line 2146  for (;;)
2146        case OP_MINQUERYI:        case OP_MINQUERYI:
2147        case OP_POSQUERY:        case OP_POSQUERY:
2148        case OP_POSQUERYI:        case OP_POSQUERYI:
2149        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2150        break;        break;
2151        }        }
2152  #else  #else
2153      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2154  #endif  #endif
2155      }      }
2156    }    }
# Line 1825  instance of OP_RECURSE. Line 2167  instance of OP_RECURSE.
2167    
2168  Arguments:  Arguments:
2169    code        points to start of expression    code        points to start of expression
2170    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 mode
2171    
2172  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2173  */  */
2174    
2175  static const uschar *  static const pcre_uchar *
2176  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2177  {  {
2178  for (;;)  for (;;)
2179    {    {
# Line 1870  for (;;) Line 2212  for (;;)
2212        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2213        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2214        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2215        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        if (code[1 + IMM2_SIZE] == OP_PROP
2216            || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2217        break;        break;
2218    
2219        case OP_MARK:        case OP_MARK:
# Line 1880  for (;;) Line 2223  for (;;)
2223        break;        break;
2224    
2225        case OP_THEN_ARG:        case OP_THEN_ARG:
2226        code += code[1+LINK_SIZE];        code += code[1];
2227        break;        break;
2228        }        }
2229    
2230      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
2231    
2232      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2233    
2234      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
2235      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
2236      to arrange to skip the extra bytes. */      to arrange to skip the extra bytes. */
2237    
2238  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2239      if (utf8) switch(c)      if (utf) switch(c)
2240        {        {
2241        case OP_CHAR:        case OP_CHAR:
2242        case OP_CHARI:        case OP_CHARI:
2243          case OP_NOT:
2244          case OP_NOTI:
2245        case OP_EXACT:        case OP_EXACT:
2246        case OP_EXACTI:        case OP_EXACTI:
2247          case OP_NOTEXACT:
2248          case OP_NOTEXACTI:
2249        case OP_UPTO:        case OP_UPTO:
2250        case OP_UPTOI:        case OP_UPTOI:
2251          case OP_NOTUPTO:
2252          case OP_NOTUPTOI:
2253        case OP_MINUPTO:        case OP_MINUPTO:
2254        case OP_MINUPTOI:        case OP_MINUPTOI:
2255          case OP_NOTMINUPTO:
2256          case OP_NOTMINUPTOI:
2257        case OP_POSUPTO:        case OP_POSUPTO:
2258        case OP_POSUPTOI:        case OP_POSUPTOI:
2259          case OP_NOTPOSUPTO:
2260          case OP_NOTPOSUPTOI:
2261        case OP_STAR:        case OP_STAR:
2262        case OP_STARI:        case OP_STARI:
2263          case OP_NOTSTAR:
2264          case OP_NOTSTARI:
2265        case OP_MINSTAR:        case OP_MINSTAR:
2266        case OP_MINSTARI:        case OP_MINSTARI:
2267          case OP_NOTMINSTAR:
2268          case OP_NOTMINSTARI:
2269        case OP_POSSTAR:        case OP_POSSTAR:
2270        case OP_POSSTARI:        case OP_POSSTARI:
2271          case OP_NOTPOSSTAR:
2272          case OP_NOTPOSSTARI:
2273        case OP_PLUS:        case OP_PLUS:
2274        case OP_PLUSI:        case OP_PLUSI:
2275          case OP_NOTPLUS:
2276          case OP_NOTPLUSI:
2277        case OP_MINPLUS:        case OP_MINPLUS:
2278        case OP_MINPLUSI:        case OP_MINPLUSI:
2279          case OP_NOTMINPLUS:
2280          case OP_NOTMINPLUSI:
2281        case OP_POSPLUS:        case OP_POSPLUS:
2282        case OP_POSPLUSI:        case OP_POSPLUSI:
2283          case OP_NOTPOSPLUS:
2284          case OP_NOTPOSPLUSI:
2285        case OP_QUERY:        case OP_QUERY:
2286        case OP_QUERYI:        case OP_QUERYI:
2287          case OP_NOTQUERY:
2288          case OP_NOTQUERYI:
2289        case OP_MINQUERY:        case OP_MINQUERY:
2290        case OP_MINQUERYI:        case OP_MINQUERYI:
2291          case OP_NOTMINQUERY:
2292          case OP_NOTMINQUERYI:
2293        case OP_POSQUERY:        case OP_POSQUERY:
2294        case OP_POSQUERYI:        case OP_POSQUERYI:
2295        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_NOTPOSQUERY:
2296          case OP_NOTPOSQUERYI:
2297          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2298        break;        break;
2299        }        }
2300  #else  #else
2301      (void)(utf8);  /* Keep compiler happy by referencing function argument */      (void)(utf);  /* Keep compiler happy by referencing function argument */
2302  #endif  #endif
2303      }      }
2304    }    }
# Line 1950  bracket whose current branch will alread Line 2321  bracket whose current branch will alread
2321  Arguments:  Arguments:
2322    code        points to start of search    code        points to start of search
2323    endcode     points to where to stop    endcode     points to where to stop
2324    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2325    cd          contains pointers to tables etc.    cd          contains pointers to tables etc.
2326    
2327  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2328  */  */
2329    
2330  static BOOL  static BOOL
2331  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2332    compile_data *cd)    BOOL utf, compile_data *cd)
2333  {  {
2334  register int c;  register int c;
2335  for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2336       code < endcode;       code < endcode;
2337       code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2338    {    {
2339    const uschar *ccode;    const pcre_uchar *ccode;
2340    
2341    c = *code;    c = *code;
2342    
# Line 1982  for (code = first_significant_code(code Line 2353  for (code = first_significant_code(code
2353    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
2354    implies a backward reference subroutine call, we can scan it. If it's a    implies a backward reference subroutine call, we can scan it. If it's a
2355    forward reference subroutine call, we can't. To detect forward reference    forward reference subroutine call, we can't. To detect forward reference
2356    we have to scan up the list that is kept in the workspace. This function is    we have to scan up the list that is kept in the workspace. This function is
2357    called only when doing the real compile, not during the pre-compile that    called only when doing the real compile, not during the pre-compile that
2358    measures the size of the compiled pattern. */    measures the size of the compiled pattern. */
2359    
2360    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2361      {      {
2362      const uschar *scode;      const pcre_uchar *scode;
2363      BOOL empty_branch;      BOOL empty_branch;
2364    
2365      /* Test for forward reference */      /* Test for forward reference */
2366    
2367      for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)      for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2368        if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;        if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2369    
2370      /* Not a forward reference, test for completed backward reference */      /* Not a forward reference, test for completed backward reference */
2371    
2372      empty_branch = FALSE;      empty_branch = FALSE;
2373      scode = cd->start_code + GET(code, 1);      scode = cd->start_code + GET(code, 1);
2374      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2375    
2376      /* Completed backwards reference */      /* Completed backwards reference */
2377    
2378      do      do
2379        {        {
2380        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf, cd))
2381          {          {
2382          empty_branch = TRUE;          empty_branch = TRUE;
2383          break;          break;
# Line 2014  for (code = first_significant_code(code Line 2385  for (code = first_significant_code(code
2385        scode += GET(scode, 1);        scode += GET(scode, 1);
2386        }        }
2387      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2388    
2389      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2390      continue;      continue;
2391      }      }
# Line 2024  for (code = first_significant_code(code Line 2395  for (code = first_significant_code(code
2395    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2396        c == OP_BRAPOSZERO)        c == OP_BRAPOSZERO)
2397      {      {
2398      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2399      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
2400      c = *code;      c = *code;
2401      continue;      continue;
# Line 2045  for (code = first_significant_code(code Line 2416  for (code = first_significant_code(code
2416    
2417    if (c == OP_BRA  || c == OP_BRAPOS ||    if (c == OP_BRA  || c == OP_BRAPOS ||
2418        c == OP_CBRA || c == OP_CBRAPOS ||        c == OP_CBRA || c == OP_CBRAPOS ||
2419        c == OP_ONCE || c == OP_COND)        c == OP_ONCE || c == OP_ONCE_NC ||
2420          c == OP_COND)
2421      {      {
2422      BOOL empty_branch;      BOOL empty_branch;
2423      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 2061  for (code = first_significant_code(code Line 2433  for (code = first_significant_code(code
2433        empty_branch = FALSE;        empty_branch = FALSE;
2434        do        do
2435          {          {
2436          if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2437            empty_branch = TRUE;            empty_branch = TRUE;
2438          code += GET(code, 1);          code += GET(code, 1);
2439          }          }
# Line 2079  for (code = first_significant_code(code Line 2451  for (code = first_significant_code(code
2451      {      {
2452      /* Check for quantifiers after a class. XCLASS is used for classes that      /* Check for quantifiers after a class. XCLASS is used for classes that
2453      cannot be represented just by a bit map. This includes negated single      cannot be represented just by a bit map. This includes negated single
2454      high-valued characters. The length in _pcre_OP_lengths[] is zero; the      high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2455      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2456      here. */      here. */
2457    
2458  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2459      case OP_XCLASS:      case OP_XCLASS:
2460      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2461      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 2091  for (code = first_significant_code(code Line 2463  for (code = first_significant_code(code
2463    
2464      case OP_CLASS:      case OP_CLASS:
2465      case OP_NCLASS:      case OP_NCLASS:
2466      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2467    
2468  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2469      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2470  #endif  #endif
2471    
# Line 2166  for (code = first_significant_code(code Line 2538  for (code = first_significant_code(code
2538      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2539      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2540      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2541      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;      if (code[1 + IMM2_SIZE] == OP_PROP
2542          || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2543      break;      break;
2544    
2545      /* End of branch */      /* End of branch */
# Line 2181  for (code = first_significant_code(code Line 2554  for (code = first_significant_code(code
2554      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2555      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
2556    
2557  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2558      case OP_STAR:      case OP_STAR:
2559      case OP_STARI:      case OP_STARI:
2560      case OP_MINSTAR:      case OP_MINSTAR:
# Line 2194  for (code = first_significant_code(code Line 2567  for (code = first_significant_code(code
2567      case OP_MINQUERYI:      case OP_MINQUERYI:
2568      case OP_POSQUERY:      case OP_POSQUERY:
2569      case OP_POSQUERYI:      case OP_POSQUERYI:
2570      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2571      break;      break;
2572    
2573      case OP_UPTO:      case OP_UPTO:
# Line 2203  for (code = first_significant_code(code Line 2576  for (code = first_significant_code(code
2576      case OP_MINUPTOI:      case OP_MINUPTOI:
2577      case OP_POSUPTO:      case OP_POSUPTO:
2578      case OP_POSUPTOI:      case OP_POSUPTOI:
2579      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2580      break;      break;
2581  #endif  #endif
2582    
# Line 2217  for (code = first_significant_code(code Line 2590  for (code = first_significant_code(code
2590      break;      break;
2591    
2592      case OP_THEN_ARG:      case OP_THEN_ARG:
2593      code += code[1+LINK_SIZE];      code += code[1];
2594      break;      break;
2595    
2596      /* None of the remaining opcodes are required to match a character. */      /* None of the remaining opcodes are required to match a character. */
# Line 2240  return TRUE; Line 2613  return TRUE;
2613  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2614  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2615  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2616  This function is called only during the real compile, not during the  This function is called only during the real compile, not during the
2617  pre-compile.  pre-compile.
2618    
2619  Arguments:  Arguments:
2620    code        points to start of the recursion    code        points to start of the recursion
2621    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2622    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2623    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 mode
2624    cd          pointers to tables etc    cd          pointers to tables etc
2625    
2626  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2627  */  */
2628    
2629  static BOOL  static BOOL
2630  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2631    BOOL utf8, compile_data *cd)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2632  {  {
2633  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2634    {    {
2635    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2636      return FALSE;      return FALSE;
2637    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2638    }    }
# Line 2295  I think. Line 2668  I think.
2668  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2669  It seems that the appearance of a nested POSIX class supersedes an apparent  It seems that the appearance of a nested POSIX class supersedes an apparent
2670  external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or  external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2671  a digit. Also, unescaped square brackets may also appear as part of class  a digit.
2672  names. For example, [:a[:abc]b:] gives unknown class "[:abc]b:]"in Perl.  
2673    In Perl, unescaped square brackets may also appear as part of class names. For
2674    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2675    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2676    seem right at all. PCRE does not allow closing square brackets in POSIX class
2677    names.
2678    
2679  Arguments:  Arguments:
2680    ptr      pointer to the initial [    ptr      pointer to the initial [
# Line 2306  Returns:   TRUE or FALSE Line 2684  Returns:   TRUE or FALSE
2684  */  */
2685    
2686  static BOOL  static BOOL
2687  check_posix_syntax(const uschar *ptr, const uschar **endptr)  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2688  {  {
2689  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2690  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2691  for (++ptr; *ptr != 0; ptr++)  for (++ptr; *ptr != 0; ptr++)
2692    {    {
2693    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2694      ptr++;      ptr++;
2695      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2696    else    else
2697      {      {
2698      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
# Line 2325  for (++ptr; *ptr != 0; ptr++) Line 2704  for (++ptr; *ptr != 0; ptr++)
2704           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2705            ptr[1] == CHAR_EQUALS_SIGN) &&            ptr[1] == CHAR_EQUALS_SIGN) &&
2706          check_posix_syntax(ptr, endptr))          check_posix_syntax(ptr, endptr))
2707        return FALSE;        return FALSE;
2708      }      }
2709    }    }
2710  return FALSE;  return FALSE;
# Line 2349  Returns:     a value representing the na Line 2728  Returns:     a value representing the na
2728  */  */
2729    
2730  static int  static int
2731  check_posix_name(const uschar *ptr, int len)  check_posix_name(const pcre_uchar *ptr, int len)
2732  {  {
2733  const char *pn = posix_names;  const char *pn = posix_names;
2734  register int yield = 0;  register int yield = 0;
2735  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2736    {    {
2737    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2738      strncmp((const char *)ptr, pn, len) == 0) return yield;      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2739    pn += posix_name_lengths[yield] + 1;    pn += posix_name_lengths[yield] + 1;
2740    yield++;    yield++;
2741    }    }
# Line 2388  value in the reference (which is a group Line 2767  value in the reference (which is a group
2767  Arguments:  Arguments:
2768    group      points to the start of the group    group      points to the start of the group
2769    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
2770    utf8       TRUE in UTF-8 mode    utf        TRUE in UTF-8 / UTF-16 mode
2771    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
2772    save_hwm   the hwm forward reference pointer at the start of the group    save_hwm   the hwm forward reference pointer at the start of the group
2773    
# Line 2396  Returns:     nothing Line 2775  Returns:     nothing
2775  */  */
2776    
2777  static void  static void
2778  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2779    uschar *save_hwm)    pcre_uchar *save_hwm)
2780  {  {
2781  uschar *ptr = group;  pcre_uchar *ptr = group;
2782    
2783  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2784    {    {
2785    int offset;    int offset;
2786    uschar *hc;    pcre_uchar *hc;
2787    
2788    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
2789    reference. */    reference. */
# Line 2449  Arguments: Line 2828  Arguments:
2828  Returns:         new code pointer  Returns:         new code pointer
2829  */  */
2830    
2831  static uschar *  static pcre_uchar *
2832  auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2833  {  {
2834  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2835  *code++ = 255;  *code++ = 255;
2836  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2837  PUT(code, LINK_SIZE, 0);                       /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2838  return code + 2*LINK_SIZE;  return code + 2 * LINK_SIZE;
2839  }  }
2840    
2841    
# Line 2478  Returns:             nothing Line 2857  Returns:             nothing
2857  */  */
2858    
2859  static void  static void
2860  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2861  {  {
2862  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2863  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
# Line 2561  switch(ptype) Line 2940  switch(ptype)
2940            prop->chartype == ucp_Lt) == negated;            prop->chartype == ucp_Lt) == negated;
2941    
2942    case PT_GC:    case PT_GC:
2943    return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2944    
2945    case PT_PC:    case PT_PC:
2946    return (pdata == prop->chartype) == negated;    return (pdata == prop->chartype) == negated;
# Line 2572  switch(ptype) Line 2951  switch(ptype)
2951    /* These are specials */    /* These are specials */
2952    
2953    case PT_ALNUM:    case PT_ALNUM:
2954    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2955            _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2956    
2957    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
2958    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2959            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2960            == negated;            == negated;
2961    
2962    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2963    return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2964            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2965            c == CHAR_FF || c == CHAR_CR)            c == CHAR_FF || c == CHAR_CR)
2966            == negated;            == negated;
2967    
2968    case PT_WORD:    case PT_WORD:
2969    return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2970            _pcre_ucp_gentype[prop->chartype] == ucp_N ||            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2971            c == CHAR_UNDERSCORE) == negated;            c == CHAR_UNDERSCORE) == negated;
2972    }    }
2973  return FALSE;  return FALSE;
# Line 2607  sense to automatically possessify the re Line 2986  sense to automatically possessify the re
2986    
2987  Arguments:  Arguments:
2988    previous      pointer to the repeated opcode    previous      pointer to the repeated opcode
2989    utf8          TRUE in UTF-8 mode    utf           TRUE in UTF-8 / UTF-16 mode
2990    ptr           next character in pattern    ptr           next character in pattern
2991    options       options bits    options       options bits
2992    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 2616  Returns:        TRUE if possessifying is Line 2995  Returns:        TRUE if possessifying is
2995  */  */
2996    
2997  static BOOL  static BOOL
2998  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2999    int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
3000  {  {
3001  int c, next;  pcre_int32 c, next;
3002  int op_code = *previous++;  int op_code = *previous++;
3003    
3004  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2628  if ((options & PCRE_EXTENDED) != 0) Line 3007  if ((options & PCRE_EXTENDED) != 0)
3007    {    {
3008    for (;;)    for (;;)
3009      {      {
3010      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3011      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3012        {        {
3013        ptr++;        ptr++;
# Line 2636  if ((options & PCRE_EXTENDED) != 0) Line 3015  if ((options & PCRE_EXTENDED) != 0)
3015          {          {
3016          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3017          ptr++;          ptr++;
3018  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3019          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3020  #endif  #endif
3021          }          }
3022        }        }
# Line 2655  if (*ptr == CHAR_BACKSLASH) Line 3034  if (*ptr == CHAR_BACKSLASH)
3034    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
3035    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
3036    }    }
3037    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
3038    {    {
3039  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3040    if (utf8) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
3041  #endif  #endif
3042    next = *ptr++;    next = *ptr++;
3043    }    }
   
3044  else return FALSE;  else return FALSE;
3045    
3046  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2672  if ((options & PCRE_EXTENDED) != 0) Line 3049  if ((options & PCRE_EXTENDED) != 0)
3049    {    {
3050    for (;;)    for (;;)
3051      {      {
3052      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3053      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3054        {        {
3055        ptr++;        ptr++;
# Line 2680  if ((options & PCRE_EXTENDED) != 0) Line 3057  if ((options & PCRE_EXTENDED) != 0)
3057          {          {
3058          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3059          ptr++;          ptr++;
3060  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3061          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3062  #endif  #endif
3063          }          }
3064        }        }
# Line 2692  if ((options & PCRE_EXTENDED) != 0) Line 3069  if ((options & PCRE_EXTENDED) != 0)
3069  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
3070    
3071  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3072    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3073      return FALSE;      return FALSE;
3074    
3075  /* Now compare the next item with the previous opcode. First, handle cases when  /* Now compare the next item with the previous opcode. First, handle cases when
# Line 2701  the next item is a character. */ Line 3078  the next item is a character. */
3078  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3079    {    {
3080    case OP_CHAR:    case OP_CHAR:
3081  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3082    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3083  #else  #else
3084    c = *previous;    c = *previous;
# Line 2713  if (next >= 0) switch(op_code) Line 3090  if (next >= 0) switch(op_code)
3090    high-valued characters. */    high-valued characters. */
3091    
3092    case OP_CHARI:    case OP_CHARI:
3093  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3094    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3095  #else  #else
3096    c = *previous;    c = *previous;
3097  #endif  #endif
3098    if (c == next) return FALSE;    if (c == next) return FALSE;
3099  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3100    if (utf8)    if (utf)
3101      {      {
3102      unsigned int othercase;      unsigned int othercase;
3103      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
# Line 2732  if (next >= 0) switch(op_code) Line 3109  if (next >= 0) switch(op_code)
3109      return (unsigned int)c != othercase;      return (unsigned int)c != othercase;
3110      }      }
3111    else    else
3112  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3113    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
   
   /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These  
   opcodes are not used for multi-byte characters, because they are coded using  
   an XCLASS instead. */  
3114    
3115    case OP_NOT:    case OP_NOT:
3116    return (c = *previous) == next;  #ifdef SUPPORT_UTF
3117      GETCHARTEST(c, previous);
3118    #else
3119      c = *previous;
3120    #endif
3121      return c == next;
3122    
3123    case OP_NOTI:    case OP_NOTI:
3124    if ((c = *previous) == next) return TRUE;  #ifdef SUPPORT_UTF
3125  #ifdef SUPPORT_UTF8    GETCHARTEST(c, previous);
3126    if (utf8)  #else
3127      c = *previous;
3128    #endif
3129      if (c == next) return TRUE;
3130    #ifdef SUPPORT_UTF
3131      if (utf)
3132      {      {
3133      unsigned int othercase;      unsigned int othercase;
3134      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
3135  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3136      othercase = UCD_OTHERCASE(next);      othercase = UCD_OTHERCASE((unsigned int)next);
3137  #else  #else
3138      othercase = NOTACHAR;      othercase = NOTACHAR;
3139  #endif  #endif
3140      return (unsigned int)c == othercase;      return (unsigned int)c == othercase;
3141      }      }
3142    else    else
3143  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3144    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3145    
3146    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3147    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3148    
3149    case OP_DIGIT:    case OP_DIGIT:
3150    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3151    
3152    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3153    return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3154    
3155    case OP_WHITESPACE:    case OP_WHITESPACE:
3156    return next > 127 || (cd->ctypes[next] & ctype_space) == 0;    return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3157    
3158    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3159    return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3160    
3161    case OP_WORDCHAR:    case OP_WORDCHAR:
3162    return next > 127 || (cd->ctypes[next] & ctype_word) == 0;    return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3163    
3164    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3165    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;    return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3166    
3167    case OP_HSPACE:    case OP_HSPACE:
3168    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
3169    switch(next)    switch(next)
3170      {      {
3171      case 0x09:      HSPACE_CASES:
     case 0x20:  
     case 0xa0:  
     case 0x1680:  
     case 0x180e:  
     case 0x2000:  
     case 0x2001:  
     case 0x2002:  
     case 0x2003:  
     case 0x2004:  
     case 0x2005:  
     case 0x2006:  
     case 0x2007:  
     case 0x2008:  
     case 0x2009:  
     case 0x200A:  
     case 0x202f:  
     case 0x205f:  
     case 0x3000:  
3172      return op_code == OP_NOT_HSPACE;      return op_code == OP_NOT_HSPACE;
3173    
3174      default:      default:
3175      return op_code != OP_NOT_HSPACE;      return op_code != OP_NOT_HSPACE;
3176      }      }
# Line 2814  if (next >= 0) switch(op_code) Line 3180  if (next >= 0) switch(op_code)
3180    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3181    switch(next)    switch(next)
3182      {      {
3183      case 0x0a:      VSPACE_CASES:
     case 0x0b:  
     case 0x0c:  
     case 0x0d:  
     case 0x85:  
     case 0x2028:  
     case 0x2029:  
3184      return op_code == OP_NOT_VSPACE;      return op_code == OP_NOT_VSPACE;
3185    
3186      default:      default:
3187      return op_code != OP_NOT_VSPACE;      return op_code != OP_NOT_VSPACE;
3188      }      }
# Line 2849  switch(op_code) Line 3210  switch(op_code)
3210    {    {
3211    case OP_CHAR:    case OP_CHAR:
3212    case OP_CHARI:    case OP_CHARI:
3213  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3214    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3215  #else  #else
3216    c = *previous;    c = *previous;
# Line 2857  switch(op_code) Line 3218  switch(op_code)
3218    switch(-next)    switch(-next)
3219      {      {
3220      case ESC_d:      case ESC_d:
3221      return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;      return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3222    
3223      case ESC_D:      case ESC_D:
3224      return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3225    
3226      case ESC_s:      case ESC_s:
3227      return c > 127 || (cd->ctypes[c] & ctype_space) == 0;      return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3228    
3229      case ESC_S:      case ESC_S:
3230      return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3231    
3232      case ESC_w:      case ESC_w:
3233      return c > 127 || (cd->ctypes[c] & ctype_word) == 0;      return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3234    
3235      case ESC_W:      case ESC_W:
3236      return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;      return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3237    
3238      case ESC_h:      case ESC_h:
3239      case ESC_H:      case ESC_H:
3240      switch(c)      switch(c)
3241        {        {
3242        case 0x09:        HSPACE_CASES:
       case 0x20:  
       case 0xa0:  
       case 0x1680:  
       case 0x180e:  
       case 0x2000:  
       case 0x2001:  
       case 0x2002:  
       case 0x2003:  
       case 0x2004:  
       case 0x2005:  
       case 0x2006:  
       case 0x2007:  
       case 0x2008:  
       case 0x2009:  
       case 0x200A:  
       case 0x202f:  
       case 0x205f:  
       case 0x3000:  
3243        return -next != ESC_h;        return -next != ESC_h;
3244    
3245        default:        default:
3246        return -next == ESC_h;        return -next == ESC_h;
3247        }        }
# Line 2906  switch(op_code) Line 3250  switch(op_code)
3250      case ESC_V:      case ESC_V:
3251      switch(c)      switch(c)
3252        {        {
3253        case 0x0a:        VSPACE_CASES:
       case 0x0b:  
       case 0x0c:  
       case 0x0d:  
       case 0x85:  
       case 0x2028:  
       case 0x2029:  
3254        return -next != ESC_v;        return -next != ESC_v;
3255    
3256        default:        default:
3257        return -next == ESC_v;        return -next == ESC_v;
3258        }        }
# Line 2954  switch(op_code) Line 3293  switch(op_code)
3293        to the original \d etc. At this point, ptr will point to a zero byte. */        to the original \d etc. At this point, ptr will point to a zero byte. */
3294    
3295        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3296          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3297            return FALSE;            return FALSE;
3298    
3299        /* Do the property check. */        /* Do the property check. */
# Line 2981  switch(op_code) Line 3320  switch(op_code)
3320    return next == -ESC_d;    return next == -ESC_d;
3321    
3322    case OP_WHITESPACE:    case OP_WHITESPACE:
3323    return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;    return next == -ESC_S || next == -ESC_d || next == -ESC_w;
3324    
3325    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3326    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
3327    
3328    case OP_HSPACE:    case OP_HSPACE:
3329    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
# Line 3032  Arguments: Line 3371  Arguments:
3371    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3372    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3373    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3374    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3375    reqbyteptr     set to the last literal character required, else < 0    reqcharptr     set to the last literal character required, else < 0
3376    bcptr          points to current branch chain    bcptr          points to current branch chain
3377      cond_depth     conditional nesting depth
3378    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3379    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3380                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 3044  Returns:         TRUE on success Line 3384  Returns:         TRUE on success
3384  */  */
3385    
3386  static BOOL  static BOOL
3387  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3388    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3389      pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3390    compile_data *cd, int *lengthptr)    compile_data *cd, int *lengthptr)
3391  {  {
3392  int repeat_type, op_type;  int repeat_type, op_type;
3393  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3394  int bravalue = 0;  int bravalue = 0;
3395  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3396  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
3397  int zeroreqbyte, zerofirstbyte;  pcre_int32 zeroreqchar, zerofirstchar;
3398  int req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
3399  int options = *optionsptr;               /* May change dynamically */  int options = *optionsptr;               /* May change dynamically */
3400  int after_manual_callout = 0;  int after_manual_callout = 0;
3401  int length_prevgroup = 0;  int length_prevgroup = 0;
3402  register int c;  register int c;
3403  register uschar *code = *codeptr;  register pcre_uchar *code = *codeptr;
3404  uschar *last_code = code;  pcre_uchar *last_code = code;
3405  uschar *orig_code = code;  pcre_uchar *orig_code = code;
3406  uschar *tempcode;  pcre_uchar *tempcode;
3407  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3408  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3409  const uschar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3410  const uschar *tempptr;  const pcre_uchar *tempptr;
3411  const uschar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
3412  uschar *previous = NULL;  pcre_uchar *previous = NULL;
3413  uschar *previous_callout = NULL;  pcre_uchar *previous_callout = NULL;
3414  uschar *save_hwm = NULL;  pcre_uchar *save_hwm = NULL;
3415  uschar classbits[32];  pcre_uint8 classbits[32];
3416    
3417  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we  /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3418  must not do this for other options (e.g. PCRE_EXTENDED) because they may change  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3419  dynamically as we process the pattern. */  dynamically as we process the pattern. */
3420    
3421  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3422  BOOL class_utf8;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3423  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
3424  uschar *class_utf8data;  pcre_uchar utf_chars[6];
 uschar *class_utf8data_base;  
 uschar utf8_char[6];  
3425  #else  #else
3426  BOOL utf8 = FALSE;  BOOL utf = FALSE;
3427  uschar *utf8_char = NULL;  #endif
3428    
3429    /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3430    
3431    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3432    BOOL xclass;
3433    pcre_uchar *class_uchardata;
3434    pcre_uchar *class_uchardata_base;
3435  #endif  #endif
3436    
3437  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 3099  greedy_non_default = greedy_default ^ 1; Line 3445  greedy_non_default = greedy_default ^ 1;
3445    
3446  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3447  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3448  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3449  find one.  find one.
3450    
3451  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3452  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3453  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3454  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3455    
3456  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3457    
3458  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3459  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3460  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS leaves the lower 28 bits empty. It is added into the
3461  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3462    value. This is used only for ASCII characters. */
3463    
3464  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3465    
3466  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3467    
# Line 3126  for (;; ptr++) Line 3473  for (;; ptr++)
3473    BOOL is_quantifier;    BOOL is_quantifier;
3474    BOOL is_recurse;    BOOL is_recurse;
3475    BOOL reset_bracount;    BOOL reset_bracount;
3476    int class_charcount;    int class_has_8bitchar;
3477    int class_lastchar;    int class_single_char;
3478    int newoptions;    int newoptions;
3479    int recno;    int recno;
3480    int refsign;    int refsign;
3481    int skipbytes;    int skipbytes;
3482    int subreqbyte;    int subreqchar;
3483    int subfirstbyte;    int subfirstchar;
3484    int terminator;    int terminator;
3485    int mclength;    int mclength;
3486    uschar mcbuffer[8];    int tempbracount;
3487      pcre_uchar mcbuffer[8];
3488    
3489    /* Get next byte in the pattern */    /* Get next character in the pattern */
3490    
3491    c = *ptr;    c = *ptr;
3492    
# Line 3160  for (;; ptr++) Line 3508  for (;; ptr++)
3508  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3509      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3510  #endif  #endif
3511      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3512            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3513        {        {
3514        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3515        goto FAILED;        goto FAILED;
# Line 3183  for (;; ptr++) Line 3532  for (;; ptr++)
3532        }        }
3533    
3534      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3535      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3536          (int)(code - last_code), c, c));
3537    
3538      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3539      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 3193  for (;; ptr++) Line 3543  for (;; ptr++)
3543        {        {
3544        if (previous > orig_code)        if (previous > orig_code)
3545          {          {
3546          memmove(orig_code, previous, code - previous);          memmove(orig_code, previous, IN_UCHARS(code - previous));
3547          code -= previous - orig_code;          code -= previous - orig_code;
3548          previous = orig_code;          previous = orig_code;
3549          }          }
# Line 3209  for (;; ptr++) Line 3559  for (;; ptr++)
3559    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3560    reference list. */    reference list. */
3561    
3562    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3563               WORK_SIZE_SAFETY_MARGIN)
3564      {      {
3565      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3566      goto FAILED;      goto FAILED;
# Line 3261  for (;; ptr++) Line 3612  for (;; ptr++)
3612    
3613    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3614      {      {
3615      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3616      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3617        {        {
3618        ptr++;        ptr++;
# Line 3269  for (;; ptr++) Line 3620  for (;; ptr++)
3620          {          {
3621          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3622          ptr++;          ptr++;
3623  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3624          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;          if (utf) FORWARDCHAR(ptr);
3625  #endif  #endif
3626          }          }
3627        if (*ptr != 0) continue;        if (*ptr != 0) continue;
# Line 3294  for (;; ptr++) Line 3645  for (;; ptr++)
3645      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3646      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3647      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
3648      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3649      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
3650      *codeptr = code;      *codeptr = code;
3651      *ptrptr = ptr;      *ptrptr = ptr;
3652      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 3319  for (;; ptr++) Line 3670  for (;; ptr++)
3670      previous = NULL;      previous = NULL;
3671      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3672        {        {
3673        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3674        *code++ = OP_CIRCM;        *code++ = OP_CIRCM;
3675        }        }
3676      else *code++ = OP_CIRC;      else *code++ = OP_CIRC;
# Line 3331  for (;; ptr++) Line 3682  for (;; ptr++)
3682      break;      break;
3683    
3684      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3685      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3686    
3687      case CHAR_DOT:      case CHAR_DOT:
3688      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3689      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
3690      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
3691      previous = code;      previous = code;
3692      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3693      break;      break;
# Line 3391  for (;; ptr++) Line 3742  for (;; ptr++)
3742          {          {
3743          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
3744            ptr++;            ptr++;
3745          else if (strncmp((const char *)ptr+1,          else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
                           STR_Q STR_BACKSLASH STR_E, 3) == 0)  
3746            ptr += 3;            ptr += 3;
3747          else          else
3748            break;            break;
# Line 3411  for (;; ptr++) Line 3761  for (;; ptr++)
3761          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3762        {        {
3763        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3764        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3765        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
3766        break;        break;
3767        }        }
3768    
# Line 3422  for (;; ptr++) Line 3772  for (;; ptr++)
3772    
3773      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3774    
3775      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3776      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3777      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3778        a single character. */
3779    
3780      class_charcount = 0;      class_has_8bitchar = 0;
3781      class_lastchar = -1;      class_single_char = 0;
3782    
3783      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3784      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
3785      than 256), because in that case the compiled code doesn't use the bit map.      than 256), because in that case the compiled code doesn't use the bit map.
3786      */      */
3787    
3788      memset(classbits, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
3789    
3790  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3791      class_utf8 = FALSE;                       /* No chars >= 256 */      xclass = FALSE;                           /* No chars >= 256 */
3792      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3793      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3794  #endif  #endif
3795    
3796      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3448  for (;; ptr++) Line 3799  for (;; ptr++)
3799    
3800      if (c != 0) do      if (c != 0) do
3801        {        {
3802        const uschar *oldptr;        const pcre_uchar *oldptr;
3803    
3804  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3805        if (utf8 && c > 127)        if (utf && HAS_EXTRALEN(c))
3806          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3807          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3808          }          }
3809    #endif
3810    
3811        /* In the pre-compile phase, accumulate the length of any UTF-8 extra  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3812          /* In the pre-compile phase, accumulate the length of any extra
3813        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
3814        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
3815        (which is on the stack). */        (which is on the stack). */
3816    
3817        if (lengthptr != NULL)        if (lengthptr != NULL)
3818          {          {
3819          *lengthptr += class_utf8data - class_utf8data_base;          *lengthptr += class_uchardata - class_uchardata_base;
3820          class_utf8data = class_utf8data_base;          class_uchardata = class_uchardata_base;
3821          }          }
   
3822  #endif  #endif
3823    
3824        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 3494  for (;; ptr++) Line 3846  for (;; ptr++)
3846          {          {
3847          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3848          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3849          register const uschar *cbits = cd->cbits;          register const pcre_uint8 *cbits = cd->cbits;
3850          uschar pbits[32];          pcre_uint8 pbits[32];
3851    
3852          if (ptr[1] != CHAR_COLON)          if (ptr[1] != CHAR_COLON)
3853            {            {
# Line 3550  for (;; ptr++) Line 3902  for (;; ptr++)
3902          /* Copy in the first table (always present) */          /* Copy in the first table (always present) */
3903    
3904          memcpy(pbits, cbits + posix_class_maps[posix_class],          memcpy(pbits, cbits + posix_class_maps[posix_class],
3905            32 * sizeof(uschar));            32 * sizeof(pcre_uint8));
3906    
3907          /* If there is a second table, add or remove it as required. */          /* If there is a second table, add or remove it as required. */
3908    
# Line 3581  for (;; ptr++) Line 3933  for (;; ptr++)
3933            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3934    
3935          ptr = tempptr + 1;          ptr = tempptr + 1;
3936          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 characters. */
3937            class_has_8bitchar = 1;
3938            /* Every class contains at least two characters. */
3939            class_single_char = 2;
3940          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3941          }          }
3942    
3943        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3944        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3945        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3946        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3947        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3948        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3949          as literal characters (by default), or are faulted if
3950        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3951    
3952        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3599  for (;; ptr++) Line 3955  for (;; ptr++)
3955          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3956    
3957          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3958            else if (-c == ESC_N)            /* \N is not supported in a class */
3959              {
3960              *errorcodeptr = ERR71;
3961              goto FAILED;
3962              }
3963          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3964            {            {
3965            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3612  for (;; ptr++) Line 3973  for (;; ptr++)
3973    
3974          if (c < 0)          if (c < 0)
3975            {            {
3976            register const uschar *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3977            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
3978              class_has_8bitchar++;
3979              /* Every class contains at least two characters. */
3980              class_single_char += 2;
3981    
3982            switch (-c)            switch (-c)
3983              {              {
# Line 3626  for (;; ptr++) Line 3990  for (;; ptr++)
3990              case ESC_SU:              case ESC_SU:
3991              nestptr = ptr;              nestptr = ptr;
3992              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3993              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
3994              continue;              continue;
3995  #endif  #endif
3996              case ESC_d:              case ESC_d:
# Line 3649  for (;; ptr++) Line 4013  for (;; ptr++)
4013    
4014              /* Perl 5.004 onwards omits VT from \s, but we must preserve it              /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4015              if it was previously set by something earlier in the character              if it was previously set by something earlier in the character
4016              class. */              class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and
4017                EBCDIC, so we lazily just adjust the appropriate bit. */
4018    
4019              case ESC_s:              case ESC_s:
4020              classbits[0] |= cbits[cbit_space];              classbits[0] |= cbits[cbit_space];
# Line 3664  for (;; ptr++) Line 4029  for (;; ptr++)
4029              continue;              continue;
4030    
4031              case ESC_h:              case ESC_h:
4032              SETBIT(classbits, 0x09); /* VT */              SETBIT(classbits, CHAR_HT);
4033              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, CHAR_SPACE);
4034    #ifndef EBCDIC
4035              SETBIT(classbits, 0xa0); /* NSBP */              SETBIT(classbits, 0xa0); /* NSBP */
4036  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4037              if (utf8)              xclass = TRUE;
4038                *class_uchardata++ = XCL_SINGLE;
4039                *class_uchardata++ = 0x1680;
4040                *class_uchardata++ = XCL_SINGLE;
4041                *class_uchardata++ = 0x180e;
4042                *class_uchardata++ = XCL_RANGE;
4043                *class_uchardata++ = 0x2000;
4044                *class_uchardata++ = 0x200a;
4045                *class_uchardata++ = XCL_SINGLE;
4046                *class_uchardata++ = 0x202f;
4047                *class_uchardata++ = XCL_SINGLE;
4048                *class_uchardata++ = 0x205f;
4049                *class_uchardata++ = XCL_SINGLE;
4050                *class_uchardata++ = 0x3000;
4051    #elif defined SUPPORT_UTF
4052                if (utf)
4053                {                {
4054                class_utf8 = TRUE;                xclass = TRUE;
4055                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4056                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4057                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4058                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4059                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4060                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4061                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4062                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4063                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4064                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4065                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4066                *class_utf8data++ = XCL_SINGLE;                *class_uchardata++ = XCL_SINGLE;
4067                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4068                }                }
4069  #endif  #endif
4070    #endif  /* Not EBCDIC */
4071              continue;              continue;
4072    
4073              case ESC_H:              case ESC_H:
# Line 3694  for (;; ptr++) Line 4076  for (;; ptr++)
4076                int x = 0xff;                int x = 0xff;
4077                switch (c)                switch (c)
4078                  {                  {
4079                  case 0x09/8: x ^= 1 << (0x09%8); break;                  case CHAR_HT/8:    x ^= 1 << (CHAR_HT%8); break;
4080                  case 0x20/8: x ^= 1 << (0x20%8); break;                  case CHAR_SPACE/8: x ^= 1 << (CHAR_SPACE%8); break;
4081                  case 0xa0/8: x ^= 1 << (0xa0%8); break;  #ifndef EBCDIC
4082                    case 0xa0/8: x ^= 1 << (0xa0%8); break;  /* NBSP */
4083    #endif
4084                  default: break;                  default: break;
4085                  }                  }
4086                classbits[c] |= x;                classbits[c] |= x;
4087                }                }
4088    #ifndef EBCDIC
4089  #ifdef SUPPORT_UTF8  #ifndef COMPILE_PCRE8
4090              if (utf8)              xclass = TRUE;
4091                *class_uchardata++ = XCL_RANGE;
4092                *class_uchardata++ = 0x0100;
4093                *class_uchardata++ = 0x167f;
4094                *class_uchardata++ = XCL_RANGE;
4095                *class_uchardata++ = 0x1681;
4096                *class_uchardata++ = 0x180d;
4097                *class_uchardata++ = XCL_RANGE;
4098                *class_uchardata++ = 0x180f;
4099                *class_uchardata++ = 0x1fff;
4100                *class_uchardata++ = XCL_RANGE;
4101                *class_uchardata++ = 0x200b;
4102                *class_uchardata++ = 0x202e;
4103                *class_uchardata++ = XCL_RANGE;
4104                *class_uchardata++ = 0x2030;
4105                *class_uchardata++ = 0x205e;
4106                *class_uchardata++ = XCL_RANGE;
4107                *class_uchardata++ = 0x2060;
4108                *class_uchardata++ = 0x2fff;
4109                *class_uchardata++ = XCL_RANGE;
4110                *class_uchardata++ = 0x3001;
4111    #ifdef SUPPORT_UTF
4112                if (utf)
4113                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4114                else
4115    #endif   /* SUPPORT_UTF */
4116                  *class_uchardata++ = 0xffff;
4117    #elif defined SUPPORT_UTF
4118                if (utf)
4119                {                {
4120                class_utf8 = TRUE;                xclass = TRUE;
4121                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4122                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4123                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4124                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4125                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4126                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4127                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4128                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4129                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4130                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4131                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4132                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4133                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4134                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4135                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4136                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4137                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4138                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4139                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4140                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4141                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4142                }                }
4143  #endif  #endif
4144    #endif  /* Not EBCDIC */
4145              continue;              continue;
4146    
4147              case ESC_v:              case ESC_v:
4148              SETBIT(classbits, 0x0a); /* LF */              SETBIT(classbits, CHAR_LF);
4149              SETBIT(classbits, 0x0b); /* VT */              SETBIT(classbits, CHAR_VT);
4150              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, CHAR_FF);
4151              SETBIT(classbits, 0x0d); /* CR */              SETBIT(classbits, CHAR_CR);
4152              SETBIT(classbits, 0x85); /* NEL */              SETBIT(classbits, CHAR_NEL);
4153  #ifdef SUPPORT_UTF8  #ifndef EBCDIC
4154              if (utf8)  #ifndef COMPILE_PCRE8
4155                xclass = TRUE;
4156                *class_uchardata++ = XCL_RANGE;
4157                *class_uchardata++ = 0x2028;
4158                *class_uchardata++ = 0x2029;
4159    #elif defined SUPPORT_UTF
4160                if (utf)
4161                {                {
4162                class_utf8 = TRUE;                xclass = TRUE;
4163                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4164                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4165                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4166                }                }
4167  #endif  #endif
4168    #endif  /* Not EBCDIC */
4169              continue;              continue;
4170    
4171              case ESC_V:              case ESC_V:
# Line 3754  for (;; ptr++) Line 4174  for (;; ptr++)
4174                int x = 0xff;                int x = 0xff;
4175                switch (c)                switch (c)
4176                  {                  {
4177                  case 0x0a/8: x ^= 1 << (0x0a%8);                  case CHAR_LF/8: x ^= 1 << (CHAR_LF%8);
4178                               x ^= 1 << (0x0b%8);                                  x ^= 1 << (CHAR_VT%8);
4179                               x ^= 1 << (0x0c%8);                                  x ^= 1 << (CHAR_FF%8);
4180                               x ^= 1 << (0x0d%8);                                  x ^= 1 << (CHAR_CR%8);
4181                               break;                                  break;
4182                  case 0x85/8: x ^= 1 << (0x85%8); break;                  case CHAR_NEL/8: x ^= 1 << (CHAR_NEL%8); break;
4183                  default: break;                  default: break;
4184                  }                  }
4185                classbits[c] |= x;                classbits[c] |= x;
4186                }                }
4187    
4188  #ifdef SUPPORT_UTF8  #ifndef EBCDIC
4189              if (utf8)  #ifndef COMPILE_PCRE8
4190                xclass = TRUE;
4191                *class_uchardata++ = XCL_RANGE;
4192                *class_uchardata++ = 0x0100;
4193                *class_uchardata++ = 0x2027;
4194                *class_uchardata++ = XCL_RANGE;
4195                *class_uchardata++ = 0x202a;
4196    #ifdef SUPPORT_UTF
4197                if (utf)
4198                  class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4199                else
4200    #endif
4201                  *class_uchardata++ = 0xffff;
4202    #elif defined SUPPORT_UTF
4203                if (utf)
4204                {                {
4205                class_utf8 = TRUE;                xclass = TRUE;
4206                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4207                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4208                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4209                *class_utf8data++ = XCL_RANGE;                *class_uchardata++ = XCL_RANGE;
4210                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4211                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4212                }                }
4213  #endif  #endif
4214    #endif  /* Not EBCDIC */
4215              continue;              continue;
4216    
4217  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
# Line 3787  for (;; ptr++) Line 4222  for (;; ptr++)
4222                int pdata;                int pdata;
4223                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4224                if (ptype < 0) goto FAILED;                if (ptype < 0) goto FAILED;
4225                class_utf8 = TRUE;                xclass = TRUE;
4226                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_uchardata++ = ((-c == ESC_p) != negated)?
4227                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4228                *class_utf8data++ = ptype;                *class_uchardata++ = ptype;
4229                *class_utf8data++ = pdata;                *class_uchardata++ = pdata;
4230                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4231                continue;                continue;
4232                }                }
4233  #endif  #endif
# Line 3806  for (;; ptr++) Line 4241  for (;; ptr++)
4241                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4242                goto FAILED;                goto FAILED;
4243                }                }
4244              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4245              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4246                c = *ptr;                /* Get the final character and fall through */
4247              break;              break;
4248              }              }
4249            }            }
4250    
4251          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4252          greater than 256 in UTF-8 mode. */          greater than 256. */
4253    
4254          }   /* End of backslash handling */          }   /* End of backslash handling */
4255    
# Line 3861  for (;; ptr++) Line 4297  for (;; ptr++)
4297            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4298            }            }
4299    
4300  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4301          if (utf8)          if (utf)
4302            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4303            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4304            }            }
# Line 3906  for (;; ptr++) Line 4342  for (;; ptr++)
4342    
4343          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4344    
4345            /* Since we found a character range, single character optimizations
4346            cannot be done anymore. */
4347            class_single_char = 2;
4348    
4349          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4350          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4351          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4352          available. */          available. */
4353    
4354  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4355          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4356    #elif defined  SUPPORT_UTF
4357            if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4358    #elif !(defined COMPILE_PCRE8)
4359            if (d > 255)
4360    #endif
4361    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4362            {            {
4363            class_utf8 = TRUE;            xclass = TRUE;
4364    
4365            /* With UCP support, we can find the other case equivalents of            /* With UCP support, we can find the other case equivalents of
4366            the relevant characters. There may be several ranges. Optimize how            the relevant characters. There may be several ranges. Optimize how
4367            they fit with the basic range. */            they fit with the basic range. */
4368    
4369  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4370    #ifndef COMPILE_PCRE8
4371              if (utf && (options & PCRE_CASELESS) != 0)
4372    #else
4373            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4374    #endif
4375              {              {
4376              unsigned int occ, ocd;              unsigned int occ, ocd;
4377              unsigned int cc = c;              unsigned int cc = c;
# Line 3947  for (;; ptr++) Line 4397  for (;; ptr++)
4397    
4398                if (occ == ocd)                if (occ == ocd)
4399                  {                  {
4400                  *class_utf8data++ = XCL_SINGLE;                  *class_uchardata++ = XCL_SINGLE;
4401                  }                  }
4402                else                else
4403                  {                  {
4404                  *class_utf8data++ = XCL_RANGE;                  *class_uchardata++ = XCL_RANGE;
4405                  class_utf8data += _pcre_ord2utf8(occ, class_utf8data);                  class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4406                  }                  }
4407                class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);                class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4408                }                }
4409              }              }
4410  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 3962  for (;; ptr++) Line 4412  for (;; ptr++)
4412            /* Now record the original range, possibly modified for UCP caseless            /* Now record the original range, possibly modified for UCP caseless
4413            overlapping ranges. */            overlapping ranges. */
4414    
4415            *class_utf8data++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4416            class_utf8data += _pcre_ord2utf8(c, class_utf8data);  #ifdef SUPPORT_UTF
4417            class_utf8data += _pcre_ord2utf8(d, class_utf8data);  #ifndef COMPILE_PCRE8
4418              if (utf)
4419                {
4420                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4421                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4422                }
4423              else
4424                {
4425                *class_uchardata++ = c;
4426                *class_uchardata++ = d;
4427                }
4428    #else
4429              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4430              class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4431    #endif
4432    #else /* SUPPORT_UTF */
4433              *class_uchardata++ = c;
4434              *class_uchardata++ = d;
4435    #endif /* SUPPORT_UTF */
4436    
4437            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4438            caseless matching for UTF-8 characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
4439            for the smaller ones. */            for the smaller ones. As for 16 bit characters without UTF, we
4440              can still use the bit map for characters below 256. */
4441    
4442  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4443            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4444  #else            if (utf)
4445            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  #endif
4446                continue;    /* With next character in the class */
4447    #endif  /* SUPPORT_UCP */
4448    
4449    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4450              if (utf)
4451                {
4452                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4453                /* Adjust upper limit and fall through to set up the map */
4454                d = 127;
4455                }
4456              else
4457                {
4458                if (c > 255) continue;
4459                /* Adjust upper limit and fall through to set up the map */
4460                d = 255;
4461                }
4462    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4463              if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4464            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
   
4465            d = 127;            d = 127;
4466    #else
4467  #endif  /* SUPPORT_UCP */            if (c > 255) continue;
4468              /* Adjust upper limit and fall through to set up the map */
4469              d = 255;
4470    #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4471            }            }
4472  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4473    
4474          /* We use the bit map for all cases when not in UTF-8 mode; else          /* We use the bit map for 8 bit mode, or when the characters fall
4475          ranges that lie entirely within 0-127 when there is UCP support; else          partially or entirely within the [0-255] ([0-127] for UCP) ranges. */
         for partial ranges without UCP support. */  
4476    
4477          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4478    
4479          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4480    
# Line 3997  for (;; ptr++) Line 4483  for (;; ptr++)
4483            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4484            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4485              {              {
4486              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c]; /* flip case */
4487              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
4488              }              }
4489            }            }
# Line 4011  for (;; ptr++) Line 4497  for (;; ptr++)
4497    
4498        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4499    
4500        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4501    
4502  #ifdef SUPPORT_UTF8        if (class_single_char < 2) class_single_char++;
4503        if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))  
4504          /* If class_single_char is 1, we saw precisely one character. As long as
4505          there was no use of \p or \P, in other words, no use of any XCLASS
4506          features, we can optimize.
4507    
4508          The optimization throws away the bit map. We turn the item into a
4509          1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4510          In the positive case, it can cause firstchar to be set. Otherwise, there
4511          can be no first char if this item is first, whatever repeat count may
4512          follow. In the case of reqchar, save the previous value for reinstating. */
4513    
4514          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4515          {          {
4516          class_utf8 = TRUE;          ptr++;
4517          *class_utf8data++ = XCL_SINGLE;          zeroreqchar = reqchar;
4518          class_utf8data += _pcre_ord2utf8(c, class_utf8data);  
4519            if (negate_class)
4520              {
4521              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4522              zerofirstchar = firstchar;
4523              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4524    #ifdef SUPPORT_UTF
4525              if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4526                code += PRIV(ord2utf)(c, code);
4527              else
4528    #endif
4529                *code++ = c;
4530              goto NOT_CHAR;
4531              }
4532    
4533            /* For a single, positive character, get the value into mcbuffer, and
4534            then we can handle this with the normal one-character code. */
4535    
4536    #ifdef SUPPORT_UTF
4537            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4538              mclength = PRIV(ord2utf)(c, mcbuffer);
4539            else
4540    #endif
4541              {
4542              mcbuffer[0] = c;
4543              mclength = 1;
4544              }
4545            goto ONE_CHAR;
4546            }       /* End of 1-char optimization */
4547    
4548          /* Handle a character that cannot go in the bit map. */
4549    
4550    #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4551          if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4552    #elif defined SUPPORT_UTF
4553          if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4554    #elif !(defined COMPILE_PCRE8)
4555          if (c > 255)
4556    #endif
4557    
4558    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4559            {
4560            xclass = TRUE;
4561            *class_uchardata++ = XCL_SINGLE;
4562    #ifdef SUPPORT_UTF
4563    #ifndef COMPILE_PCRE8
4564            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4565            if (!utf)
4566              *class_uchardata++ = c;
4567            else
4568    #endif
4569              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4570    #else /* SUPPORT_UTF */
4571            *class_uchardata++ = c;
4572    #endif /* SUPPORT_UTF */
4573    
4574  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4575    #ifdef COMPILE_PCRE8
4576          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4577    #else
4578            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4579            if (utf && (options & PCRE_CASELESS) != 0)
4580    #endif
4581            {            {
4582            unsigned int othercase;            unsigned int othercase;
4583            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((int)(othercase = UCD_OTHERCASE(c)) != c)
4584              {              {
4585              *class_utf8data++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4586              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4587              }              }
4588            }            }
4589  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
4590    
4591          }          }
4592        else        else
4593  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4594    
4595        /* Handle a single-byte character */        /* Handle a single-byte character */
4596          {          {
4597            class_has_8bitchar = 1;
4598          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4599          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4600            {            {
4601            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c]; /* flip case */
4602            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4603            }            }
         class_charcount++;  
         class_lastchar = c;  
4604          }          }
4605        }        }
4606    
# Line 4066  for (;; ptr++) Line 4621  for (;; ptr++)
4621        goto FAILED;        goto FAILED;
4622        }        }
4623    
4624      /* If class_charcount is 1, we saw precisely one character whose value is      /* If this is the first thing in the branch, there can be no first char
4625      less than 256. As long as there were no characters >= 128 and there was no      setting, whatever the repeat count. Any reqchar setting must remain
4626      use of \p or \P, in other words, no use of any XCLASS features, we can      unchanged after any kind of repeat. */
4627      optimize.  
4628        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4629      In UTF-8 mode, we can optimize the negative case only if there were no      zerofirstchar = firstchar;
4630      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR      zeroreqchar = reqchar;
     operate on single-bytes characters only. This is an historical hangover.  
     Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
     Note that OP_NOT[I] does not support multibyte characters. In the positive  
     case, it can cause firstbyte to be set. Otherwise, there can be no first  
     char if this item is first, whatever repeat count may follow. In the case  
     of reqbyte, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF8  
     if (class_charcount == 1 && !class_utf8 &&  
       (!utf8 || !negate_class || class_lastchar < 128))  
 #else  
     if (class_charcount == 1)  
 #endif  
       {  
       zeroreqbyte = reqbyte;  
   
       /* The OP_NOT[I] opcodes work on one-byte characters only. */  
   
       if (negate_class)  
         {  
         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
         zerofirstbyte = firstbyte;  
         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF8  
       if (utf8 && class_lastchar > 127)  
         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqbyte setting must remain unchanged after any kind of  
     repeat. */  
   
     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;  
     zerofirstbyte = firstbyte;  
     zeroreqbyte = reqbyte;  
4631    
4632      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4633      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
# Line 4135  for (;; ptr++) Line 4637  for (;; ptr++)
4637      be listed) there are no characters < 256, we can omit the bitmap in the      be listed) there are no characters < 256, we can omit the bitmap in the
4638      actual compiled code. */      actual compiled code. */
4639    
4640  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4641      if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))      if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4642    #elif !defined COMPILE_PCRE8
4643        if (xclass && !should_flip_negation)
4644    #endif
4645    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4646        {        {
4647        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4648        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
4649        code += LINK_SIZE;        code += LINK_SIZE;
4650        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT:0;
4651    
4652        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4653        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4654    
4655        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4656          {          {
4657          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4658          memmove(code + 32, code, class_utf8data - code);          memmove(code + (32 / sizeof(pcre_uchar)), code,
4659              IN_UCHARS(class_uchardata - code));
4660          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
4661          code = class_utf8data + 32;          code = class_uchardata + (32 / sizeof(pcre_uchar));
4662          }          }
4663        else code = class_utf8data;        else code = class_uchardata;
4664    
4665        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4666    
4667        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4668        break;   /* End of class handling */        break;   /* End of class handling */
4669        }        }
4670  #endif  #endif
# Line 4169  for (;; ptr++) Line 4676  for (;; ptr++)
4676      negating it if necessary. */      negating it if necessary. */
4677    
4678      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4679      if (negate_class)      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
       {  
       if (lengthptr == NULL)    /* Save time in the pre-compile phase */  
         for (c = 0; c < 32; c++) code[c] = ~classbits[c];  
       }  
     else  
4680        {        {
4681          if (negate_class)
4682            for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4683        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4684        }        }
4685      code += 32;      code += 32 / sizeof(pcre_uchar);
4686        NOT_CHAR:
4687      break;      break;
4688    
4689    
# Line 4215  for (;; ptr++) Line 4720  for (;; ptr++)
4720    
4721      if (repeat_min == 0)      if (repeat_min == 0)
4722        {        {
4723        firstbyte = zerofirstbyte;    /* Adjust for zero repeat */        firstchar = zerofirstchar;    /* Adjust for zero repeat */
4724        reqbyte = zeroreqbyte;        /* Ditto */        reqchar = zeroreqchar;        /* Ditto */
4725        }        }
4726    
4727      /* Remember whether this is a variable length repeat */      /* Remember whether this is a variable length repeat */
# Line 4249  for (;; ptr++) Line 4754  for (;; ptr++)
4754        ptr++;        ptr++;
4755        }        }
4756      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4757    
4758      /* If previous was a recursion call, wrap it in atomic brackets so that      /* If previous was a recursion call, wrap it in atomic brackets so that
4759      previous becomes the atomic group. All recursions were so wrapped in the      previous becomes the atomic group. All recursions were so wrapped in the
4760      past, but it no longer happens for non-repeated recursions. In fact, the      past, but it no longer happens for non-repeated recursions. In fact, the
4761      repeated ones could be re-implemented independently so as not to need this,      repeated ones could be re-implemented independently so as not to need this,
4762      but for the moment we rely on the code for repeating groups. */      but for the moment we rely on the code for repeating groups. */
4763    
4764      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4765        {        {
4766        memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4767        *previous = OP_ONCE;        *previous = OP_ONCE;
4768        PUT(previous, 1, 2 + 2*LINK_SIZE);        PUT(previous, 1, 2 + 2*LINK_SIZE);
4769        previous[2 + 2*LINK_SIZE] = OP_KET;        previous[2 + 2*LINK_SIZE] = OP_KET;
4770        PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);        PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4771        code += 2 + 2 * LINK_SIZE;        code += 2 + 2 * LINK_SIZE;
4772        length_prevgroup = 3 + 3*LINK_SIZE;        length_prevgroup = 3 + 3*LINK_SIZE;
4773    
4774        /* When actually compiling, we need to check whether this was a forward        /* When actually compiling, we need to check whether this was a forward
4775        reference, and if so, adjust the offset. */        reference, and if so, adjust the offset. */
4776    
4777        if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)        if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4778          {          {
4779          int offset = GET(cd->hwm, -LINK_SIZE);          int offset = GET(cd->hwm, -LINK_SIZE);
4780          if (offset == previous + 1 - cd->start_code)          if (offset == previous + 1 - cd->start_code)
4781            PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);            PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4782          }          }
4783        }        }
4784    
4785      /* Now handle repetition for the different types of item. */      /* Now handle repetition for the different types of item. */
4786    
4787      /* If previous was a character match, abolish the item and generate a      /* If previous was a character or negated character match, abolish the item
4788      repeat item instead. If a char item has a minumum of more than one, ensure      and generate a repeat item instead. If a char item has a minimum of more
4789      that it is set in reqbyte - it might not be if a sequence such as x{3} is      than one, ensure that it is set in reqchar - it might not be if a sequence
4790      the first thing in a branch because the x will have gone into firstbyte      such as x{3} is the first thing in a branch because the x will have gone
4791      instead.  */      into firstchar instead.  */
4792    
4793      if (*previous == OP_CHAR || *previous == OP_CHARI)      if (*previous == OP_CHAR || *previous == OP_CHARI
4794            || *previous == OP_NOT || *previous == OP_NOTI)
4795        {        {
4796        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;        switch (*previous)
4797            {
4798            default: /* Make compiler happy. */
4799            case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
4800            case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4801            case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
4802            case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
4803            }
4804    
4805        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF characters that take up more than one character. It's
4806        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4807        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus UTF_LENGTH to flag that
4808        length rather than a small character. */        it's a length rather than a small character. */
4809    
4810  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4811        if (utf8 && (code[-1] & 0x80) != 0)        if (utf && NOT_FIRSTCHAR(code[-1]))
4812          {          {
4813          uschar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4814          while((*lastchar & 0xc0) == 0x80) lastchar--;          BACKCHAR(lastchar);
4815          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4816          memcpy(utf8_char, lastchar, c); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4817          c |= 0x80;                      /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4818          }          }
4819        else        else
4820  #endif  #endif /* SUPPORT_UTF */
   
       /* Handle the case of a single byte - either with no UTF8 support, or  
       with UTF-8 disabled, or for a UTF-8 character < 128. */  
4821    
4822          /* Handle the case of a single character - either with no UTF support, or
4823          with UTF disabled, or for a single character UTF character. */
4824          {          {
4825          c = code[-1];          c = code[-1];
4826          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (*previous <= OP_CHARI && repeat_min > 1)
4827              reqchar = c | req_caseopt | cd->req_varyopt;
4828          }          }
4829    
4830        /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
# Line 4321  for (;; ptr++) Line 4834  for (;; ptr++)
4834    
4835        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4836            repeat_max < 0 &&            repeat_max < 0 &&
4837            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4838          {          {
4839          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4840          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4330  for (;; ptr++) Line 4843  for (;; ptr++)
4843        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4844        }        }
4845    
     /* If previous was a single negated character ([^a] or similar), we use  
     one of the special opcodes, replacing it. The code is shared with single-  
     character repeats by setting opt_type to add a suitable offset into  
     repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI  
     are currently used only for single-byte chars. */  
   
     else if (*previous == OP_NOT || *previous == OP_NOTI)  
       {  
       op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;  
       c = previous[1];  
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf8, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
       goto OUTPUT_SINGLE_REPEAT;  
       }  
   
4846      /* If previous was a character type match (\d or similar), abolish it and      /* If previous was a character type match (\d or similar), abolish it and
4847      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
4848      repeats by setting op_type to add a suitable offset into repeat_type. Note      repeats by setting op_type to add a suitable offset into repeat_type. Note
# Line 4359  for (;; ptr++) Line 4852  for (;; ptr++)
4852    
4853      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
4854        {        {
4855        uschar *oldcode;        pcre_uchar *oldcode;
4856        int prop_type, prop_value;        int prop_type, prop_value;
4857        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4858        c = *previous;        c = *previous;
4859    
4860        if (!possessive_quantifier &&        if (!possessive_quantifier &&
4861            repeat_max < 0 &&            repeat_max < 0 &&
4862            check_auto_possessive(previous, utf8, ptr + 1, options, cd))            check_auto_possessive(previous, utf, ptr + 1, options, cd))
4863          {          {
4864          repeat_type = 0;    /* Force greedy */          repeat_type = 0;    /* Force greedy */
4865          possessive_quantifier = TRUE;          possessive_quantifier = TRUE;
# Line 4446  for (;; ptr++) Line 4939  for (;; ptr++)
4939          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
4940          Unicode property match, there are two extra bytes that define the          Unicode property match, there are two extra bytes that define the
4941          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
4942          c, with the 0x80 bit as a flag. */          c, with the UTF_LENGTH bit as a flag. */
4943    
4944          if (repeat_max < 0)          if (repeat_max < 0)
4945            {            {
4946  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4947            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4948              {              {
4949              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4950              code += c & 7;              code += c & 7;
4951              }              }
4952            else            else
# Line 4475  for (;; ptr++) Line 4968  for (;; ptr++)
4968    
4969          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
4970            {            {
4971  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4972            if (utf8 && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4973              {              {
4974              memcpy(code, utf8_char, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4975              code += c & 7;              code += c & 7;
4976              }              }
4977            else            else
# Line 4505  for (;; ptr++) Line 4998  for (;; ptr++)
4998    
4999        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
5000    
5001  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
5002        if (utf8 && c >= 128)        if (utf && (c & UTF_LENGTH) != 0)
5003          {          {
5004          memcpy(code, utf8_char, c & 7);          memcpy(code, utf_chars, IN_UCHARS(c & 7));
5005          code += c & 7;          code += c & 7;
5006          }          }
5007        else        else
# Line 4532  for (;; ptr++) Line 5025  for (;; ptr++)
5025    
5026      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
5027               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
5028  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5029               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
5030  #endif  #endif
5031               *previous == OP_REF ||               *previous == OP_REF ||
# Line 4574  for (;; ptr++) Line 5067  for (;; ptr++)
5067      opcodes such as BRA and CBRA, as this is the place where they get converted      opcodes such as BRA and CBRA, as this is the place where they get converted
5068      into the more special varieties such as BRAPOS and SBRA. A test for >=      into the more special varieties such as BRAPOS and SBRA. A test for >=
5069      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5070      ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow      ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5071      repetition of assertions, but now it does, for Perl compatibility. */      repetition of assertions, but now it does, for Perl compatibility. */
5072    
5073      else if (*previous >= OP_ASSERT && *previous <= OP_COND)      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5074        {        {
5075        register int i;        register int i;
5076        int len = (int)(code - previous);        int len = (int)(code - previous);
5077        uschar *bralink = NULL;        pcre_uchar *bralink = NULL;
5078        uschar *brazeroptr = NULL;        pcre_uchar *brazeroptr = NULL;
5079    
5080        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5081        we just ignore the repeat. */        we just ignore the repeat. */
5082    
5083        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5084          goto END_REPEAT;          goto END_REPEAT;
5085    
5086        /* There is no sense in actually repeating assertions. The only potential        /* There is no sense in actually repeating assertions. The only potential
5087        use of repetition is in cases when the assertion is optional. Therefore,        use of repetition is in cases when the assertion is optional. Therefore,
5088        if the minimum is greater than zero, just ignore the repeat. If the        if the minimum is greater than zero, just ignore the repeat. If the
5089        maximum is not zero or one, set it to 1. */        maximum is not zero or one, set it to 1. */
5090    
5091        if (*previous < OP_ONCE)    /* Assertion */        if (*previous < OP_ONCE)    /* Assertion */
5092          {          {
5093          if (repeat_min > 0) goto END_REPEAT;          if (repeat_min > 0) goto END_REPEAT;
5094          if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;          if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5095          }          }
5096    
5097        /* The case of a zero minimum is special because of the need to stick        /* The case of a zero minimum is special because of the need to stick
5098        OP_BRAZERO in front of it, and because the group appears once in the        OP_BRAZERO in front of it, and because the group appears once in the
# Line 4635  for (;; ptr++) Line 5128  for (;; ptr++)
5128          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5129            {            {
5130            *code = OP_END;            *code = OP_END;
5131            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf, cd, save_hwm);
5132            memmove(previous+1, previous, len);            memmove(previous + 1, previous, IN_UCHARS(len));
5133            code++;            code++;
5134            if (repeat_max == 0)            if (repeat_max == 0)
5135              {              {
# Line 4659  for (;; ptr++) Line 5152  for (;; ptr++)
5152            {            {
5153            int offset;            int offset;
5154            *code = OP_END;            *code = OP_END;
5155            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);            adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5156            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5157            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
5158            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
5159            *previous++ = OP_BRA;            *previous++ = OP_BRA;
# Line 4706  for (;; ptr++) Line 5199  for (;; ptr++)
5199              *lengthptr += delta;              *lengthptr += delta;
5200              }              }
5201    
5202            /* This is compiling for real */            /* This is compiling for real. If there is a set first byte for
5203              the group, and we have not yet set a "required byte", set it. Make
5204              sure there is enough workspace for copying forward references before
5205              doing the copy. */
5206    
5207            else            else
5208              {              {
5209              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5210    
5211              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5212                {                {
5213                uschar *hc;                pcre_uchar *hc;
5214                uschar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5215                memcpy(code, previous, len);                memcpy(code, previous, IN_UCHARS(len));
5216    
5217                  while (cd->hwm > cd->start_workspace + cd->workspace_size -
5218                         WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5219                    {
5220                    int save_offset = save_hwm - cd->start_workspace;
5221                    int this_offset = this_hwm - cd->start_workspace;
5222                    *errorcodeptr = expand_workspace(cd);
5223                    if (*errorcodeptr != 0) goto FAILED;
5224                    save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5225                    this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5226                    }
5227    
5228                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5229                  {                  {
5230                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 4765  for (;; ptr++) Line 5274  for (;; ptr++)
5274    
5275          else for (i = repeat_max - 1; i >= 0; i--)          else for (i = repeat_max - 1; i >= 0; i--)
5276            {            {
5277            uschar *hc;            pcre_uchar *hc;
5278            uschar *this_hwm = cd->hwm;            pcre_uchar *this_hwm = cd->hwm;
5279    
5280            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
5281    
# Line 4782  for (;; ptr++) Line 5291  for (;; ptr++)
5291              PUTINC(code, 0, offset);              PUTINC(code, 0, offset);
5292              }              }
5293    
5294            memcpy(code, previous, len);            memcpy(code, previous, IN_UCHARS(len));
5295    
5296              /* Ensure there is enough workspace for forward references before
5297              copying them. */
5298    
5299              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5300                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5301                {
5302                int save_offset = save_hwm - cd->start_workspace;
5303                int this_offset = this_hwm - cd->start_workspace;
5304                *errorcodeptr = expand_workspace(cd);
5305                if (*errorcodeptr != 0) goto FAILED;
5306                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5307                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5308                }
5309    
5310            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5311              {              {
5312              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 4799  for (;; ptr++) Line 5323  for (;; ptr++)
5323            {            {
5324            int oldlinkoffset;            int oldlinkoffset;
5325            int offset = (int)(code - bralink + 1);            int offset = (int)(code - bralink + 1);
5326            uschar *bra = code - offset;            pcre_uchar *bra = code - offset;
5327            oldlinkoffset = GET(bra, 1);            oldlinkoffset = GET(bra, 1);
5328            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5329            *code++ = OP_KET;            *code++ = OP_KET;
# Line 4814  for (;; ptr++) Line 5338  for (;; ptr++)
5338        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5339        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
5340    
5341        Otherwise, if the quantifier was possessive, we convert the BRA code to        Otherwise, when we are doing the actual compile phase, check to see
5342        the POS form, and the KET code to KETRPOS. (It turns out to be convenient        whether this group is one that could match an empty string. If so,
5343        at runtime to detect this kind of subpattern at both the start and at the        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5344        end.) The use of special opcodes makes it possible to reduce greatly the        that runtime checking can be done. [This check is also applied to ONCE
5345        stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO,        groups at runtime, but in a different way.]
5346        convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that  
5347        the default action below, of wrapping everything inside atomic brackets,        Then, if the quantifier was possessive and the bracket is not a
5348        does not happen.        conditional, we convert the BRA code to the POS form, and the KET code to
5349          KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5350        Then, when we are doing the actual compile phase, check to see whether        subpattern at both the start and at the end.) The use of special opcodes
5351        this group is one that could match an empty string. If so, convert the        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5352        initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so that runtime        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5353        checking can be done. [This check is also applied to ONCE groups at  
5354        runtime, but in a different way.] */        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5355          flag so that the default action below, of wrapping everything inside
5356          atomic brackets, does not happen. When the minimum is greater than 1,
5357          there will be earlier copies of the group, and so we still have to wrap
5358          the whole thing. */
5359    
5360        else        else
5361          {          {
5362          uschar *ketcode = code - 1 - LINK_SIZE;          pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5363          uschar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5364    
5365          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;          /* Convert possessive ONCE brackets to non-capturing */
5366          if (*bracode == OP_ONCE)  
5367            if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5368                possessive_quantifier) *bracode = OP_BRA;
5369    
5370            /* For non-possessive ONCE brackets, all we need to do is to
5371            set the KET. */
5372    
5373            if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5374            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5375    
5376            /* Handle non-ONCE brackets and possessive ONCEs (which have been
5377            converted to non-capturing above). */
5378    
5379          else          else
5380            {            {
5381            if (possessive_quantifier)            /* In the compile phase, check for empty string matching. */
             {  
             *bracode += 1;                   /* Switch to xxxPOS opcodes */  
             *ketcode = OP_KETRPOS;  
             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;  
             possessive_quantifier = FALSE;  
             }  
           else *ketcode = OP_KETRMAX + repeat_type;  
5382    
5383            if (lengthptr == NULL)            if (lengthptr == NULL)
5384              {              {
5385              uschar *scode = bracode;              pcre_uchar *scode = bracode;
5386              do              do
5387                {                {
5388                if (could_be_empty_branch(scode, ketcode, utf8, cd))                if (could_be_empty_branch(scode, ketcode, utf, cd))
5389                  {                  {
5390                  *bracode += OP_SBRA - OP_BRA;                  *bracode += OP_SBRA - OP_BRA;
5391                  break;                  break;
# Line 4862  for (;; ptr++) Line 5394  for (;; ptr++)
5394                }                }
5395              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5396              }              }
5397    
5398              /* Handle possessive quantifiers. */
5399    
5400              if (possessive_quantifier)
5401                {
5402                /* For COND brackets, we wrap the whole thing in a possessively
5403                repeated non-capturing bracket, because we have not invented POS
5404                versions of the COND opcodes. Because we are moving code along, we
5405                must ensure that any pending recursive references are updated. */
5406    
5407                if (*bracode == OP_COND || *bracode == OP_SCOND)
5408                  {
5409                  int nlen = (int)(code - bracode);
5410                  *code = OP_END;
5411                  adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5412                  memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5413                  code += 1 + LINK_SIZE;
5414                  nlen += 1 + LINK_SIZE;
5415                  *bracode = OP_BRAPOS;
5416                  *code++ = OP_KETRPOS;
5417                  PUTINC(code, 0, nlen);
5418                  PUT(bracode, 1, nlen);
5419                  }
5420    
5421                /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5422    
5423                else
5424                  {
5425                  *bracode += 1;              /* Switch to xxxPOS opcodes */
5426                  *ketcode = OP_KETRPOS;
5427                  }
5428    
5429                /* If the minimum is zero, mark it as possessive, then unset the
5430                possessive flag when the minimum is 0 or 1. */
5431    
5432                if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5433                if (repeat_min < 2) possessive_quantifier = FALSE;
5434                }
5435    
5436              /* Non-possessive quantifier */
5437    
5438              else *ketcode = OP_KETRMAX + repeat_type;
5439            }            }
5440          }          }
5441        }        }
# Line 4886  for (;; ptr++) Line 5460  for (;; ptr++)
5460      there are special alternative opcodes for this case. For anything else, we      there are special alternative opcodes for this case. For anything else, we
5461      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5462      notation is just syntactic sugar, taken from Sun's Java package, but the      notation is just syntactic sugar, taken from Sun's Java package, but the
5463      special opcodes can optimize it.      special opcodes can optimize it.
5464    
5465      Possessively repeated subpatterns have already been handled in the code      Some (but not all) possessively repeated subpatterns have already been
5466      just above, so possessive_quantifier is always FALSE for them at this      completely handled in the code just above. For them, possessive_quantifier
5467      stage.      is always FALSE at this stage.
5468    
5469      Note that the repeated item starts at tempcode, not at previous, which      Note that the repeated item starts at tempcode, not at previous, which
5470      might be the first part of a string whose (former) last char we repeated.      might be the first part of a string whose (former) last char we repeated.
5471    
# Line 4904  for (;; ptr++) Line 5478  for (;; ptr++)
5478        int len;        int len;
5479    
5480        if (*tempcode == OP_TYPEEXACT)        if (*tempcode == OP_TYPEEXACT)
5481          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
5482            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);            ((tempcode[1 + IMM2_SIZE] == OP_PROP
5483              || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);