/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 782 by zherczeg, Sat Dec 3 23:58:37 2011 UTC revision 806 by zherczeg, Thu Dec 15 11:57:39 2011 UTC
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
# Line 88  so this number is very generous. Line 92  so this number is very generous.
92  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
93  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
94  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
96    filled up by repetitions of forward references, for example patterns like
97    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98    that the workspace is expanded using malloc() in this situation. The value
99    below is therefore a minimum, and we put a maximum on it for safety. The
100    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101    kicks in at the same number of forward references in all cases. */
102    
103  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
104    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
105    
106  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
107  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
108    
109  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
110    
111  /* Private flags added to firstchar and reqchar. */  /* Private flags added to firstchar and reqchar. */
112    
113  #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */  #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
114  #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */  #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
115    
116    /* Repeated character flags. */
117    
118    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
119    
120  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
121  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
122  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
# Line 470  static const char error_texts[] = Line 485  static const char error_texts[] =
485    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486    /* 70 */    /* 70 */
487    "internal error: unknown opcode in find_fixedlength()\0"    "internal error: unknown opcode in find_fixedlength()\0"
488    "Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff)\0"    "\\N is not supported in a class\0"
489      "too many forward references\0"
490      "disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff)\0"
491    ;    ;
492    
493  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 645  return s; Line 662  return s;
662    
663    
664  /*************************************************  /*************************************************
665    *           Expand the workspace                 *
666    *************************************************/
667    
668    /* This function is called during the second compiling phase, if the number of
669    forward references fills the existing workspace, which is originally a block on
670    the stack. A larger block is obtained from malloc() unless the ultimate limit
671    has been reached or the increase will be rather small.
672    
673    Argument: pointer to the compile data block
674    Returns:  0 if all went well, else an error number
675    */
676    
677    static int
678    expand_workspace(compile_data *cd)
679    {
680    pcre_uchar *newspace;
681    int newsize = cd->workspace_size * 2;
682    
683    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
684    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
685        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
686     return ERR72;
687    
688    newspace = (PUBL(malloc))(newsize);
689    if (newspace == NULL) return ERR21;
690    
691    memcpy(newspace, cd->start_workspace, cd->workspace_size);
692    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
693    if (cd->workspace_size > COMPILE_WORK_SIZE)
694      (PUBL(free))((void *)cd->start_workspace);
695    cd->start_workspace = newspace;
696    cd->workspace_size = newsize;
697    return 0;
698    }
699    
700    
701    
702    /*************************************************
703  *            Check for counted repeat            *  *            Check for counted repeat            *
704  *************************************************/  *************************************************/
705    
# Line 1009  else Line 1064  else
1064    
1065        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1066          {          {
1067          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR71;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1068          ptr = pt;          ptr = pt;
1069          break;          break;
1070          }          }
# Line 1807  for (;;) Line 1862  for (;;)
1862      cc++;      cc++;
1863      break;      break;
1864    
1865      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1866      otherwise \C is coded as OP_ALLANY. */      otherwise \C is coded as OP_ALLANY. */
1867    
1868      case OP_ANYBYTE:      case OP_ANYBYTE:
# Line 2353  for (code = first_significant_code(code Line 2408  for (code = first_significant_code(code
2408      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2409      here. */      here. */
2410    
2411  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2412      case OP_XCLASS:      case OP_XCLASS:
2413      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2414      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 2363  for (code = first_significant_code(code Line 2418  for (code = first_significant_code(code
2418      case OP_NCLASS:      case OP_NCLASS:
2419      ccode = code + PRIV(OP_lengths)[OP_CLASS];      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2420    
2421  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2422      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2423  #endif  #endif
2424    
# Line 2896  static BOOL Line 2951  static BOOL
2951  check_auto_possessive(const pcre_uchar *previous, BOOL utf,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2952    const pcre_uchar *ptr, int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2953  {  {
2954  int c, next;  pcre_int32 c, next;
2955  int op_code = *previous++;  int op_code = *previous++;
2956    
2957  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2905  if ((options & PCRE_EXTENDED) != 0) Line 2960  if ((options & PCRE_EXTENDED) != 0)
2960    {    {
2961    for (;;)    for (;;)
2962      {      {
2963      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2964      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2965        {        {
2966        ptr++;        ptr++;
# Line 2932  if (*ptr == CHAR_BACKSLASH) Line 2987  if (*ptr == CHAR_BACKSLASH)
2987    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
2988    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
2989    }    }
2990    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
2991    {    {
2992  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2993    if (utf) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
2994  #endif  #endif
2995    next = *ptr++;    next = *ptr++;
2996    }    }
   
2997  else return FALSE;  else return FALSE;
2998    
2999  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2949  if ((options & PCRE_EXTENDED) != 0) Line 3002  if ((options & PCRE_EXTENDED) != 0)
3002    {    {
3003    for (;;)    for (;;)
3004      {      {
3005      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3006      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3007        {        {
3008        ptr++;        ptr++;
# Line 2978  the next item is a character. */ Line 3031  the next item is a character. */
3031  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3032    {    {
3033    case OP_CHAR:    case OP_CHAR:
3034  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3035    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3036  #else  #else
3037    c = *previous;    c = *previous;
# Line 2990  if (next >= 0) switch(op_code) Line 3043  if (next >= 0) switch(op_code)
3043    high-valued characters. */    high-valued characters. */
3044    
3045    case OP_CHARI:    case OP_CHARI:
3046  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3047    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3048  #else  #else
3049    c = *previous;    c = *previous;
3050  #endif  #endif
3051    if (c == next) return FALSE;    if (c == next) return FALSE;
3052  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3053    if (utf)    if (utf)
3054      {      {
3055      unsigned int othercase;      unsigned int othercase;
# Line 3009  if (next >= 0) switch(op_code) Line 3062  if (next >= 0) switch(op_code)
3062      return (unsigned int)c != othercase;      return (unsigned int)c != othercase;
3063      }      }
3064    else    else
3065  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3066    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */
3067    
3068    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
3069    opcodes are not used for multi-byte characters, because they are coded using    opcodes are not used for multi-byte characters, because they are coded using
# Line 3021  if (next >= 0) switch(op_code) Line 3074  if (next >= 0) switch(op_code)
3074    
3075    case OP_NOTI:    case OP_NOTI:
3076    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
3077  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3078    if (utf)    if (utf)
3079      {      {
3080      unsigned int othercase;      unsigned int othercase;
# Line 3034  if (next >= 0) switch(op_code) Line 3087  if (next >= 0) switch(op_code)
3087      return (unsigned int)c == othercase;      return (unsigned int)c == othercase;
3088      }      }
3089    else    else
3090  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3091    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */
3092    
3093    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3094    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
# Line 3126  switch(op_code) Line 3179  switch(op_code)
3179    {    {
3180    case OP_CHAR:    case OP_CHAR:
3181    case OP_CHARI:    case OP_CHARI:
3182  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3183    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3184  #else  #else
3185    c = *previous;    c = *previous;
# Line 3356  pcre_uint8 classbits[32]; Line 3409  pcre_uint8 classbits[32];
3409  must not do this for other options (e.g. PCRE_EXTENDED) because they may change  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3410  dynamically as we process the pattern. */  dynamically as we process the pattern. */
3411    
3412  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3413  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3414  BOOL utf = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
3415  pcre_uchar utf_chars[6];  pcre_uchar utf_chars[6];
# Line 3411  for (;; ptr++) Line 3464  for (;; ptr++)
3464    BOOL is_quantifier;    BOOL is_quantifier;
3465    BOOL is_recurse;    BOOL is_recurse;
3466    BOOL reset_bracount;    BOOL reset_bracount;
3467    int class_charcount;    int class_has_8bitchar;
3468    int class_lastchar;    int class_single_char;
3469    int newoptions;    int newoptions;
3470    int recno;    int recno;
3471    int refsign;    int refsign;
# Line 3446  for (;; ptr++) Line 3499  for (;; ptr++)
3499  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3500      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3501  #endif  #endif
3502      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3503            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3504        {        {
3505        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3506        goto FAILED;        goto FAILED;
# Line 3471  for (;; ptr++) Line 3525  for (;; ptr++)
3525      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3526      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3527        (int)(code - last_code), c, c));        (int)(code - last_code), c, c));
3528    
3529      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3530      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
3531      if "previous" is NULL, reset the current code pointer to the start. */      if "previous" is NULL, reset the current code pointer to the start. */
# Line 3496  for (;; ptr++) Line 3550  for (;; ptr++)
3550    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3551    reference list. */    reference list. */
3552    
3553    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3554               WORK_SIZE_SAFETY_MARGIN)
3555      {      {
3556      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3557      goto FAILED;      goto FAILED;
# Line 3548  for (;; ptr++) Line 3603  for (;; ptr++)
3603    
3604    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3605      {      {
3606      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3607      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3608        {        {
3609        ptr++;        ptr++;
# Line 3708  for (;; ptr++) Line 3763  for (;; ptr++)
3763    
3764      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3765    
3766      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3767      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3768      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3769        a single character. */
3770    
3771      class_charcount = 0;      class_has_8bitchar = 0;
3772      class_lastchar = -1;      class_single_char = 0;
3773    
3774      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3775      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
# Line 3736  for (;; ptr++) Line 3792  for (;; ptr++)
3792        {        {
3793        const pcre_uchar *oldptr;        const pcre_uchar *oldptr;
3794    
3795  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3796        if (utf && c > 127)        if (utf && HAS_EXTRALEN(c))
3797          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3798          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3799          }          }
# Line 3868  for (;; ptr++) Line 3924  for (;; ptr++)
3924            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3925    
3926          ptr = tempptr + 1;          ptr = tempptr + 1;
3927          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3928            class_has_8bitchar = 1;
3929            /* Every class contains at least two characters. */
3930            class_single_char = 2;
3931          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3932          }          }
3933    
3934        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3935        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3936        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3937        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3938        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3939        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3940          as literal characters (by default), or are faulted if
3941        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3942    
3943        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3886  for (;; ptr++) Line 3946  for (;; ptr++)
3946          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3947    
3948          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3949            else if (-c == ESC_N)            /* \N is not supported in a class */
3950              {
3951              *errorcodeptr = ERR71;
3952              goto FAILED;
3953              }
3954          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3955            {            {
3956            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3900  for (;; ptr++) Line 3965  for (;; ptr++)
3965          if (c < 0)          if (c < 0)
3966            {            {
3967            register const pcre_uint8 *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3968            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
3969              class_has_8bitchar++;
3970              /* Every class contains at least two characters. */
3971              class_single_char += 2;
3972    
3973            switch (-c)            switch (-c)
3974              {              {
# Line 3913  for (;; ptr++) Line 3981  for (;; ptr++)
3981              case ESC_SU:              case ESC_SU:
3982              nestptr = ptr;              nestptr = ptr;
3983              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3984              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
3985              continue;              continue;
3986  #endif  #endif
3987              case ESC_d:              case ESC_d:
# Line 4079  for (;; ptr++) Line 4147  for (;; ptr++)
4147                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4148                *class_uchardata++ = ptype;                *class_uchardata++ = ptype;
4149                *class_uchardata++ = pdata;                *class_uchardata++ = pdata;
4150                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4151                continue;                continue;
4152                }                }
4153  #endif  #endif
# Line 4093  for (;; ptr++) Line 4161  for (;; ptr++)
4161                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4162                goto FAILED;                goto FAILED;
4163                }                }
4164              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4165              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4166                c = *ptr;                /* Get the final character and fall through */
4167              break;              break;
4168              }              }
4169            }            }
4170    
4171          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4172          greater than 256 mode. */          greater than 256. */
4173    
4174          }   /* End of backslash handling */          }   /* End of backslash handling */
4175    
# Line 4148  for (;; ptr++) Line 4217  for (;; ptr++)
4217            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4218            }            }
4219    
4220  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4221          if (utf)          if (utf)
4222            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4223            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
# Line 4193  for (;; ptr++) Line 4262  for (;; ptr++)
4262    
4263          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4264    
4265            /* Since we found a character range, single character optimizations
4266            cannot be done anymore. */
4267            class_single_char = 2;
4268    
4269          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4270          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4271          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4272          available. */          available. */
4273    
4274  #ifdef SUPPORT_UTF  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4275            if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4276    #elif defined  SUPPORT_UTF
4277          if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4278  #endif  #elif !(defined COMPILE_PCRE8)
 #ifndef COMPILE_PCRE8  
4279          if (d > 255)          if (d > 255)
4280  #endif  #endif
4281  #if defined SUPPORT_UTF || defined COMPILE_PCRE16  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4282            {            {
4283            xclass = TRUE;            xclass = TRUE;
4284    
# Line 4213  for (;; ptr++) Line 4287  for (;; ptr++)
4287            they fit with the basic range. */            they fit with the basic range. */
4288    
4289  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4290    #ifndef COMPILE_PCRE8
4291              if (utf && (options & PCRE_CASELESS) != 0)
4292    #else
4293            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4294    #endif
4295              {              {
4296              unsigned int occ, ocd;              unsigned int occ, ocd;
4297              unsigned int cc = c;              unsigned int cc = c;
# Line 4256  for (;; ptr++) Line 4334  for (;; ptr++)
4334    
4335            *class_uchardata++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4336  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4337    #ifndef COMPILE_PCRE8
4338              if (utf)
4339                {
4340                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4341                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4342                }
4343              else
4344                {
4345                *class_uchardata++ = c;
4346                *class_uchardata++ = d;
4347                }
4348    #else
4349            class_uchardata += PRIV(ord2utf)(c, class_uchardata);            class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4350            class_uchardata += PRIV(ord2utf)(d, class_uchardata);            class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4351  #else  #endif
4352    #else /* SUPPORT_UTF */
4353            *class_uchardata++ = c;            *class_uchardata++ = c;
4354            *class_uchardata++ = d;            *class_uchardata++ = d;
4355  #endif  #endif /* SUPPORT_UTF */
4356    
4357            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4358            caseless matching for UTF characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
# Line 4269  for (;; ptr++) Line 4360  for (;; ptr++)
4360            can still use  */            can still use  */
4361    
4362  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4363            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4364  #else            if (utf)
4365  #ifdef SUPPORT_UTF  #endif
4366                continue;    /* With next character in the class */
4367    #endif  /* SUPPORT_UCP */
4368    
4369    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4370              if (utf)
4371                {
4372                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4373                /* Adjust upper limit and fall through to set up the map */
4374                d = 127;
4375                }
4376              else
4377                {
4378                if (c > 255) continue;
4379                /* Adjust upper limit and fall through to set up the map */
4380                d = 255;
4381                }
4382    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4383            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4384            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
4385            d = 127;            d = 127;
# Line 4279  for (;; ptr++) Line 4387  for (;; ptr++)
4387            if (c > 255) continue;            if (c > 255) continue;
4388            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
4389            d = 255;            d = 255;
4390  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
 #endif  /* SUPPORT_UCP */  
4391            }            }
4392  #endif  /* SUPPORT_UTF8 || COMPILE_PCRE16 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4393    
4394          /* We use the bit map for 8 bit mode, or when the characters fall          /* We use the bit map for 8 bit mode, or when the characters fall
4395          partially or entirely to [0-255] ([0-127] for UCP) ranges. */          partially or entirely to [0-255] ([0-127] for UCP) ranges. */
4396    
4397          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4398    
4399          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4400    
# Line 4297  for (;; ptr++) Line 4403  for (;; ptr++)
4403            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4404            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4405              {              {
4406              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c]; /* flip case */
4407              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
4408              }              }
4409            }            }
# Line 4311  for (;; ptr++) Line 4417  for (;; ptr++)
4417    
4418        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4419    
4420        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4421          if (class_single_char < 2) class_single_char++;
4422    
4423          /* If class_single_char is 1, we saw precisely one character. As long as
4424          there were no negated characters >= 128 and there was no use of \p or \P,
4425          in other words, no use of any XCLASS features, we can optimize.
4426    
4427          In UTF-8 mode, we can optimize the negative case only if there were no
4428          characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4429          operate on single-byte characters only. This is an historical hangover.
4430          Maybe one day we can tidy these opcodes to handle multi-byte characters.
4431    
4432          The optimization throws away the bit map. We turn the item into a
4433          1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4434          Note that OP_NOT[I] does not support multibyte characters. In the positive
4435          case, it can cause firstchar to be set. Otherwise, there can be no first
4436          char if this item is first, whatever repeat count may follow. In the case
4437          of reqchar, save the previous value for reinstating. */
4438    
4439  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4440        if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))        if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
4441            && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4442    #else
4443          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4444  #endif  #endif
4445  #ifndef COMPILE_PCRE8          {
4446            ptr++;
4447            zeroreqchar = reqchar;
4448    
4449            /* The OP_NOT[I] opcodes work on single characters only. */
4450    
4451            if (negate_class)
4452              {
4453              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4454              zerofirstchar = firstchar;
4455              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4456              *code++ = c;
4457              goto NOT_CHAR;
4458              }
4459    
4460            /* For a single, positive character, get the value into mcbuffer, and
4461            then we can handle this with the normal one-character code. */
4462    
4463    #ifdef SUPPORT_UTF
4464            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4465              mclength = PRIV(ord2utf)(c, mcbuffer);
4466            else
4467    #endif
4468              {
4469              mcbuffer[0] = c;
4470              mclength = 1;
4471              }
4472            goto ONE_CHAR;
4473            }       /* End of 1-char optimization */
4474    
4475          /* Handle a character that cannot go in the bit map. */
4476    
4477    #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4478          if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4479    #elif defined SUPPORT_UTF
4480          if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4481    #elif !(defined COMPILE_PCRE8)
4482        if (c > 255)        if (c > 255)
4483  #endif  #endif
4484  #if defined SUPPORT_UTF || defined COMPILE_PCRE16  
4485    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4486          {          {
4487          xclass = TRUE;          xclass = TRUE;
4488          *class_uchardata++ = XCL_SINGLE;          *class_uchardata++ = XCL_SINGLE;
4489  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4490          class_uchardata += PRIV(ord2utf)(c, class_uchardata);  #ifndef COMPILE_PCRE8
4491  #else          /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4492          *class_uchardata++ = c;          if (!utf)
4493              *class_uchardata++ = c;
4494            else
4495  #endif  #endif
4496              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4497    #else /* SUPPORT_UTF */
4498            *class_uchardata++ = c;
4499    #endif /* SUPPORT_UTF */
4500    
4501  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4502    #ifdef COMPILE_PCRE8
4503          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4504    #else
4505            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4506            if (utf && (options & PCRE_CASELESS) != 0)
4507    #endif
4508            {            {
4509            unsigned int othercase;            unsigned int othercase;
4510            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((othercase = UCD_OTHERCASE(c)) != c)
# Line 4343  for (;; ptr++) Line 4517  for (;; ptr++)
4517    
4518          }          }
4519        else        else
4520  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
4521    
4522        /* Handle a single-byte character */        /* Handle a single-byte character */
4523          {          {
4524            class_has_8bitchar = 1;
4525          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4526          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4527            {            {
4528            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c]; /* flip case */
4529            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4530            }            }
         class_charcount++;  
         class_lastchar = c;  
4531          }          }
4532        }        }
4533    
# Line 4375  for (;; ptr++) Line 4548  for (;; ptr++)
4548        goto FAILED;        goto FAILED;
4549        }        }
4550    
4551      /* If class_charcount is 1, we saw precisely one character whose value is      /* If this is the first thing in the branch, there can be no first char
4552      less than 256. As long as there were no characters >= 128 and there was no      setting, whatever the repeat count. Any reqchar setting must remain
4553      use of \p or \P, in other words, no use of any XCLASS features, we can      unchanged after any kind of repeat. */
     optimize.  
   
     In UTF-8 mode, we can optimize the negative case only if there were no  
     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR  
     operate on single-bytes characters only. This is an historical hangover.  
     Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
     Note that OP_NOT[I] does not support multibyte characters. In the positive  
     case, it can cause firstchar to be set. Otherwise, there can be no first  
     char if this item is first, whatever repeat count may follow. In the case  
     of reqchar, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF  
     if (class_charcount == 1 && !xclass &&  
       (!utf || !negate_class || class_lastchar < 128))  
 #else  
     if (class_charcount == 1)  
 #endif  
       {  
       zeroreqchar = reqchar;  
   
       /* The OP_NOT[I] opcodes work on one-byte characters only. */  
   
       if (negate_class)  
         {  
         if (firstchar == REQ_UNSET) firstchar = REQ_NONE;  
         zerofirstchar = firstchar;  
         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF8  
       if (utf && class_lastchar > 127)  
         mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqchar setting must remain unchanged after any kind of  
     repeat. */  
4554    
4555      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4556      zerofirstchar = firstchar;      zerofirstchar = firstchar;
# Line 4460  for (;; ptr++) Line 4580  for (;; ptr++)
4580        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4581        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4582    
4583        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4584          {          {
4585          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4586          memmove(code + (32 / sizeof(pcre_uchar)), code,          memmove(code + (32 / sizeof(pcre_uchar)), code,
# Line 4472  for (;; ptr++) Line 4592  for (;; ptr++)
4592    
4593        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4594    
4595        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4596        break;   /* End of class handling */        break;   /* End of class handling */
4597        }        }
4598  #endif  #endif
# Line 4491  for (;; ptr++) Line 4611  for (;; ptr++)
4611        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4612        }        }
4613      code += 32 / sizeof(pcre_uchar);      code += 32 / sizeof(pcre_uchar);
4614        NOT_CHAR:
4615      break;      break;
4616    
4617    
# Line 4567  for (;; ptr++) Line 4688  for (;; ptr++)
4688      past, but it no longer happens for non-repeated recursions. In fact, the      past, but it no longer happens for non-repeated recursions. In fact, the
4689      repeated ones could be re-implemented independently so as not to need this,      repeated ones could be re-implemented independently so as not to need this,
4690      but for the moment we rely on the code for repeating groups. */      but for the moment we rely on the code for repeating groups. */
4691    
4692      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4693        {        {
4694        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
# Line 4603  for (;; ptr++) Line 4724  for (;; ptr++)
4724    
4725        /* Deal with UTF characters that take up more than one character. It's        /* Deal with UTF characters that take up more than one character. It's
4726        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4727        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus UTF_LENGTH to flag that
4728        length rather than a small character. */        it's a length rather than a small character. */
4729    
4730  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4731        if (utf && (code[-1] & 0x80) != 0)        if (utf && NOT_FIRSTCHAR(code[-1]))
4732          {          {
4733          pcre_uchar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4734          BACKCHAR(lastchar);          BACKCHAR(lastchar);
4735          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4736          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4737          c |= 0x80;                      /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4738          }          }
4739        else        else
4740  #endif  #endif /* SUPPORT_UTF */
4741    
4742        /* Handle the case of a single character - either with no UTF support, or        /* Handle the case of a single character - either with no UTF support, or
4743        with UTF disabled, or for a single character UTF character. */        with UTF disabled, or for a single character UTF character. */
   
4744          {          {
4745          c = code[-1];          c = code[-1];
4746          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
# Line 4758  for (;; ptr++) Line 4878  for (;; ptr++)
4878          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
4879          Unicode property match, there are two extra bytes that define the          Unicode property match, there are two extra bytes that define the
4880          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
4881          c, with the 0x80 bit as a flag. */          c, with the UTF_LENGTH bit as a flag. */
4882    
4883          if (repeat_max < 0)          if (repeat_max < 0)
4884            {            {
4885  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4886            if (utf && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4887              {              {
4888              memcpy(code, utf_chars, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4889              code += c & 7;              code += c & 7;
4890              }              }
4891            else            else
# Line 4787  for (;; ptr++) Line 4907  for (;; ptr++)
4907    
4908          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
4909            {            {
4910  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4911            if (utf && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4912              {              {
4913              memcpy(code, utf_chars, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4914              code += c & 7;              code += c & 7;
4915              }              }
4916            else            else
# Line 4817  for (;; ptr++) Line 4937  for (;; ptr++)
4937    
4938        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
4939    
4940  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4941        if (utf && c >= 128)        if (utf && (c & UTF_LENGTH) != 0)
4942          {          {
4943          memcpy(code, utf_chars, c & 7);          memcpy(code, utf_chars, IN_UCHARS(c & 7));
4944          code += c & 7;          code += c & 7;
4945          }          }
4946        else        else
# Line 4844  for (;; ptr++) Line 4964  for (;; ptr++)
4964    
4965      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
4966               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
4967  #if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4968               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
4969  #endif  #endif
4970               *previous == OP_REF ||               *previous == OP_REF ||
# Line 5018  for (;; ptr++) Line 5138  for (;; ptr++)
5138              *lengthptr += delta;              *lengthptr += delta;
5139              }              }
5140    
5141            /* This is compiling for real */            /* This is compiling for real. If there is a set first byte for
5142              the group, and we have not yet set a "required byte", set it. Make
5143              sure there is enough workspace for copying forward references before
5144              doing the copy. */
5145    
5146            else            else
5147              {              {
5148              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5149    
5150              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5151                {                {
5152                pcre_uchar *hc;                pcre_uchar *hc;
5153                pcre_uchar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5154                memcpy(code, previous, IN_UCHARS(len));                memcpy(code, previous, IN_UCHARS(len));
5155    
5156                  while (cd->hwm > cd->start_workspace + cd->workspace_size -
5157                         WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5158                    {
5159                    int save_offset = save_hwm - cd->start_workspace;
5160                    int this_offset = this_hwm - cd->start_workspace;
5161                    *errorcodeptr = expand_workspace(cd);
5162                    if (*errorcodeptr != 0) goto FAILED;
5163                    save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5164                    this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5165                    }
5166    
5167                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5168                  {                  {
5169                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 5095  for (;; ptr++) Line 5231  for (;; ptr++)
5231              }              }
5232    
5233            memcpy(code, previous, IN_UCHARS(len));            memcpy(code, previous, IN_UCHARS(len));
5234    
5235              /* Ensure there is enough workspace for forward references before
5236              copying them. */
5237    
5238              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5239                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5240                {
5241                int save_offset = save_hwm - cd->start_workspace;
5242                int this_offset = this_hwm - cd->start_workspace;
5243                *errorcodeptr = expand_workspace(cd);
5244                if (*errorcodeptr != 0) goto FAILED;
5245                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5246                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5247                }
5248    
5249            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5250              {              {
5251              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 5125  for (;; ptr++) Line 5276  for (;; ptr++)
5276        ONCE brackets can be converted into non-capturing brackets, as the        ONCE brackets can be converted into non-capturing brackets, as the
5277        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5278        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
5279    
5280        Otherwise, when we are doing the actual compile phase, check to see        Otherwise, when we are doing the actual compile phase, check to see
5281        whether this group is one that could match an empty string. If so,        whether this group is one that could match an empty string. If so,
5282        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5283        that runtime checking can be done. [This check is also applied to ONCE        that runtime checking can be done. [This check is also applied to ONCE
5284        groups at runtime, but in a different way.]        groups at runtime, but in a different way.]
5285    
5286        Then, if the quantifier was possessive and the bracket is not a        Then, if the quantifier was possessive and the bracket is not a
5287        conditional, we convert the BRA code to the POS form, and the KET code to        conditional, we convert the BRA code to the POS form, and the KET code to
5288        KETRPOS. (It turns out to be convenient at runtime to detect this kind of        KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5289        subpattern at both the start and at the end.) The use of special opcodes        subpattern at both the start and at the end.) The use of special opcodes
5290        makes it possible to reduce greatly the stack usage in pcre_exec(). If        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5291        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5292    
5293        Then, if the minimum number of matches is 1 or 0, cancel the possessive        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5294        flag so that the default action below, of wrapping everything inside        flag so that the default action below, of wrapping everything inside
5295        atomic brackets, does not happen. When the minimum is greater than 1,        atomic brackets, does not happen. When the minimum is greater than 1,
5296        there will be earlier copies of the group, and so we still have to wrap        there will be earlier copies of the group, and so we still have to wrap
5297        the whole thing. */        the whole thing. */
5298    
5299        else        else
# Line 5151  for (;; ptr++) Line 5302  for (;; ptr++)
5302          pcre_uchar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5303    
5304          /* Convert possessive ONCE brackets to non-capturing */          /* Convert possessive ONCE brackets to non-capturing */
5305    
5306          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5307              possessive_quantifier) *bracode = OP_BRA;              possessive_quantifier) *bracode = OP_BRA;
5308    
5309          /* For non-possessive ONCE brackets, all we need to do is to          /* For non-possessive ONCE brackets, all we need to do is to
5310          set the KET. */          set the KET. */
5311    
5312          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5313            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5314    
5315          /* Handle non-ONCE brackets and possessive ONCEs (which have been          /* Handle non-ONCE brackets and possessive ONCEs (which have been
5316          converted to non-capturing above). */          converted to non-capturing above). */
5317    
5318          else          else
5319            {            {
5320            /* In the compile phase, check for empty string matching. */            /* In the compile phase, check for empty string matching. */
5321    
5322            if (lengthptr == NULL)            if (lengthptr == NULL)
5323              {              {
5324              pcre_uchar *scode = bracode;              pcre_uchar *scode = bracode;
# Line 5182  for (;; ptr++) Line 5333  for (;; ptr++)
5333                }                }
5334              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5335              }              }
5336    
5337            /* Handle possessive quantifiers. */            /* Handle possessive quantifiers. */
5338    
5339            if (possessive_quantifier)            if (possessive_quantifier)
# Line 5191  for (;; ptr++) Line 5342  for (;; ptr++)
5342              repeated non-capturing bracket, because we have not invented POS              repeated non-capturing bracket, because we have not invented POS
5343              versions of the COND opcodes. Because we are moving code along, we              versions of the COND opcodes. Because we are moving code along, we
5344              must ensure that any pending recursive references are updated. */              must ensure that any pending recursive references are updated. */
5345    
5346              if (*bracode == OP_COND || *bracode == OP_SCOND)              if (*bracode == OP_COND || *bracode == OP_SCOND)
5347                {                {
5348                int nlen = (int)(code - bracode);                int nlen = (int)(code - bracode);
# Line 5204  for (;; ptr++) Line 5355  for (;; ptr++)
5355                *code++ = OP_KETRPOS;                *code++ = OP_KETRPOS;
5356                PUTINC(code, 0, nlen);                PUTINC(code, 0, nlen);
5357                PUT(bracode, 1, nlen);                PUT(bracode, 1, nlen);
5358                }                }
5359    
5360              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5361    
5362              else              else
5363                {                {
5364                *bracode += 1;              /* Switch to xxxPOS opcodes */                *bracode += 1;              /* Switch to xxxPOS opcodes */
5365                *ketcode = OP_KETRPOS;                *ketcode = OP_KETRPOS;
5366                }                }
5367    
5368              /* If the minimum is zero, mark it as possessive, then unset the              /* If the minimum is zero, mark it as possessive, then unset the
5369              possessive flag when the minimum is 0 or 1. */              possessive flag when the minimum is 0 or 1. */
5370    
5371              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5372              if (repeat_min < 2) possessive_quantifier = FALSE;              if (repeat_min < 2) possessive_quantifier = FALSE;
5373              }              }
5374    
5375            /* Non-possessive quantifier */            /* Non-possessive quantifier */
5376    
5377            else *ketcode = OP_KETRMAX + repeat_type;            else *ketcode = OP_KETRMAX + repeat_type;
5378            }            }
5379          }          }
# Line 5348  for (;; ptr++) Line 5499  for (;; ptr++)
5499    
5500      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
5501    
5502      if (*(++ptr) == CHAR_ASTERISK &&      ptr++;
5503           ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5504             || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5505        {        {
5506        int i, namelen;        int i, namelen;
5507        int arglen = 0;        int arglen = 0;
# Line 5357  for (;; ptr++) Line 5509  for (;; ptr++)
5509        const pcre_uchar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5510        const pcre_uchar *arg = NULL;        const pcre_uchar *arg = NULL;
5511        previous = NULL;        previous = NULL;
5512        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        ptr++;
5513          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5514        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5515    
5516        /* It appears that Perl allows any characters whatsoever, other than        /* It appears that Perl allows any characters whatsoever, other than
# Line 5543  for (;; ptr++) Line 5696  for (;; ptr++)
5696    
5697          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
5698    
5699          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5700            {            {
5701            ptr += 1;  /* To get the right offset */            ptr += 1;  /* To get the right offset */
5702            *errorcodeptr = ERR28;            *errorcodeptr = ERR28;
# Line 5554  for (;; ptr++) Line 5707  for (;; ptr++)
5707    
5708          recno = 0;          recno = 0;
5709          name = ++ptr;          name = ++ptr;
5710          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5711            {            {
5712            if (recno >= 0)            if (recno >= 0)
5713              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
# Line 5725  for (;; ptr++) Line 5878  for (;; ptr++)
5878            break;            break;
5879    
5880            default:                /* Could be name define, else bad */            default:                /* Could be name define, else bad */
5881            if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;            if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5882                goto DEFINE_NAME;
5883            ptr++;                  /* Correct offset for error */            ptr++;                  /* Correct offset for error */
5884            *errorcodeptr = ERR24;            *errorcodeptr = ERR24;
5885            goto FAILED;            goto FAILED;
# Line 5794  for (;; ptr++) Line 5948  for (;; ptr++)
5948              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5949            name = ++ptr;            name = ++ptr;
5950    
5951            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5952            namelen = (int)(ptr - name);            namelen = (int)(ptr - name);
5953    
5954            /* In the pre-compile phase, just do a syntax check. */            /* In the pre-compile phase, just do a syntax check. */
# Line 5811  for (;; ptr++) Line 5965  for (;; ptr++)
5965                *errorcodeptr = ERR49;                *errorcodeptr = ERR49;
5966                goto FAILED;                goto FAILED;
5967                }                }
5968              if (namelen + 3 > cd->name_entry_size)              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
5969                {                {
5970                cd->name_entry_size = namelen + 3;                cd->name_entry_size = namelen + IMM2_SIZE + 1;
5971                if (namelen > MAX_NAME_SIZE)                if (namelen > MAX_NAME_SIZE)
5972                  {                  {
5973                  *errorcodeptr = ERR48;                  *errorcodeptr = ERR48;
# Line 5842  for (;; ptr++) Line 5996  for (;; ptr++)
5996    
5997              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
5998                {                {
5999                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
6000                if (crc == 0)                if (crc == 0)
6001                  {                  {
6002                  if (slot[2+namelen] == 0)                  if (slot[IMM2_SIZE+namelen] == 0)
6003                    {                    {
6004                    if (GET2(slot, 0) != cd->bracount + 1 &&                    if (GET2(slot, 0) != cd->bracount + 1 &&
6005                        (options & PCRE_DUPNAMES) == 0)                        (options & PCRE_DUPNAMES) == 0)
# Line 5897  for (;; ptr++) Line 6051  for (;; ptr++)
6051                }                }
6052    
6053              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
6054              memcpy(slot + 2, name, IN_UCHARS(namelen));              memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
6055              slot[2 + namelen] = 0;              slot[IMM2_SIZE + namelen] = 0;
6056              }              }
6057            }            }
6058    
# Line 5924  for (;; ptr++) Line 6078  for (;; ptr++)
6078    
6079          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
6080          name = ++ptr;          name = ++ptr;
6081          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6082          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6083    
6084          /* In the pre-compile phase, do a syntax check. We used to just set          /* In the pre-compile phase, do a syntax check. We used to just set
# Line 5982  for (;; ptr++) Line 6136  for (;; ptr++)
6136            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
6137              {              {
6138              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6139                  slot[2+namelen] == 0)                  slot[IMM2_SIZE+namelen] == 0)
6140                break;                break;
6141              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6142              }              }
# Line 6115  for (;; ptr++) Line 6269  for (;; ptr++)
6269                of the group. Then remember the forward reference. */                of the group. Then remember the forward reference. */
6270    
6271                called = cd->start_code + recno;                called = cd->start_code + recno;
6272                  if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6273                      WORK_SIZE_SAFETY_MARGIN)
6274                    {
6275                    *errorcodeptr = expand_workspace(cd);
6276                    if (*errorcodeptr != 0) goto FAILED;
6277                    }
6278                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6279                }                }
6280    
# Line 6135  for (;; ptr++) Line 6295  for (;; ptr++)
6295                }                }
6296              }              }
6297    
6298            /* Insert the recursion/subroutine item. */            /* Insert the recursion/subroutine item. It does not have a set first
6299              character (relevant if it is repeated, because it will then be
6300              wrapped with ONCE brackets). */
6301    
6302            *code = OP_RECURSE;            *code = OP_RECURSE;
6303            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
6304            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
6305              groupsetfirstchar = FALSE;
6306            }            }
6307    
6308          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 6501  for (;; ptr++) Line 6664  for (;; ptr++)
6664            BOOL isnumber = TRUE;            BOOL isnumber = TRUE;
6665            for (p = ptr + 1; *p != 0 && *p != terminator; p++)            for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6666              {              {
6667                if (!MAX_255(*p)) { isnumber = FALSE; break; }
6668              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
6669              if ((cd->ctypes[*p] & ctype_word) == 0) break;              if ((cd->ctypes[*p] & ctype_word) == 0) break;
6670              }              }
# Line 6623  for (;; ptr++) Line 6787  for (;; ptr++)
6787  #endif  #endif
6788          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6789          so that it works in DFA mode and in lookbehinds. */          so that it works in DFA mode and in lookbehinds. */
6790    
6791            {            {
6792            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6793            *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;            *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
6794            }            }
# Line 6636  for (;; ptr++) Line 6800  for (;; ptr++)
6800      a value > 127. We set its representation in the length/buffer, and then      a value > 127. We set its representation in the length/buffer, and then
6801      handle it as a data character. */      handle it as a data character. */
6802    
6803  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
6804      if (utf && c > 127)      if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
6805        mclength = PRIV(ord2utf)(c, mcbuffer);        mclength = PRIV(ord2utf)(c, mcbuffer);
6806      else      else
6807  #endif  #endif
# Line 6661  for (;; ptr++) Line 6825  for (;; ptr++)
6825    
6826  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6827      if (utf && HAS_EXTRALEN(c))      if (utf && HAS_EXTRALEN(c))
6828        {        ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
       INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));  
       }  
6829  #endif  #endif
6830    
6831      /* At this point we have the character's bytes in mcbuffer, and the length      /* At this point we have the character's bytes in mcbuffer, and the length
# Line 7379  compile_data *cd = &compile_block; Line 7541  compile_data *cd = &compile_block;
7541  computing the amount of memory that is needed. Compiled items are thrown away  computing the amount of memory that is needed. Compiled items are thrown away
7542  as soon as possible, so that a fairly large buffer should be sufficient for  as soon as possible, so that a fairly large buffer should be sufficient for
7543  this purpose. The same space is used in the second phase for remembering where  this purpose. The same space is used in the second phase for remembering where
7544  to fill in forward references to subpatterns. */  to fill in forward references to subpatterns. That may overflow, in which case
7545    new memory is obtained from malloc(). */
7546    
7547  pcre_uchar cworkspace[COMPILE_WORK_SIZE];  pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7548    
# Line 7435  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 7598  while (ptr[skipatstart] == CHAR_LEFT_PAR
7598    int newnl = 0;    int newnl = 0;
7599    int newbsr = 0;    int newbsr = 0;
7600    
7601    if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)  #ifdef COMPILE_PCRE8
7602      if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 5) == 0)
7603      { skipatstart += 7; options |= PCRE_UTF8; continue; }      { skipatstart += 7; options |= PCRE_UTF8; continue; }
7604    #endif
7605    #ifdef COMPILE_PCRE16
7606      if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0)
7607        { skipatstart += 8; options |= PCRE_UTF16; continue; }
7608    #endif
7609    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
7610      { skipatstart += 6; options |= PCRE_UCP; continue; }      { skipatstart += 6; options |= PCRE_UCP; continue; }
7611    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
# Line 7468  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 7637  while (ptr[skipatstart] == CHAR_LEFT_PAR
7637  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
7638  utf = (options & PCRE_UTF8) != 0;  utf = (options & PCRE_UTF8) != 0;
7639    
7640  /* Can't support UTF8 unless PCRE has been compiled to include the code. The  /* Can't support UTF unless PCRE has been compiled to include the code. The
7641  return of an error code from PRIV(valid_utf)() is a new feature, introduced in  return of an error code from PRIV(valid_utf)() is a new feature, introduced in
7642  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7643  not used here. */  not used here. */
7644    
7645  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
7646  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7647       (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)       (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
7648    {    {
# Line 7573  cd->bracount = cd->final_bracount = 0; Line 7742  cd->bracount = cd->final_bracount = 0;
7742  cd->names_found = 0;  cd->names_found = 0;
7743  cd->name_entry_size = 0;  cd->name_entry_size = 0;
7744  cd->name_table = NULL;  cd->name_table = NULL;
 cd->start_workspace = cworkspace;  
7745  cd->start_code = cworkspace;  cd->start_code = cworkspace;
7746  cd->hwm = cworkspace;  cd->hwm = cworkspace;
7747    cd->start_workspace = cworkspace;
7748    cd->workspace_size = COMPILE_WORK_SIZE;
7749  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
7750  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7751  cd->req_varyopt = 0;  cd->req_varyopt = 0;
# Line 7610  externally provided function. Integer ov Line 7780  externally provided function. Integer ov
7780  because nowadays we limit the maximum value of cd->names_found and  because nowadays we limit the maximum value of cd->names_found and
7781  cd->name_entry_size. */  cd->name_entry_size. */
7782    
7783  size = sizeof(real_pcre) + (length + cd->names_found * (cd->name_entry_size + 3)) * sizeof(pcre_uchar);  size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
7784  re = (real_pcre *)(pcre_malloc)(size);  re = (real_pcre *)(PUBL(malloc))(size);
7785    
7786  if (re == NULL)  if (re == NULL)
7787    {    {
# Line 7653  cd->names_found = 0; Line 7823  cd->names_found = 0;
7823  cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7824  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
7825  cd->start_code = codestart;  cd->start_code = codestart;
7826  cd->hwm = cworkspace;  cd->hwm = (pcre_uchar *)(cd->start_workspace);
7827  cd->req_varyopt = 0;  cd->req_varyopt = 0;
7828  cd->had_accept = FALSE;  cd->had_accept = FALSE;
7829  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
# Line 7670  code = (pcre_uchar *)codestart; Line 7840  code = (pcre_uchar *)codestart;
7840    &firstchar, &reqchar, NULL, cd, NULL);    &firstchar, &reqchar, NULL, cd, NULL);
7841  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
7842  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
7843  re->flags = cd->external_flags;  re->flags = cd->external_flags | PCRE_MODE;
7844    
7845  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
7846    
# Line 7687  if debugging, leave the test till after Line 7857  if debugging, leave the test till after
7857  if (code - codestart > length) errorcode = ERR23;  if (code - codestart > length) errorcode = ERR23;
7858  #endif  #endif
7859    
7860  /* Fill in any forward references that are required. */  /* Fill in any forward references that are required. There may be repeated
7861    references; optimize for them, as searching a large regex takes time. */
7862    
7863  while (errorcode == 0 && cd->hwm > cworkspace)  if (cd->hwm > cd->start_workspace)
7864    {    {
7865    int offset, recno;    int prev_recno = -1;
7866    const pcre_uchar *groupptr;    const pcre_uchar *groupptr = NULL;
7867    cd->hwm -= LINK_SIZE;    while (errorcode == 0 && cd->hwm > cd->start_workspace)
7868    offset = GET(cd->hwm, 0);      {
7869    recno = GET(codestart, offset);      int offset, recno;
7870    groupptr = PRIV(find_bracket)(codestart, utf, recno);      cd->hwm -= LINK_SIZE;
7871    if (groupptr == NULL) errorcode = ERR53;      offset = GET(cd->hwm, 0);
7872      else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));      recno = GET(codestart, offset);
7873        if (recno != prev_recno)
7874          {
7875          groupptr = PRIV(find_bracket)(codestart, utf, recno);
7876          prev_recno = recno;
7877          }
7878        if (groupptr == NULL) errorcode = ERR53;
7879          else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
7880        }
7881    }    }
7882    
7883    /* If the workspace had to be expanded, free the new memory. */
7884    
7885    if (cd->workspace_size > COMPILE_WORK_SIZE)
7886      (PUBL(free))((void *)cd->start_workspace);
7887    
7888  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
7889  subpattern. */  subpattern. */
7890    
# Line 7753  if (cd->check_lookbehind) Line 7937  if (cd->check_lookbehind)
7937    
7938  if (errorcode != 0)  if (errorcode != 0)
7939    {    {
7940    (pcre_free)(re);    (PUBL(free))(re);
7941    PCRE_EARLY_ERROR_RETURN:    PCRE_EARLY_ERROR_RETURN:
7942    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
7943    PCRE_EARLY_ERROR_RETURN2:    PCRE_EARLY_ERROR_RETURN2:
# Line 7789  if ((re->options & PCRE_ANCHORED) == 0) Line 7973  if ((re->options & PCRE_ANCHORED) == 0)
7973        re->first_char = firstchar & 0xffff;        re->first_char = firstchar & 0xffff;
7974  #endif  #endif
7975  #endif  #endif
7976        if ((firstchar & REQ_CASELESS) != 0 && MAX_255(re->first_char)        if ((firstchar & REQ_CASELESS) != 0)
7977          && cd->fcc[re->first_char] != re->first_char)          {
7978          re->flags |= PCRE_FCH_CASELESS;  #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
7979            /* We ignore non-ASCII first chars in 8 bit mode. */
7980            if (utf)
7981              {
7982              if (re->first_char < 128)
7983                {
7984                if (cd->fcc[re->first_char] != re->first_char)
7985                  re->flags |= PCRE_FCH_CASELESS;
7986                }
7987              else if (UCD_OTHERCASE(re->first_char) != re->first_char)
7988                re->flags |= PCRE_FCH_CASELESS;
7989              }
7990            else
7991    #endif
7992            if (MAX_255(re->first_char)
7993                && cd->fcc[re->first_char] != re->first_char)
7994              re->flags |= PCRE_FCH_CASELESS;
7995            }
7996    
7997        re->flags |= PCRE_FIRSTSET;        re->flags |= PCRE_FIRSTSET;
7998        }        }
# Line 7814  if (reqchar >= 0 && Line 8015  if (reqchar >= 0 &&
8015    re->req_char = reqchar & 0xffff;    re->req_char = reqchar & 0xffff;
8016  #endif  #endif
8017  #endif  #endif
8018    if ((reqchar & REQ_CASELESS) != 0 && MAX_255(re->req_char)    if ((reqchar & REQ_CASELESS) != 0)
8019      && cd->fcc[re->req_char] != re->req_char)      {
8020      re->flags |= PCRE_RCH_CASELESS;  #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
8021        /* We ignore non-ASCII required chars in 8 bit mode. */
8022        if (utf)
8023          {
8024          if (re->req_char < 128)
8025            {
8026            if (cd->fcc[re->req_char] != re->req_char)
8027              re->flags |= PCRE_RCH_CASELESS;
8028            }
8029          else if (UCD_OTHERCASE(re->req_char) != re->req_char)
8030            re->flags |= PCRE_RCH_CASELESS;
8031          }
8032        else
8033    #endif
8034        if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
8035          re->flags |= PCRE_RCH_CASELESS;
8036        }
8037    
8038    re->flags |= PCRE_REQCHSET;    re->flags |= PCRE_REQCHSET;
8039    }    }
# Line 7848  if ((re->flags & PCRE_REQCHSET) != 0) Line 8065  if ((re->flags & PCRE_REQCHSET) != 0)
8065      else printf("Req char = \\x%02x%s\n", ch, caseless);      else printf("Req char = \\x%02x%s\n", ch, caseless);
8066    }    }
8067    
8068    #ifdef COMPILE_PCRE8
8069  pcre_printint(re, stdout, TRUE);  pcre_printint(re, stdout, TRUE);
8070    #else
8071    pcre16_printint(re, stdout, TRUE);
8072    #endif
8073    
8074  /* This check is done here in the debugging case so that the code that  /* This check is done here in the debugging case so that the code that
8075  was compiled can be seen. */  was compiled can be seen. */
8076    
8077  if (code - codestart > length)  if (code - codestart > length)
8078    {    {
8079    (pcre_free)(re);    (PUBL(free))(re);
8080    *errorptr = find_error_text(ERR23);    *errorptr = find_error_text(ERR23);
8081    *erroroffset = ptr - (pcre_uchar *)pattern;    *erroroffset = ptr - (pcre_uchar *)pattern;
8082    if (errorcodeptr != NULL) *errorcodeptr = ERR23;    if (errorcodeptr != NULL) *errorcodeptr = ERR23;

Legend:
Removed from v.782  
changed lines
  Added in v.806

  ViewVC Help
Powered by ViewVC 1.1.5