/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log | Patch

revision 782 by zherczeg, Sat Dec 3 23:58:37 2011 UTC revision 804 by zherczeg, Wed Dec 14 11:18:01 2011 UTC
# Line 88  so this number is very generous. Line 88  so this number is very generous.
88  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
89  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
90  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
92    filled up by repetitions of forward references, for example patterns like
93    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
94    that the workspace is expanded using malloc() in this situation. The value
95    below is therefore a minimum, and we put a maximum on it for safety. The
96    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
97    kicks in at the same number of forward references in all cases. */
98    
99  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
100    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
101    
102  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
103  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
104    
105  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
106    
107  /* Private flags added to firstchar and reqchar. */  /* Private flags added to firstchar and reqchar. */
108    
109  #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */  #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
110  #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */  #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
111    
112    /* Repeated character flags. */
113    
114    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
115    
116  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
117  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
118  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
# Line 470  static const char error_texts[] = Line 481  static const char error_texts[] =
481    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
482    /* 70 */    /* 70 */
483    "internal error: unknown opcode in find_fixedlength()\0"    "internal error: unknown opcode in find_fixedlength()\0"
484    "Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff)\0"    "\\N is not supported in a class\0"
485      "too many forward references\0"
486      "disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff)\0"
487    ;    ;
488    
489  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 645  return s; Line 658  return s;
658    
659    
660  /*************************************************  /*************************************************
661    *           Expand the workspace                 *
662    *************************************************/
663    
664    /* This function is called during the second compiling phase, if the number of
665    forward references fills the existing workspace, which is originally a block on
666    the stack. A larger block is obtained from malloc() unless the ultimate limit
667    has been reached or the increase will be rather small.
668    
669    Argument: pointer to the compile data block
670    Returns:  0 if all went well, else an error number
671    */
672    
673    static int
674    expand_workspace(compile_data *cd)
675    {
676    pcre_uchar *newspace;
677    int newsize = cd->workspace_size * 2;
678    
679    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
680    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
681        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
682     return ERR72;
683    
684    newspace = (PUBL(malloc))(newsize);
685    if (newspace == NULL) return ERR21;
686    
687    memcpy(newspace, cd->start_workspace, cd->workspace_size);
688    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
689    if (cd->workspace_size > COMPILE_WORK_SIZE)
690      (PUBL(free))((void *)cd->start_workspace);
691    cd->start_workspace = newspace;
692    cd->workspace_size = newsize;
693    return 0;
694    }
695    
696    
697    
698    /*************************************************
699  *            Check for counted repeat            *  *            Check for counted repeat            *
700  *************************************************/  *************************************************/
701    
# Line 1009  else Line 1060  else
1060    
1061        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1062          {          {
1063          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR71;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1064          ptr = pt;          ptr = pt;
1065          break;          break;
1066          }          }
# Line 1807  for (;;) Line 1858  for (;;)
1858      cc++;      cc++;
1859      break;      break;
1860    
1861      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1862      otherwise \C is coded as OP_ALLANY. */      otherwise \C is coded as OP_ALLANY. */
1863    
1864      case OP_ANYBYTE:      case OP_ANYBYTE:
# Line 2353  for (code = first_significant_code(code Line 2404  for (code = first_significant_code(code
2404      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2405      here. */      here. */
2406    
2407  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2408      case OP_XCLASS:      case OP_XCLASS:
2409      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2410      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 2363  for (code = first_significant_code(code Line 2414  for (code = first_significant_code(code
2414      case OP_NCLASS:      case OP_NCLASS:
2415      ccode = code + PRIV(OP_lengths)[OP_CLASS];      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2416    
2417  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2418      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2419  #endif  #endif
2420    
# Line 2896  static BOOL Line 2947  static BOOL
2947  check_auto_possessive(const pcre_uchar *previous, BOOL utf,  check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2948    const pcre_uchar *ptr, int options, compile_data *cd)    const pcre_uchar *ptr, int options, compile_data *cd)
2949  {  {
2950  int c, next;  pcre_int32 c, next;
2951  int op_code = *previous++;  int op_code = *previous++;
2952    
2953  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2905  if ((options & PCRE_EXTENDED) != 0) Line 2956  if ((options & PCRE_EXTENDED) != 0)
2956    {    {
2957    for (;;)    for (;;)
2958      {      {
2959      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2960      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2961        {        {
2962        ptr++;        ptr++;
# Line 2932  if (*ptr == CHAR_BACKSLASH) Line 2983  if (*ptr == CHAR_BACKSLASH)
2983    if (temperrorcode != 0) return FALSE;    if (temperrorcode != 0) return FALSE;
2984    ptr++;    /* Point after the escape sequence */    ptr++;    /* Point after the escape sequence */
2985    }    }
2986    else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  
2987    {    {
2988  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2989    if (utf) { GETCHARINC(next, ptr); } else    if (utf) { GETCHARINC(next, ptr); } else
2990  #endif  #endif
2991    next = *ptr++;    next = *ptr++;
2992    }    }
   
2993  else return FALSE;  else return FALSE;
2994    
2995  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
# Line 2949  if ((options & PCRE_EXTENDED) != 0) Line 2998  if ((options & PCRE_EXTENDED) != 0)
2998    {    {
2999    for (;;)    for (;;)
3000      {      {
3001      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3002      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3003        {        {
3004        ptr++;        ptr++;
# Line 2978  the next item is a character. */ Line 3027  the next item is a character. */
3027  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3028    {    {
3029    case OP_CHAR:    case OP_CHAR:
3030  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3031    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3032  #else  #else
3033    c = *previous;    c = *previous;
# Line 2990  if (next >= 0) switch(op_code) Line 3039  if (next >= 0) switch(op_code)
3039    high-valued characters. */    high-valued characters. */
3040    
3041    case OP_CHARI:    case OP_CHARI:
3042  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3043    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3044  #else  #else
3045    c = *previous;    c = *previous;
3046  #endif  #endif
3047    if (c == next) return FALSE;    if (c == next) return FALSE;
3048  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3049    if (utf)    if (utf)
3050      {      {
3051      unsigned int othercase;      unsigned int othercase;
# Line 3009  if (next >= 0) switch(op_code) Line 3058  if (next >= 0) switch(op_code)
3058      return (unsigned int)c != othercase;      return (unsigned int)c != othercase;
3059      }      }
3060    else    else
3061  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3062    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
3063    
3064    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
# Line 3021  if (next >= 0) switch(op_code) Line 3070  if (next >= 0) switch(op_code)
3070    
3071    case OP_NOTI:    case OP_NOTI:
3072    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
3073  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3074    if (utf)    if (utf)
3075      {      {
3076      unsigned int othercase;      unsigned int othercase;
# Line 3034  if (next >= 0) switch(op_code) Line 3083  if (next >= 0) switch(op_code)
3083      return (unsigned int)c == othercase;      return (unsigned int)c == othercase;
3084      }      }
3085    else    else
3086  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3087    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
3088    
3089    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
# Line 3126  switch(op_code) Line 3175  switch(op_code)
3175    {    {
3176    case OP_CHAR:    case OP_CHAR:
3177    case OP_CHARI:    case OP_CHARI:
3178  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3179    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3180  #else  #else
3181    c = *previous;    c = *previous;
# Line 3356  pcre_uint8 classbits[32]; Line 3405  pcre_uint8 classbits[32];
3405  must not do this for other options (e.g. PCRE_EXTENDED) because they may change  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3406  dynamically as we process the pattern. */  dynamically as we process the pattern. */
3407    
3408  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3409  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3410  BOOL utf = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
3411  pcre_uchar utf_chars[6];  pcre_uchar utf_chars[6];
# Line 3411  for (;; ptr++) Line 3460  for (;; ptr++)
3460    BOOL is_quantifier;    BOOL is_quantifier;
3461    BOOL is_recurse;    BOOL is_recurse;
3462    BOOL reset_bracount;    BOOL reset_bracount;
3463    int class_charcount;    int class_has_8bitchar;
3464    int class_lastchar;    int class_single_char;
3465    int newoptions;    int newoptions;
3466    int recno;    int recno;
3467    int refsign;    int refsign;
# Line 3446  for (;; ptr++) Line 3495  for (;; ptr++)
3495  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3496      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3497  #endif  #endif
3498      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3499            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3500        {        {
3501        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3502        goto FAILED;        goto FAILED;
# Line 3471  for (;; ptr++) Line 3521  for (;; ptr++)
3521      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3522      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3523        (int)(code - last_code), c, c));        (int)(code - last_code), c, c));
3524    
3525      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3526      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
3527      if "previous" is NULL, reset the current code pointer to the start. */      if "previous" is NULL, reset the current code pointer to the start. */
# Line 3496  for (;; ptr++) Line 3546  for (;; ptr++)
3546    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3547    reference list. */    reference list. */
3548    
3549    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3550               WORK_SIZE_SAFETY_MARGIN)
3551      {      {
3552      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3553      goto FAILED;      goto FAILED;
# Line 3548  for (;; ptr++) Line 3599  for (;; ptr++)
3599    
3600    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3601      {      {
3602      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3603      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3604        {        {
3605        ptr++;        ptr++;
# Line 3708  for (;; ptr++) Line 3759  for (;; ptr++)
3759    
3760      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3761    
3762      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3763      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3764      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3765        a single character. */
3766    
3767      class_charcount = 0;      class_has_8bitchar = 0;
3768      class_lastchar = -1;      class_single_char = 0;
3769    
3770      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3771      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
# Line 3736  for (;; ptr++) Line 3788  for (;; ptr++)
3788        {        {
3789        const pcre_uchar *oldptr;        const pcre_uchar *oldptr;
3790    
3791  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3792        if (utf && c > 127)        if (utf && HAS_EXTRALEN(c))
3793          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3794          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3795          }          }
# Line 3868  for (;; ptr++) Line 3920  for (;; ptr++)
3920            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3921    
3922          ptr = tempptr + 1;          ptr = tempptr + 1;
3923          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 characters. */
3924            class_has_8bitchar = 1;
3925            /* Every class contains at least two characters. */
3926            class_single_char = 2;
3927          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3928          }          }
3929    
3930        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3931        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3932        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3933        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3934        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3935        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3936          as literal characters (by default), or are faulted if
3937        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3938    
3939        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3886  for (;; ptr++) Line 3942  for (;; ptr++)
3942          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3943    
3944          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3945            else if (-c == ESC_N)            /* \N is not supported in a class */
3946              {
3947              *errorcodeptr = ERR71;
3948              goto FAILED;
3949              }
3950          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3951            {            {
3952            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3900  for (;; ptr++) Line 3961  for (;; ptr++)
3961          if (c < 0)          if (c < 0)
3962            {            {
3963            register const pcre_uint8 *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3964            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
3965              class_has_8bitchar++;
3966              /* Every class contains at least two characters. */
3967              class_single_char += 2;
3968    
3969            switch (-c)            switch (-c)
3970              {              {
# Line 3913  for (;; ptr++) Line 3977  for (;; ptr++)
3977              case ESC_SU:              case ESC_SU:
3978              nestptr = ptr;              nestptr = ptr;
3979              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3980              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
3981              continue;              continue;
3982  #endif  #endif
3983              case ESC_d:              case ESC_d:
# Line 4079  for (;; ptr++) Line 4143  for (;; ptr++)
4143                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4144                *class_uchardata++ = ptype;                *class_uchardata++ = ptype;
4145                *class_uchardata++ = pdata;                *class_uchardata++ = pdata;
4146                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4147                continue;                continue;
4148                }                }
4149  #endif  #endif
# Line 4093  for (;; ptr++) Line 4157  for (;; ptr++)
4157                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4158                goto FAILED;                goto FAILED;
4159                }                }
4160              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4161              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4162                c = *ptr;                /* Get the final character and fall through */
4163              break;              break;
4164              }              }
4165            }            }
4166    
4167          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4168          greater than 256 mode. */          greater than 256. */
4169    
4170          }   /* End of backslash handling */          }   /* End of backslash handling */
4171    
# Line 4148  for (;; ptr++) Line 4213  for (;; ptr++)
4213            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4214            }            }
4215    
4216  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4217          if (utf)          if (utf)
4218            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4219            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
# Line 4193  for (;; ptr++) Line 4258  for (;; ptr++)
4258    
4259          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4260    
4261            /* Since we found a character range, single character optimizations
4262            cannot be done anymore. */
4263            class_single_char = 2;
4264    
4265          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4266          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4267          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4268          available. */          available. */
4269    
4270  #ifdef SUPPORT_UTF  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4271            if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4272    #elif defined  SUPPORT_UTF
4273          if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4274  #endif  #elif !(defined COMPILE_PCRE8)
 #ifndef COMPILE_PCRE8  
4275          if (d > 255)          if (d > 255)
4276  #endif  #endif
4277  #if defined SUPPORT_UTF || defined COMPILE_PCRE16  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4278            {            {
4279            xclass = TRUE;            xclass = TRUE;
4280    
# Line 4213  for (;; ptr++) Line 4283  for (;; ptr++)
4283            they fit with the basic range. */            they fit with the basic range. */
4284    
4285  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4286    #ifndef COMPILE_PCRE8
4287              if (utf && (options & PCRE_CASELESS) != 0)
4288    #else
4289            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4290    #endif
4291              {              {
4292              unsigned int occ, ocd;              unsigned int occ, ocd;
4293              unsigned int cc = c;              unsigned int cc = c;
# Line 4256  for (;; ptr++) Line 4330  for (;; ptr++)
4330    
4331            *class_uchardata++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4332  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4333    #ifndef COMPILE_PCRE8
4334              if (utf)
4335                {
4336                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4337                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4338                }
4339              else
4340                {
4341                *class_uchardata++ = c;
4342                *class_uchardata++ = d;
4343                }
4344    #else
4345            class_uchardata += PRIV(ord2utf)(c, class_uchardata);            class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4346            class_uchardata += PRIV(ord2utf)(d, class_uchardata);            class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4347  #else  #endif
4348    #else /* SUPPORT_UTF */
4349            *class_uchardata++ = c;            *class_uchardata++ = c;
4350            *class_uchardata++ = d;            *class_uchardata++ = d;
4351  #endif  #endif /* SUPPORT_UTF */
4352    
4353            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4354            caseless matching for UTF characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
# Line 4269  for (;; ptr++) Line 4356  for (;; ptr++)
4356            can still use  */            can still use  */
4357    
4358  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4359            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4360  #else            if (utf)
4361  #ifdef SUPPORT_UTF  #endif
4362                continue;    /* With next character in the class */
4363    #endif  /* SUPPORT_UCP */
4364    
4365    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4366              if (utf)
4367                {
4368                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4369                /* Adjust upper limit and fall through to set up the map */
4370                d = 127;
4371                }
4372              else
4373                {
4374                if (c > 255) continue;
4375                /* Adjust upper limit and fall through to set up the map */
4376                d = 255;
4377                }
4378    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4379            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4380            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
4381            d = 127;            d = 127;
# Line 4279  for (;; ptr++) Line 4383  for (;; ptr++)
4383            if (c > 255) continue;            if (c > 255) continue;
4384            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
4385            d = 255;            d = 255;
4386  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
 #endif  /* SUPPORT_UCP */  
4387            }            }
4388  #endif  /* SUPPORT_UTF8 || COMPILE_PCRE16 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4389    
4390          /* We use the bit map for 8 bit mode, or when the characters fall          /* We use the bit map for 8 bit mode, or when the characters fall
4391          partially or entirely to [0-255] ([0-127] for UCP) ranges. */          partially or entirely to [0-255] ([0-127] for UCP) ranges. */
4392    
4393          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4394    
4395          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4396    
# Line 4311  for (;; ptr++) Line 4413  for (;; ptr++)
4413    
4414        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4415    
4416        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4417          if (class_single_char < 2) class_single_char++;
4418    
4419          /* If class_single_char is 1, we saw precisely one character. As long as
4420          there were no negated characters >= 128 and there was no use of \p or \P,
4421          in other words, no use of any XCLASS features, we can optimize.
4422    
4423          In UTF-8 mode, we can optimize the negative case only if there were no
4424          characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4425          operate on single-byte characters only. This is an historical hangover.
4426          Maybe one day we can tidy these opcodes to handle multi-byte characters.
4427    
4428          The optimization throws away the bit map. We turn the item into a
4429          1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4430          Note that OP_NOT[I] does not support multibyte characters. In the positive
4431          case, it can cause firstchar to be set. Otherwise, there can be no first
4432          char if this item is first, whatever repeat count may follow. In the case
4433          of reqchar, save the previous value for reinstating. */
4434    
4435  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4436        if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))        if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
4437            && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4438    #else
4439          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4440  #endif  #endif
4441  #ifndef COMPILE_PCRE8          {
4442            ptr++;
4443            zeroreqchar = reqchar;
4444    
4445            /* The OP_NOT[I] opcodes work on single characters only. */
4446    
4447            if (negate_class)
4448              {
4449              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4450              zerofirstchar = firstchar;
4451              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4452              *code++ = c;
4453              goto NOT_CHAR;
4454              }
4455    
4456            /* For a single, positive character, get the value into mcbuffer, and
4457            then we can handle this with the normal one-character code. */
4458    
4459    #ifdef SUPPORT_UTF
4460            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4461              mclength = PRIV(ord2utf)(c, mcbuffer);
4462            else
4463    #endif
4464              {
4465              mcbuffer[0] = c;
4466              mclength = 1;
4467              }
4468            goto ONE_CHAR;
4469            }       /* End of 1-char optimization */
4470    
4471          /* Handle a character that cannot go in the bit map. */
4472    
4473    #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4474          if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4475    #elif defined SUPPORT_UTF
4476          if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4477    #elif !(defined COMPILE_PCRE8)
4478        if (c > 255)        if (c > 255)
4479  #endif  #endif
4480  #if defined SUPPORT_UTF || defined COMPILE_PCRE16  
4481    #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4482          {          {
4483          xclass = TRUE;          xclass = TRUE;
4484          *class_uchardata++ = XCL_SINGLE;          *class_uchardata++ = XCL_SINGLE;
4485  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4486          class_uchardata += PRIV(ord2utf)(c, class_uchardata);  #ifndef COMPILE_PCRE8
4487  #else          /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4488          *class_uchardata++ = c;          if (!utf)
4489              *class_uchardata++ = c;
4490            else
4491  #endif  #endif
4492              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4493    #else /* SUPPORT_UTF */
4494            *class_uchardata++ = c;
4495    #endif /* SUPPORT_UTF */
4496    
4497  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4498    #ifdef COMPILE_PCRE8
4499          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4500    #else
4501            /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4502            if (utf && (options & PCRE_CASELESS) != 0)
4503    #endif
4504            {            {
4505            unsigned int othercase;            unsigned int othercase;
4506            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((othercase = UCD_OTHERCASE(c)) != c)
# Line 4343  for (;; ptr++) Line 4513  for (;; ptr++)
4513    
4514          }          }
4515        else        else
4516  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
4517    
4518        /* Handle a single-byte character */        /* Handle a single-byte character */
4519          {          {
4520            class_has_8bitchar = 1;
4521          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4522          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4523            {            {
4524            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c];   /* flip case */
4525            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4526            }            }
         class_charcount++;  
         class_lastchar = c;  
4527          }          }
4528        }        }
4529    
# Line 4375  for (;; ptr++) Line 4544  for (;; ptr++)
4544        goto FAILED;        goto FAILED;
4545        }        }
4546    
4547      /* If class_charcount is 1, we saw precisely one character whose value is      /* If this is the first thing in the branch, there can be no first char
4548      less than 256. As long as there were no characters >= 128 and there was no      setting, whatever the repeat count. Any reqchar setting must remain
4549      use of \p or \P, in other words, no use of any XCLASS features, we can      unchanged after any kind of repeat. */
     optimize.  
   
     In UTF-8 mode, we can optimize the negative case only if there were no  
     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR  
     operate on single-bytes characters only. This is an historical hangover.  
     Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
     Note that OP_NOT[I] does not support multibyte characters. In the positive  
     case, it can cause firstchar to be set. Otherwise, there can be no first  
     char if this item is first, whatever repeat count may follow. In the case  
     of reqchar, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF  
     if (class_charcount == 1 && !xclass &&  
       (!utf || !negate_class || class_lastchar < 128))  
 #else  
     if (class_charcount == 1)  
 #endif  
       {  
       zeroreqchar = reqchar;  
   
       /* The OP_NOT[I] opcodes work on one-byte characters only. */  
   
       if (negate_class)  
         {  
         if (firstchar == REQ_UNSET) firstchar = REQ_NONE;  
         zerofirstchar = firstchar;  
         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF8  
       if (utf && class_lastchar > 127)  
         mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqchar setting must remain unchanged after any kind of  
     repeat. */  
4550    
4551      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4552      zerofirstchar = firstchar;      zerofirstchar = firstchar;
# Line 4460  for (;; ptr++) Line 4576  for (;; ptr++)
4576        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4577        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4578    
4579        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4580          {          {
4581          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4582          memmove(code + (32 / sizeof(pcre_uchar)), code,          memmove(code + (32 / sizeof(pcre_uchar)), code,
# Line 4472  for (;; ptr++) Line 4588  for (;; ptr++)
4588    
4589        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4590    
4591        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4592        break;   /* End of class handling */        break;   /* End of class handling */
4593        }        }
4594  #endif  #endif
# Line 4491  for (;; ptr++) Line 4607  for (;; ptr++)
4607        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4608        }        }
4609      code += 32 / sizeof(pcre_uchar);      code += 32 / sizeof(pcre_uchar);
4610        NOT_CHAR:
4611      break;      break;
4612    
4613    
# Line 4567  for (;; ptr++) Line 4684  for (;; ptr++)
4684      past, but it no longer happens for non-repeated recursions. In fact, the      past, but it no longer happens for non-repeated recursions. In fact, the
4685      repeated ones could be re-implemented independently so as not to need this,      repeated ones could be re-implemented independently so as not to need this,
4686      but for the moment we rely on the code for repeating groups. */      but for the moment we rely on the code for repeating groups. */
4687    
4688      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4689        {        {
4690        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
# Line 4603  for (;; ptr++) Line 4720  for (;; ptr++)
4720    
4721        /* Deal with UTF characters that take up more than one character. It's        /* Deal with UTF characters that take up more than one character. It's
4722        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4723        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus UTF_LENGTH to flag that
4724        length rather than a small character. */        it's a length rather than a small character. */
4725    
4726  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4727        if (utf && (code[-1] & 0x80) != 0)        if (utf && NOT_FIRSTCHAR(code[-1]))
4728          {          {
4729          pcre_uchar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4730          BACKCHAR(lastchar);          BACKCHAR(lastchar);
4731          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4732          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4733          c |= 0x80;                      /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4734          }          }
4735        else        else
4736  #endif  #endif /* SUPPORT_UTF */
4737    
4738        /* Handle the case of a single character - either with no UTF support, or        /* Handle the case of a single character - either with no UTF support, or
4739        with UTF disabled, or for a single character UTF character. */        with UTF disabled, or for a single character UTF character. */
   
4740          {          {
4741          c = code[-1];          c = code[-1];
4742          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
# Line 4758  for (;; ptr++) Line 4874  for (;; ptr++)
4874          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
4875          Unicode property match, there are two extra bytes that define the          Unicode property match, there are two extra bytes that define the
4876          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
4877          c, with the 0x80 bit as a flag. */          c, with the UTF_LENGTH bit as a flag. */
4878    
4879          if (repeat_max < 0)          if (repeat_max < 0)
4880            {            {
4881  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4882            if (utf && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4883              {              {
4884              memcpy(code, utf_chars, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4885              code += c & 7;              code += c & 7;
4886              }              }
4887            else            else
# Line 4787  for (;; ptr++) Line 4903  for (;; ptr++)
4903    
4904          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
4905            {            {
4906  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4907            if (utf && c >= 128)            if (utf && (c & UTF_LENGTH) != 0)
4908              {              {
4909              memcpy(code, utf_chars, c & 7);              memcpy(code, utf_chars, IN_UCHARS(c & 7));
4910              code += c & 7;              code += c & 7;
4911              }              }
4912            else            else
# Line 4817  for (;; ptr++) Line 4933  for (;; ptr++)
4933    
4934        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
4935    
4936  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4937        if (utf && c >= 128)        if (utf && (c & UTF_LENGTH) != 0)
4938          {          {
4939          memcpy(code, utf_chars, c & 7);          memcpy(code, utf_chars, IN_UCHARS(c & 7));
4940          code += c & 7;          code += c & 7;
4941          }          }
4942        else        else
# Line 4844  for (;; ptr++) Line 4960  for (;; ptr++)
4960    
4961      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
4962               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
4963  #if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4964               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
4965  #endif  #endif
4966               *previous == OP_REF ||               *previous == OP_REF ||
# Line 5018  for (;; ptr++) Line 5134  for (;; ptr++)
5134              *lengthptr += delta;              *lengthptr += delta;
5135              }              }
5136    
5137            /* This is compiling for real */            /* This is compiling for real. If there is a set first byte for
5138              the group, and we have not yet set a "required byte", set it. Make
5139              sure there is enough workspace for copying forward references before
5140              doing the copy. */
5141    
5142            else            else
5143              {              {
5144              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5145    
5146              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5147                {                {
5148                pcre_uchar *hc;                pcre_uchar *hc;
5149                pcre_uchar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5150                memcpy(code, previous, IN_UCHARS(len));                memcpy(code, previous, IN_UCHARS(len));
5151    
5152                  while (cd->hwm > cd->start_workspace + cd->workspace_size -
5153                         WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5154                    {
5155                    int save_offset = save_hwm - cd->start_workspace;
5156                    int this_offset = this_hwm - cd->start_workspace;
5157                    *errorcodeptr = expand_workspace(cd);
5158                    if (*errorcodeptr != 0) goto FAILED;
5159                    save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5160                    this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5161                    }
5162    
5163                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5164                  {                  {
5165                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 5095  for (;; ptr++) Line 5227  for (;; ptr++)
5227              }              }
5228    
5229            memcpy(code, previous, IN_UCHARS(len));            memcpy(code, previous, IN_UCHARS(len));
5230    
5231              /* Ensure there is enough workspace for forward references before
5232              copying them. */
5233    
5234              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5235                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5236                {
5237                int save_offset = save_hwm - cd->start_workspace;
5238                int this_offset = this_hwm - cd->start_workspace;
5239                *errorcodeptr = expand_workspace(cd);
5240                if (*errorcodeptr != 0) goto FAILED;
5241                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5242                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5243                }
5244    
5245            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5246              {              {
5247              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 5125  for (;; ptr++) Line 5272  for (;; ptr++)
5272        ONCE brackets can be converted into non-capturing brackets, as the        ONCE brackets can be converted into non-capturing brackets, as the
5273        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5274        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
5275    
5276        Otherwise, when we are doing the actual compile phase, check to see        Otherwise, when we are doing the actual compile phase, check to see
5277        whether this group is one that could match an empty string. If so,        whether this group is one that could match an empty string. If so,
5278        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5279        that runtime checking can be done. [This check is also applied to ONCE        that runtime checking can be done. [This check is also applied to ONCE
5280        groups at runtime, but in a different way.]        groups at runtime, but in a different way.]
5281    
5282        Then, if the quantifier was possessive and the bracket is not a        Then, if the quantifier was possessive and the bracket is not a
5283        conditional, we convert the BRA code to the POS form, and the KET code to        conditional, we convert the BRA code to the POS form, and the KET code to
5284        KETRPOS. (It turns out to be convenient at runtime to detect this kind of        KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5285        subpattern at both the start and at the end.) The use of special opcodes        subpattern at both the start and at the end.) The use of special opcodes
5286        makes it possible to reduce greatly the stack usage in pcre_exec(). If        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5287        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5288    
5289        Then, if the minimum number of matches is 1 or 0, cancel the possessive        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5290        flag so that the default action below, of wrapping everything inside        flag so that the default action below, of wrapping everything inside
5291        atomic brackets, does not happen. When the minimum is greater than 1,        atomic brackets, does not happen. When the minimum is greater than 1,
5292        there will be earlier copies of the group, and so we still have to wrap        there will be earlier copies of the group, and so we still have to wrap
5293        the whole thing. */        the whole thing. */
5294    
5295        else        else
# Line 5151  for (;; ptr++) Line 5298  for (;; ptr++)
5298          pcre_uchar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5299    
5300          /* Convert possessive ONCE brackets to non-capturing */          /* Convert possessive ONCE brackets to non-capturing */
5301    
5302          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5303              possessive_quantifier) *bracode = OP_BRA;              possessive_quantifier) *bracode = OP_BRA;
5304    
5305          /* For non-possessive ONCE brackets, all we need to do is to          /* For non-possessive ONCE brackets, all we need to do is to
5306          set the KET. */          set the KET. */
5307    
5308          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5309            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5310    
5311          /* Handle non-ONCE brackets and possessive ONCEs (which have been          /* Handle non-ONCE brackets and possessive ONCEs (which have been
5312          converted to non-capturing above). */          converted to non-capturing above). */
5313    
5314          else          else
5315            {            {
5316            /* In the compile phase, check for empty string matching. */            /* In the compile phase, check for empty string matching. */
5317    
5318            if (lengthptr == NULL)            if (lengthptr == NULL)
5319              {              {
5320              pcre_uchar *scode = bracode;              pcre_uchar *scode = bracode;
# Line 5182  for (;; ptr++) Line 5329  for (;; ptr++)
5329                }                }
5330              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5331              }              }
5332    
5333            /* Handle possessive quantifiers. */            /* Handle possessive quantifiers. */
5334    
5335            if (possessive_quantifier)            if (possessive_quantifier)
# Line 5191  for (;; ptr++) Line 5338  for (;; ptr++)
5338              repeated non-capturing bracket, because we have not invented POS              repeated non-capturing bracket, because we have not invented POS
5339              versions of the COND opcodes. Because we are moving code along, we              versions of the COND opcodes. Because we are moving code along, we
5340              must ensure that any pending recursive references are updated. */              must ensure that any pending recursive references are updated. */
5341    
5342              if (*bracode == OP_COND || *bracode == OP_SCOND)              if (*bracode == OP_COND || *bracode == OP_SCOND)
5343                {                {
5344                int nlen = (int)(code - bracode);                int nlen = (int)(code - bracode);
# Line 5204  for (;; ptr++) Line 5351  for (;; ptr++)
5351                *code++ = OP_KETRPOS;                *code++ = OP_KETRPOS;
5352                PUTINC(code, 0, nlen);                PUTINC(code, 0, nlen);
5353                PUT(bracode, 1, nlen);                PUT(bracode, 1, nlen);
5354                }                }
5355    
5356              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5357    
5358              else              else
5359                {                {
5360                *bracode += 1;              /* Switch to xxxPOS opcodes */                *bracode += 1;              /* Switch to xxxPOS opcodes */
5361                *ketcode = OP_KETRPOS;                *ketcode = OP_KETRPOS;
5362                }                }
5363    
5364              /* If the minimum is zero, mark it as possessive, then unset the              /* If the minimum is zero, mark it as possessive, then unset the
5365              possessive flag when the minimum is 0 or 1. */              possessive flag when the minimum is 0 or 1. */
5366    
5367              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5368              if (repeat_min < 2) possessive_quantifier = FALSE;              if (repeat_min < 2) possessive_quantifier = FALSE;
5369              }              }
5370    
5371            /* Non-possessive quantifier */            /* Non-possessive quantifier */
5372    
5373            else *ketcode = OP_KETRMAX + repeat_type;            else *ketcode = OP_KETRMAX + repeat_type;
5374            }            }
5375          }          }
# Line 5348  for (;; ptr++) Line 5495  for (;; ptr++)
5495    
5496      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
5497    
5498      if (*(++ptr) == CHAR_ASTERISK &&      ptr++;
5499           ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5500             || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5501        {        {
5502        int i, namelen;        int i, namelen;
5503        int arglen = 0;        int arglen = 0;
# Line 5357  for (;; ptr++) Line 5505  for (;; ptr++)
5505        const pcre_uchar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5506        const pcre_uchar *arg = NULL;        const pcre_uchar *arg = NULL;
5507        previous = NULL;        previous = NULL;
5508        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        ptr++;
5509          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5510        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5511    
5512        /* It appears that Perl allows any characters whatsoever, other than        /* It appears that Perl allows any characters whatsoever, other than
# Line 5543  for (;; ptr++) Line 5692  for (;; ptr++)
5692    
5693          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
5694    
5695          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5696            {            {
5697            ptr += 1;  /* To get the right offset */            ptr += 1;  /* To get the right offset */
5698            *errorcodeptr = ERR28;            *errorcodeptr = ERR28;
# Line 5554  for (;; ptr++) Line 5703  for (;; ptr++)
5703    
5704          recno = 0;          recno = 0;
5705          name = ++ptr;          name = ++ptr;
5706          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5707            {            {
5708            if (recno >= 0)            if (recno >= 0)
5709              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
# Line 5725  for (;; ptr++) Line 5874  for (;; ptr++)
5874            break;            break;
5875    
5876            default:                /* Could be name define, else bad */            default:                /* Could be name define, else bad */
5877            if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;            if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5878                goto DEFINE_NAME;
5879            ptr++;                  /* Correct offset for error */            ptr++;                  /* Correct offset for error */
5880            *errorcodeptr = ERR24;            *errorcodeptr = ERR24;
5881            goto FAILED;            goto FAILED;
# Line 5794  for (;; ptr++) Line 5944  for (;; ptr++)
5944              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5945            name = ++ptr;            name = ++ptr;
5946    
5947            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5948            namelen = (int)(ptr - name);            namelen = (int)(ptr - name);
5949    
5950            /* In the pre-compile phase, just do a syntax check. */            /* In the pre-compile phase, just do a syntax check. */
# Line 5811  for (;; ptr++) Line 5961  for (;; ptr++)
5961                *errorcodeptr = ERR49;                *errorcodeptr = ERR49;
5962                goto FAILED;                goto FAILED;
5963                }                }
5964              if (namelen + 3 > cd->name_entry_size)              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
5965                {                {
5966                cd->name_entry_size = namelen + 3;                cd->name_entry_size = namelen + IMM2_SIZE + 1;
5967                if (namelen > MAX_NAME_SIZE)                if (namelen > MAX_NAME_SIZE)
5968                  {                  {
5969                  *errorcodeptr = ERR48;                  *errorcodeptr = ERR48;
# Line 5842  for (;; ptr++) Line 5992  for (;; ptr++)
5992    
5993              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
5994                {                {
5995                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
5996                if (crc == 0)                if (crc == 0)
5997                  {                  {
5998                  if (slot[2+namelen] == 0)                  if (slot[IMM2_SIZE+namelen] == 0)
5999                    {                    {
6000                    if (GET2(slot, 0) != cd->bracount + 1 &&                    if (GET2(slot, 0) != cd->bracount + 1 &&
6001                        (options & PCRE_DUPNAMES) == 0)                        (options & PCRE_DUPNAMES) == 0)
# Line 5897  for (;; ptr++) Line 6047  for (;; ptr++)
6047                }                }
6048    
6049              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
6050              memcpy(slot + 2, name, IN_UCHARS(namelen));              memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
6051              slot[2 + namelen] = 0;              slot[IMM2_SIZE + namelen] = 0;
6052              }              }
6053            }            }
6054    
# Line 5924  for (;; ptr++) Line 6074  for (;; ptr++)
6074    
6075          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
6076          name = ++ptr;          name = ++ptr;
6077          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6078          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6079    
6080          /* In the pre-compile phase, do a syntax check. We used to just set          /* In the pre-compile phase, do a syntax check. We used to just set
# Line 5982  for (;; ptr++) Line 6132  for (;; ptr++)
6132            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
6133              {              {
6134              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&              if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6135                  slot[2+namelen] == 0)                  slot[IMM2_SIZE+namelen] == 0)
6136                break;                break;
6137              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6138              }              }
# Line 6115  for (;; ptr++) Line 6265  for (;; ptr++)
6265                of the group. Then remember the forward reference. */                of the group. Then remember the forward reference. */
6266    
6267                called = cd->start_code + recno;                called = cd->start_code + recno;
6268                  if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6269                      WORK_SIZE_SAFETY_MARGIN)
6270                    {
6271                    *errorcodeptr = expand_workspace(cd);
6272                    if (*errorcodeptr != 0) goto FAILED;
6273                    }
6274                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6275                }                }
6276    
# Line 6135  for (;; ptr++) Line 6291  for (;; ptr++)
6291                }                }
6292              }              }
6293    
6294            /* Insert the recursion/subroutine item. */            /* Insert the recursion/subroutine item. It does not have a set first
6295              character (relevant if it is repeated, because it will then be
6296              wrapped with ONCE brackets). */
6297    
6298            *code = OP_RECURSE;            *code = OP_RECURSE;
6299            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
6300            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
6301              groupsetfirstchar = FALSE;
6302            }            }
6303    
6304          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 6501  for (;; ptr++) Line 6660  for (;; ptr++)
6660            BOOL isnumber = TRUE;            BOOL isnumber = TRUE;
6661            for (p = ptr + 1; *p != 0 && *p != terminator; p++)            for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6662              {              {
6663                if (!MAX_255(*p)) { isnumber = FALSE; break; }
6664              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
6665              if ((cd->ctypes[*p] & ctype_word) == 0) break;              if ((cd->ctypes[*p] & ctype_word) == 0) break;
6666              }              }
# Line 6623  for (;; ptr++) Line 6783  for (;; ptr++)
6783  #endif  #endif
6784          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6785          so that it works in DFA mode and in lookbehinds. */          so that it works in DFA mode and in lookbehinds. */
6786    
6787            {            {
6788            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6789            *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;            *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
6790            }            }
# Line 6636  for (;; ptr++) Line 6796  for (;; ptr++)
6796      a value > 127. We set its representation in the length/buffer, and then      a value > 127. We set its representation in the length/buffer, and then
6797      handle it as a data character. */      handle it as a data character. */
6798    
6799  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
6800      if (utf && c > 127)      if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
6801        mclength = PRIV(ord2utf)(c, mcbuffer);        mclength = PRIV(ord2utf)(c, mcbuffer);
6802      else      else
6803  #endif  #endif
# Line 6661  for (;; ptr++) Line 6821  for (;; ptr++)
6821    
6822  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6823      if (utf && HAS_EXTRALEN(c))      if (utf && HAS_EXTRALEN(c))
6824        {        ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
       INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));  
       }  
6825  #endif  #endif
6826    
6827      /* At this point we have the character's bytes in mcbuffer, and the length      /* At this point we have the character's bytes in mcbuffer, and the length
# Line 7379  compile_data *cd = &compile_block; Line 7537  compile_data *cd = &compile_block;
7537  computing the amount of memory that is needed. Compiled items are thrown away  computing the amount of memory that is needed. Compiled items are thrown away
7538  as soon as possible, so that a fairly large buffer should be sufficient for  as soon as possible, so that a fairly large buffer should be sufficient for
7539  this purpose. The same space is used in the second phase for remembering where  this purpose. The same space is used in the second phase for remembering where
7540  to fill in forward references to subpatterns. */  to fill in forward references to subpatterns. That may overflow, in which case
7541    new memory is obtained from malloc(). */
7542    
7543  pcre_uchar cworkspace[COMPILE_WORK_SIZE];  pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7544    
# Line 7435  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 7594  while (ptr[skipatstart] == CHAR_LEFT_PAR
7594    int newnl = 0;    int newnl = 0;
7595    int newbsr = 0;    int newbsr = 0;
7596    
7597    if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)  #ifdef COMPILE_PCRE8
7598      if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 5) == 0)
7599      { skipatstart += 7; options |= PCRE_UTF8; continue; }      { skipatstart += 7; options |= PCRE_UTF8; continue; }
7600    #endif
7601    #ifdef COMPILE_PCRE16
7602      if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0)
7603        { skipatstart += 8; options |= PCRE_UTF16; continue; }
7604    #endif
7605    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
7606      { skipatstart += 6; options |= PCRE_UCP; continue; }      { skipatstart += 6; options |= PCRE_UCP; continue; }
7607    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
# Line 7468  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 7633  while (ptr[skipatstart] == CHAR_LEFT_PAR
7633  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
7634  utf = (options & PCRE_UTF8) != 0;  utf = (options & PCRE_UTF8) != 0;
7635    
7636  /* Can't support UTF8 unless PCRE has been compiled to include the code. The  /* Can't support UTF unless PCRE has been compiled to include the code. The
7637  return of an error code from PRIV(valid_utf)() is a new feature, introduced in  return of an error code from PRIV(valid_utf)() is a new feature, introduced in
7638  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7639  not used here. */  not used here. */
7640    
7641  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
7642  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7643       (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)       (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
7644    {    {
# Line 7573  cd->bracount = cd->final_bracount = 0; Line 7738  cd->bracount = cd->final_bracount = 0;
7738  cd->names_found = 0;  cd->names_found = 0;
7739  cd->name_entry_size = 0;  cd->name_entry_size = 0;
7740  cd->name_table = NULL;  cd->name_table = NULL;
 cd->start_workspace = cworkspace;  
7741  cd->start_code = cworkspace;  cd->start_code = cworkspace;
7742  cd->hwm = cworkspace;  cd->hwm = cworkspace;
7743    cd->start_workspace = cworkspace;
7744    cd->workspace_size = COMPILE_WORK_SIZE;
7745  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
7746  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7747  cd->req_varyopt = 0;  cd->req_varyopt = 0;
# Line 7610  externally provided function. Integer ov Line 7776  externally provided function. Integer ov
7776  because nowadays we limit the maximum value of cd->names_found and  because nowadays we limit the maximum value of cd->names_found and
7777  cd->name_entry_size. */  cd->name_entry_size. */
7778    
7779  size = sizeof(real_pcre) + (length + cd->names_found * (cd->name_entry_size + 3)) * sizeof(pcre_uchar);  size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
7780  re = (real_pcre *)(pcre_malloc)(size);  re = (real_pcre *)(PUBL(malloc))(size);
7781    
7782  if (re == NULL)  if (re == NULL)
7783    {    {
# Line 7653  cd->names_found = 0; Line 7819  cd->names_found = 0;
7819  cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7820  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
7821  cd->start_code = codestart;  cd->start_code = codestart;
7822  cd->hwm = cworkspace;  cd->hwm = (pcre_uchar *)(cd->start_workspace);
7823  cd->req_varyopt = 0;  cd->req_varyopt = 0;
7824  cd->had_accept = FALSE;  cd->had_accept = FALSE;
7825  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
# Line 7670  code = (pcre_uchar *)codestart; Line 7836  code = (pcre_uchar *)codestart;
7836    &firstchar, &reqchar, NULL, cd, NULL);    &firstchar, &reqchar, NULL, cd, NULL);
7837  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
7838  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
7839  re->flags = cd->external_flags;  re->flags = cd->external_flags | PCRE_MODE;
7840    
7841  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
7842    
# Line 7687  if debugging, leave the test till after Line 7853  if debugging, leave the test till after
7853  if (code - codestart > length) errorcode = ERR23;  if (code - codestart > length) errorcode = ERR23;
7854  #endif  #endif
7855    
7856  /* Fill in any forward references that are required. */  /* Fill in any forward references that are required. There may be repeated
7857    references; optimize for them, as searching a large regex takes time. */
7858    
7859  while (errorcode == 0 && cd->hwm > cworkspace)  if (cd->hwm > cd->start_workspace)
7860    {    {
7861    int offset, recno;    int prev_recno = -1;
7862    const pcre_uchar *groupptr;    const pcre_uchar *groupptr = NULL;
7863    cd->hwm -= LINK_SIZE;    while (errorcode == 0 && cd->hwm > cd->start_workspace)
7864    offset = GET(cd->hwm, 0);      {
7865    recno = GET(codestart, offset);      int offset, recno;
7866    groupptr = PRIV(find_bracket)(codestart, utf, recno);      cd->hwm -= LINK_SIZE;
7867    if (groupptr == NULL) errorcode = ERR53;      offset = GET(cd->hwm, 0);
7868      else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));      recno = GET(codestart, offset);
7869        if (recno != prev_recno)
7870          {
7871          groupptr = PRIV(find_bracket)(codestart, utf, recno);
7872          prev_recno = recno;
7873          }
7874        if (groupptr == NULL) errorcode = ERR53;
7875          else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
7876        }
7877    }    }
7878    
7879    /* If the workspace had to be expanded, free the new memory. */
7880    
7881    if (cd->workspace_size > COMPILE_WORK_SIZE)
7882      (PUBL(free))((void *)cd->start_workspace);
7883    
7884  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
7885  subpattern. */  subpattern. */
7886    
# Line 7753  if (cd->check_lookbehind) Line 7933  if (cd->check_lookbehind)
7933    
7934  if (errorcode != 0)  if (errorcode != 0)
7935    {    {
7936    (pcre_free)(re);    (PUBL(free))(re);
7937    PCRE_EARLY_ERROR_RETURN:    PCRE_EARLY_ERROR_RETURN:
7938    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
7939    PCRE_EARLY_ERROR_RETURN2:    PCRE_EARLY_ERROR_RETURN2:
# Line 7789  if ((re->options & PCRE_ANCHORED) == 0) Line 7969  if ((re->options & PCRE_ANCHORED) == 0)
7969        re->first_char = firstchar & 0xffff;        re->first_char = firstchar & 0xffff;
7970  #endif  #endif
7971  #endif  #endif
7972        if ((firstchar & REQ_CASELESS) != 0 && MAX_255(re->first_char)        if ((firstchar & REQ_CASELESS) != 0)
7973          && cd->fcc[re->first_char] != re->first_char)          {
7974          re->flags |= PCRE_FCH_CASELESS;  #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
7975            /* We ignore non-ASCII first chars in 8 bit mode. */
7976            if (utf)
7977              {
7978              if (re->first_char < 128)
7979                {
7980                if (cd->fcc[re->first_char] != re->first_char)
7981                  re->flags |= PCRE_FCH_CASELESS;
7982                }
7983              else if (UCD_OTHERCASE(re->first_char) != re->first_char)
7984                re->flags |= PCRE_FCH_CASELESS;
7985              }
7986            else
7987    #endif
7988            if (MAX_255(re->first_char)
7989                && cd->fcc[re->first_char] != re->first_char)
7990              re->flags |= PCRE_FCH_CASELESS;
7991            }
7992    
7993        re->flags |= PCRE_FIRSTSET;        re->flags |= PCRE_FIRSTSET;
7994        }        }
# Line 7814  if (reqchar >= 0 && Line 8011  if (reqchar >= 0 &&
8011    re->req_char = reqchar & 0xffff;    re->req_char = reqchar & 0xffff;
8012  #endif  #endif
8013  #endif  #endif
8014    if ((reqchar & REQ_CASELESS) != 0 && MAX_255(re->req_char)    if ((reqchar & REQ_CASELESS) != 0)
8015      && cd->fcc[re->req_char] != re->req_char)      {
8016      re->flags |= PCRE_RCH_CASELESS;  #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
8017        /* We ignore non-ASCII required chars in 8 bit mode. */
8018        if (utf)
8019          {
8020          if (re->req_char < 128)
8021            {
8022            if (cd->fcc[re->req_char] != re->req_char)
8023              re->flags |= PCRE_RCH_CASELESS;
8024            }
8025          else if (UCD_OTHERCASE(re->req_char) != re->req_char)
8026            re->flags |= PCRE_RCH_CASELESS;
8027          }
8028        else
8029    #endif
8030        if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
8031          re->flags |= PCRE_RCH_CASELESS;
8032        }
8033    
8034    re->flags |= PCRE_REQCHSET;    re->flags |= PCRE_REQCHSET;
8035    }    }
# Line 7855  was compiled can be seen. */ Line 8068  was compiled can be seen. */
8068    
8069  if (code - codestart > length)  if (code - codestart > length)
8070    {    {
8071    (pcre_free)(re);    (PUBL(free))(re);
8072    *errorptr = find_error_text(ERR23);    *errorptr = find_error_text(ERR23);
8073    *erroroffset = ptr - (pcre_uchar *)pattern;    *erroroffset = ptr - (pcre_uchar *)pattern;
8074    if (errorcodeptr != NULL) *errorcodeptr = ERR23;    if (errorcodeptr != NULL) *errorcodeptr = ERR23;

Legend:
Removed from v.782  
changed lines
  Added in v.804

  ViewVC Help
Powered by ViewVC 1.1.5