/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 794 by zherczeg, Thu Dec 8 07:36:41 2011 UTC revision 805 by ph10, Wed Dec 14 16:49:20 2011 UTC
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  #include "pcre_printint.c"
63  #endif  #endif
64    
65    
# Line 88  so this number is very generous. Line 89  so this number is very generous.
89  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
90  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
91  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
92  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
93    filled up by repetitions of forward references, for example patterns like
94    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
95    that the workspace is expanded using malloc() in this situation. The value
96    below is therefore a minimum, and we put a maximum on it for safety. The
97    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
98    kicks in at the same number of forward references in all cases. */
99    
100  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
101    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
102    
103  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
104  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
105    
106  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
107    
108  /* Private flags added to firstchar and reqchar. */  /* Private flags added to firstchar and reqchar. */
109    
# Line 474  static const char error_texts[] = Line 482  static const char error_texts[] =
482    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
483    /* 70 */    /* 70 */
484    "internal error: unknown opcode in find_fixedlength()\0"    "internal error: unknown opcode in find_fixedlength()\0"
485    "Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff)\0"    "\\N is not supported in a class\0"
486      "too many forward references\0"
487      "disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff)\0"
488    ;    ;
489    
490  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 649  return s; Line 659  return s;
659    
660    
661  /*************************************************  /*************************************************
662    *           Expand the workspace                 *
663    *************************************************/
664    
665    /* This function is called during the second compiling phase, if the number of
666    forward references fills the existing workspace, which is originally a block on
667    the stack. A larger block is obtained from malloc() unless the ultimate limit
668    has been reached or the increase will be rather small.
669    
670    Argument: pointer to the compile data block
671    Returns:  0 if all went well, else an error number
672    */
673    
674    static int
675    expand_workspace(compile_data *cd)
676    {
677    pcre_uchar *newspace;
678    int newsize = cd->workspace_size * 2;
679    
680    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
681    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
682        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
683     return ERR72;
684    
685    newspace = (PUBL(malloc))(newsize);
686    if (newspace == NULL) return ERR21;
687    
688    memcpy(newspace, cd->start_workspace, cd->workspace_size);
689    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
690    if (cd->workspace_size > COMPILE_WORK_SIZE)
691      (PUBL(free))((void *)cd->start_workspace);
692    cd->start_workspace = newspace;
693    cd->workspace_size = newsize;
694    return 0;
695    }
696    
697    
698    
699    /*************************************************
700  *            Check for counted repeat            *  *            Check for counted repeat            *
701  *************************************************/  *************************************************/
702    
# Line 1013  else Line 1061  else
1061    
1062        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1063          {          {
1064          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR71;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1065          ptr = pt;          ptr = pt;
1066          break;          break;
1067          }          }
# Line 1811  for (;;) Line 1859  for (;;)
1859      cc++;      cc++;
1860      break;      break;
1861    
1862      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1863      otherwise \C is coded as OP_ALLANY. */      otherwise \C is coded as OP_ALLANY. */
1864    
1865      case OP_ANYBYTE:      case OP_ANYBYTE:
# Line 2357  for (code = first_significant_code(code Line 2405  for (code = first_significant_code(code
2405      actual length is stored in the compiled code, so we must update "code"      actual length is stored in the compiled code, so we must update "code"
2406      here. */      here. */
2407    
2408  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2409      case OP_XCLASS:      case OP_XCLASS:
2410      ccode = code += GET(code, 1);      ccode = code += GET(code, 1);
2411      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
# Line 2367  for (code = first_significant_code(code Line 2415  for (code = first_significant_code(code
2415      case OP_NCLASS:      case OP_NCLASS:
2416      ccode = code + PRIV(OP_lengths)[OP_CLASS];      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2417    
2418  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2419      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2420  #endif  #endif
2421    
# Line 2909  if ((options & PCRE_EXTENDED) != 0) Line 2957  if ((options & PCRE_EXTENDED) != 0)
2957    {    {
2958    for (;;)    for (;;)
2959      {      {
2960      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2961      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2962        {        {
2963        ptr++;        ptr++;
# Line 2951  if ((options & PCRE_EXTENDED) != 0) Line 2999  if ((options & PCRE_EXTENDED) != 0)
2999    {    {
3000    for (;;)    for (;;)
3001      {      {
3002      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3003      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3004        {        {
3005        ptr++;        ptr++;
# Line 2980  the next item is a character. */ Line 3028  the next item is a character. */
3028  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
3029    {    {
3030    case OP_CHAR:    case OP_CHAR:
3031  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3032    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3033  #else  #else
3034    c = *previous;    c = *previous;
# Line 2992  if (next >= 0) switch(op_code) Line 3040  if (next >= 0) switch(op_code)
3040    high-valued characters. */    high-valued characters. */
3041    
3042    case OP_CHARI:    case OP_CHARI:
3043  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3044    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3045  #else  #else
3046    c = *previous;    c = *previous;
3047  #endif  #endif
3048    if (c == next) return FALSE;    if (c == next) return FALSE;
3049  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3050    if (utf)    if (utf)
3051      {      {
3052      unsigned int othercase;      unsigned int othercase;
# Line 3011  if (next >= 0) switch(op_code) Line 3059  if (next >= 0) switch(op_code)
3059      return (unsigned int)c != othercase;      return (unsigned int)c != othercase;
3060      }      }
3061    else    else
3062  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3063    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
3064    
3065    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
# Line 3023  if (next >= 0) switch(op_code) Line 3071  if (next >= 0) switch(op_code)
3071    
3072    case OP_NOTI:    case OP_NOTI:
3073    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
3074  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3075    if (utf)    if (utf)
3076      {      {
3077      unsigned int othercase;      unsigned int othercase;
# Line 3036  if (next >= 0) switch(op_code) Line 3084  if (next >= 0) switch(op_code)
3084      return (unsigned int)c == othercase;      return (unsigned int)c == othercase;
3085      }      }
3086    else    else
3087  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
3088    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
3089    
3090    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
# Line 3128  switch(op_code) Line 3176  switch(op_code)
3176    {    {
3177    case OP_CHAR:    case OP_CHAR:
3178    case OP_CHARI:    case OP_CHARI:
3179  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3180    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
3181  #else  #else
3182    c = *previous;    c = *previous;
# Line 3358  pcre_uint8 classbits[32]; Line 3406  pcre_uint8 classbits[32];
3406  must not do this for other options (e.g. PCRE_EXTENDED) because they may change  must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3407  dynamically as we process the pattern. */  dynamically as we process the pattern. */
3408    
3409  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3410  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3411  BOOL utf = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
3412  pcre_uchar utf_chars[6];  pcre_uchar utf_chars[6];
# Line 3413  for (;; ptr++) Line 3461  for (;; ptr++)
3461    BOOL is_quantifier;    BOOL is_quantifier;
3462    BOOL is_recurse;    BOOL is_recurse;
3463    BOOL reset_bracount;    BOOL reset_bracount;
3464    int class_charcount;    int class_has_8bitchar;
3465    int class_lastchar;    int class_single_char;
3466    int newoptions;    int newoptions;
3467    int recno;    int recno;
3468    int refsign;    int refsign;
# Line 3448  for (;; ptr++) Line 3496  for (;; ptr++)
3496  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3497      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3498  #endif  #endif
3499      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3500            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3501        {        {
3502        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3503        goto FAILED;        goto FAILED;
# Line 3473  for (;; ptr++) Line 3522  for (;; ptr++)
3522      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3523      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3524        (int)(code - last_code), c, c));        (int)(code - last_code), c, c));
3525    
3526      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3527      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
3528      if "previous" is NULL, reset the current code pointer to the start. */      if "previous" is NULL, reset the current code pointer to the start. */
# Line 3498  for (;; ptr++) Line 3547  for (;; ptr++)
3547    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3548    reference list. */    reference list. */
3549    
3550    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3551               WORK_SIZE_SAFETY_MARGIN)
3552      {      {
3553      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3554      goto FAILED;      goto FAILED;
# Line 3550  for (;; ptr++) Line 3600  for (;; ptr++)
3600    
3601    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3602      {      {
3603      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3604      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3605        {        {
3606        ptr++;        ptr++;
# Line 3710  for (;; ptr++) Line 3760  for (;; ptr++)
3760    
3761      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3762    
3763      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3764      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3765      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3766        a single character. */
3767    
3768      class_charcount = 0;      class_has_8bitchar = 0;
3769      class_lastchar = -1;      class_single_char = 0;
3770    
3771      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3772      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
# Line 3870  for (;; ptr++) Line 3921  for (;; ptr++)
3921            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3922    
3923          ptr = tempptr + 1;          ptr = tempptr + 1;
3924          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3925            class_has_8bitchar = 1;
3926            /* Every class contains at least two characters. */
3927            class_single_char = 2;
3928          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3929          }          }
3930    
3931        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3932        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3933        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3934        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3935        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3936        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3937          as literal characters (by default), or are faulted if
3938        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3939    
3940        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3888  for (;; ptr++) Line 3943  for (;; ptr++)
3943          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3944    
3945          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3946            else if (-c == ESC_N)            /* \N is not supported in a class */
3947              {
3948              *errorcodeptr = ERR71;
3949              goto FAILED;
3950              }
3951          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3952            {            {
3953            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3902  for (;; ptr++) Line 3962  for (;; ptr++)
3962          if (c < 0)          if (c < 0)
3963            {            {
3964            register const pcre_uint8 *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3965            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
3966              class_has_8bitchar++;
3967              /* Every class contains at least two characters. */
3968              class_single_char += 2;
3969    
3970            switch (-c)            switch (-c)
3971              {              {
# Line 3915  for (;; ptr++) Line 3978  for (;; ptr++)
3978              case ESC_SU:              case ESC_SU:
3979              nestptr = ptr;              nestptr = ptr;
3980              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3981              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
3982              continue;              continue;
3983  #endif  #endif
3984              case ESC_d:              case ESC_d:
# Line 4081  for (;; ptr++) Line 4144  for (;; ptr++)
4144                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4145                *class_uchardata++ = ptype;                *class_uchardata++ = ptype;
4146                *class_uchardata++ = pdata;                *class_uchardata++ = pdata;
4147                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4148                continue;                continue;
4149                }                }
4150  #endif  #endif
# Line 4095  for (;; ptr++) Line 4158  for (;; ptr++)
4158                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4159                goto FAILED;                goto FAILED;
4160                }                }
4161              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4162              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4163                c = *ptr;                /* Get the final character and fall through */
4164              break;              break;
4165              }              }
4166            }            }
4167    
4168          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4169          greater than 256 mode. */          greater than 256. */
4170    
4171          }   /* End of backslash handling */          }   /* End of backslash handling */
4172    
# Line 4150  for (;; ptr++) Line 4214  for (;; ptr++)
4214            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
4215            }            }
4216    
4217  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
4218          if (utf)          if (utf)
4219            {                           /* Braces are required because the */            {                           /* Braces are required because the */
4220            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
# Line 4195  for (;; ptr++) Line 4259  for (;; ptr++)
4259    
4260          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4261    
4262            /* Since we found a character range, single character optimizations
4263            cannot be done anymore. */
4264            class_single_char = 2;
4265    
4266          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4267          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4268          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
4269          available. */          available. */
4270    
4271  #ifdef SUPPORT_UTF  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4272            if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4273    #elif defined  SUPPORT_UTF
4274          if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))          if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4275  #elif !(defined COMPILE_PCRE8)  #elif !(defined COMPILE_PCRE8)
4276          if (d > 255)          if (d > 255)
# Line 4214  for (;; ptr++) Line 4284  for (;; ptr++)
4284            they fit with the basic range. */            they fit with the basic range. */
4285    
4286  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4287    #ifndef COMPILE_PCRE8
4288              if (utf && (options & PCRE_CASELESS) != 0)
4289    #else
4290            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4291    #endif
4292              {              {
4293              unsigned int occ, ocd;              unsigned int occ, ocd;
4294              unsigned int cc = c;              unsigned int cc = c;
# Line 4257  for (;; ptr++) Line 4331  for (;; ptr++)
4331    
4332            *class_uchardata++ = XCL_RANGE;            *class_uchardata++ = XCL_RANGE;
4333  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4334    #ifndef COMPILE_PCRE8
4335              if (utf)
4336                {
4337                class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4338                class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4339                }
4340              else
4341                {
4342                *class_uchardata++ = c;
4343                *class_uchardata++ = d;
4344                }
4345    #else
4346            class_uchardata += PRIV(ord2utf)(c, class_uchardata);            class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4347            class_uchardata += PRIV(ord2utf)(d, class_uchardata);            class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4348  #else  #endif
4349    #else /* SUPPORT_UTF */
4350            *class_uchardata++ = c;            *class_uchardata++ = c;
4351            *class_uchardata++ = d;            *class_uchardata++ = d;
4352  #endif  #endif /* SUPPORT_UTF */
4353    
4354            /* With UCP support, we are done. Without UCP support, there is no            /* With UCP support, we are done. Without UCP support, there is no
4355            caseless matching for UTF characters > 127; we can use the bit map            caseless matching for UTF characters > 127; we can use the bit map
# Line 4270  for (;; ptr++) Line 4357  for (;; ptr++)
4357            can still use  */            can still use  */
4358    
4359  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4360            continue;    /* With next character in the class */  #ifndef COMPILE_PCRE8
4361  #else            if (utf)
4362  #ifdef SUPPORT_UTF  #endif
4363                continue;    /* With next character in the class */
4364    #endif  /* SUPPORT_UCP */
4365    
4366    #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4367              if (utf)
4368                {
4369                if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4370                /* Adjust upper limit and fall through to set up the map */
4371                d = 127;
4372                }
4373              else
4374                {
4375                if (c > 255) continue;
4376                /* Adjust upper limit and fall through to set up the map */
4377                d = 255;
4378                }
4379    #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4380            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4381            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
4382            d = 127;            d = 127;
# Line 4280  for (;; ptr++) Line 4384  for (;; ptr++)
4384            if (c > 255) continue;            if (c > 255) continue;
4385            /* Adjust upper limit and fall through to set up the map */            /* Adjust upper limit and fall through to set up the map */
4386            d = 255;            d = 255;
4387  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
 #endif  /* SUPPORT_UCP */  
4388            }            }
4389  #endif  /* SUPPORT_UTF8 || COMPILE_PCRE16 */  #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4390    
4391          /* We use the bit map for 8 bit mode, or when the characters fall          /* We use the bit map for 8 bit mode, or when the characters fall
4392          partially or entirely to [0-255] ([0-127] for UCP) ranges. */          partially or entirely to [0-255] ([0-127] for UCP) ranges. */
4393    
4394          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4395    
4396          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4397    
# Line 4312  for (;; ptr++) Line 4414  for (;; ptr++)
4414    
4415        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4416    
4417        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4418          if (class_single_char < 2) class_single_char++;
4419    
4420          If class_single_char is 1, we saw precisely one character. As long as
4421          there were no negated characters >= 128 and there was no use of \p or \P,
4422          in other words, no use of any XCLASS features, we can optimize.
4423    
4424          In UTF-8 mode, we can optimize the negative case only if there were no
4425          characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4426          operate on single-bytes characters only. This is an historical hangover.
4427          Maybe one day we can tidy these opcodes to handle multi-byte characters.
4428    
4429          The optimization throws away the bit map. We turn the item into a
4430          1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4431          Note that OP_NOT[I] does not support multibyte characters. In the positive
4432          case, it can cause firstchar to be set. Otherwise, there can be no first
4433          char if this item is first, whatever repeat count may follow. In the case
4434          of reqchar, save the previous value for reinstating. */
4435    
4436    #ifdef SUPPORT_UTF
4437          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
4438            && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4439    #else
4440          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4441    #endif
4442            {
4443            ptr++;
4444            zeroreqchar = reqchar;
4445    
4446            /* The OP_NOT[I] opcodes work on single characters only. */
4447    
4448            if (negate_class)
4449              {
4450              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4451              zerofirstchar = firstchar;
4452              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4453              *code++ = c;
4454              goto NOT_CHAR;
4455              }
4456    
4457            /* For a single, positive character, get the value into mcbuffer, and
4458            then we can handle this with the normal one-character code. */
4459    
4460  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4461            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4462              mclength = PRIV(ord2utf)(c, mcbuffer);
4463            else
4464    #endif
4465              {
4466              mcbuffer[0] = c;
4467              mclength = 1;
4468              }
4469            goto ONE_CHAR;
4470            }       /* End of 1-char optimization */
4471    
4472          /* Handle a character that cannot go in the bit map. */
4473    
4474    #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4475          if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4476    #elif defined SUPPORT_UTF
4477        if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))        if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4478  #elif !(defined COMPILE_PCRE8)  #elif !(defined COMPILE_PCRE8)
4479        if (c > 255)        if (c > 255)
4480  #endif  #endif
4481    
4482  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4483          {          {
4484          xclass = TRUE;          xclass = TRUE;
4485          *class_uchardata++ = XCL_SINGLE;          *class_uchardata++ = XCL_SINGLE;
4486  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4487          class_uchardata += PRIV(ord2utf)(c, class_uchardata);  #ifndef COMPILE_PCRE8
4488  #else          /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4489          *class_uchardata++ = c;          if (!utf)
4490              *class_uchardata++ = c;
4491            else
4492  #endif  #endif
4493              class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4494    #else /* SUPPORT_UTF */
4495            *class_uchardata++ = c;
4496    #endif /* SUPPORT_UTF */
4497    
4498  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4499    #ifdef COMPILE_PCRE8
4500          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4501    #else
4502            /* In non-8-bit mode, we can get here even if we are not in UTF mode. */
4503            if (utf && (options & PCRE_CASELESS) != 0)
4504    #endif
4505            {            {
4506            unsigned int othercase;            unsigned int othercase;
4507            if ((othercase = UCD_OTHERCASE(c)) != c)            if ((othercase = UCD_OTHERCASE(c)) != c)
# Line 4344  for (;; ptr++) Line 4515  for (;; ptr++)
4515          }          }
4516        else        else
4517  #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */  #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
4518    
4519        /* Handle a single-byte character */        /* Handle a single-byte character */
4520          {          {
4521            class_has_8bitchar = 1;
4522          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4523          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4524            {            {
4525            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c];   /* flip case */
4526            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4527            }            }
         class_charcount++;  
         class_lastchar = c;  
4528          }          }
   
4529        }        }
4530    
4531      /* Loop until ']' reached. This "while" is the end of the "do" far above.      /* Loop until ']' reached. This "while" is the end of the "do" far above.
# Line 4375  for (;; ptr++) Line 4545  for (;; ptr++)
4545        goto FAILED;        goto FAILED;
4546        }        }
4547    
4548      /* If class_charcount is 1, we saw precisely one character whose value is      /* If this is the first thing in the branch, there can be no first char
4549      less than 256. As long as there were no characters >= 128 and there was no      setting, whatever the repeat count. Any reqchar setting must remain
4550      use of \p or \P, in other words, no use of any XCLASS features, we can      unchanged after any kind of repeat. */
     optimize.  
   
     In UTF-8 mode, we can optimize the negative case only if there were no  
     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR  
     operate on single-bytes characters only. This is an historical hangover.  
     Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
     Note that OP_NOT[I] does not support multibyte characters. In the positive  
     case, it can cause firstchar to be set. Otherwise, there can be no first  
     char if this item is first, whatever repeat count may follow. In the case  
     of reqchar, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF  
     if (class_charcount == 1 && !xclass &&  
       (!utf || !negate_class || class_lastchar < 128))  
 #else  
     if (class_charcount == 1)  
 #endif  
       {  
       zeroreqchar = reqchar;  
   
       /* The OP_NOT[I] opcodes work on one-byte characters only. */  
   
       if (negate_class)  
         {  
         if (firstchar == REQ_UNSET) firstchar = REQ_NONE;  
         zerofirstchar = firstchar;  
         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF8  
       if (utf && class_lastchar > 127)  
         mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqchar setting must remain unchanged after any kind of  
     repeat. */  
4551    
4552      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4553      zerofirstchar = firstchar;      zerofirstchar = firstchar;
# Line 4460  for (;; ptr++) Line 4577  for (;; ptr++)
4577        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4578        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4579    
4580        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4581          {          {
4582          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4583          memmove(code + (32 / sizeof(pcre_uchar)), code,          memmove(code + (32 / sizeof(pcre_uchar)), code,
# Line 4472  for (;; ptr++) Line 4589  for (;; ptr++)
4589    
4590        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4591    
4592        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4593        break;   /* End of class handling */        break;   /* End of class handling */
4594        }        }
4595  #endif  #endif
# Line 4491  for (;; ptr++) Line 4608  for (;; ptr++)
4608        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4609        }        }
4610      code += 32 / sizeof(pcre_uchar);      code += 32 / sizeof(pcre_uchar);
4611        NOT_CHAR:
4612      break;      break;
4613    
4614    
# Line 4567  for (;; ptr++) Line 4685  for (;; ptr++)
4685      past, but it no longer happens for non-repeated recursions. In fact, the      past, but it no longer happens for non-repeated recursions. In fact, the
4686      repeated ones could be re-implemented independently so as not to need this,      repeated ones could be re-implemented independently so as not to need this,
4687      but for the moment we rely on the code for repeating groups. */      but for the moment we rely on the code for repeating groups. */
4688    
4689      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4690        {        {
4691        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
# Line 4611  for (;; ptr++) Line 4729  for (;; ptr++)
4729          {          {
4730          pcre_uchar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4731          BACKCHAR(lastchar);          BACKCHAR(lastchar);
4732          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4733          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4734          c |= UTF_LENGTH;                /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4735          }          }
# Line 4843  for (;; ptr++) Line 4961  for (;; ptr++)
4961    
4962      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS ||
4963               *previous == OP_NCLASS ||               *previous == OP_NCLASS ||
4964  #if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4965               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
4966  #endif  #endif
4967               *previous == OP_REF ||               *previous == OP_REF ||
# Line 5017  for (;; ptr++) Line 5135  for (;; ptr++)
5135              *lengthptr += delta;              *lengthptr += delta;
5136              }              }
5137    
5138            /* This is compiling for real */            /* This is compiling for real. If there is a set first byte for
5139              the group, and we have not yet set a "required byte", set it. Make
5140              sure there is enough workspace for copying forward references before
5141              doing the copy. */
5142    
5143            else            else
5144              {              {
5145              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5146    
5147              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5148                {                {
5149                pcre_uchar *hc;                pcre_uchar *hc;
5150                pcre_uchar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5151                memcpy(code, previous, IN_UCHARS(len));                memcpy(code, previous, IN_UCHARS(len));
5152    
5153                  while (cd->hwm > cd->start_workspace + cd->workspace_size -
5154                         WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5155                    {
5156                    int save_offset = save_hwm - cd->start_workspace;
5157                    int this_offset = this_hwm - cd->start_workspace;
5158                    *errorcodeptr = expand_workspace(cd);
5159                    if (*errorcodeptr != 0) goto FAILED;
5160                    save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5161                    this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5162                    }
5163    
5164                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5165                  {                  {
5166                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 5094  for (;; ptr++) Line 5228  for (;; ptr++)
5228              }              }
5229    
5230            memcpy(code, previous, IN_UCHARS(len));            memcpy(code, previous, IN_UCHARS(len));
5231    
5232              /* Ensure there is enough workspace for forward references before
5233              copying them. */
5234    
5235              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5236                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5237                {
5238                int save_offset = save_hwm - cd->start_workspace;
5239                int this_offset = this_hwm - cd->start_workspace;
5240                *errorcodeptr = expand_workspace(cd);
5241                if (*errorcodeptr != 0) goto FAILED;
5242                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5243                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5244                }
5245    
5246            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5247              {              {
5248              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 5124  for (;; ptr++) Line 5273  for (;; ptr++)
5273        ONCE brackets can be converted into non-capturing brackets, as the        ONCE brackets can be converted into non-capturing brackets, as the
5274        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5275        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
5276    
5277        Otherwise, when we are doing the actual compile phase, check to see        Otherwise, when we are doing the actual compile phase, check to see
5278        whether this group is one that could match an empty string. If so,        whether this group is one that could match an empty string. If so,
5279        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5280        that runtime checking can be done. [This check is also applied to ONCE        that runtime checking can be done. [This check is also applied to ONCE
5281        groups at runtime, but in a different way.]        groups at runtime, but in a different way.]
5282    
5283        Then, if the quantifier was possessive and the bracket is not a        Then, if the quantifier was possessive and the bracket is not a
5284        conditional, we convert the BRA code to the POS form, and the KET code to        conditional, we convert the BRA code to the POS form, and the KET code to
5285        KETRPOS. (It turns out to be convenient at runtime to detect this kind of        KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5286        subpattern at both the start and at the end.) The use of special opcodes        subpattern at both the start and at the end.) The use of special opcodes
5287        makes it possible to reduce greatly the stack usage in pcre_exec(). If        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5288        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5289    
5290        Then, if the minimum number of matches is 1 or 0, cancel the possessive        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5291        flag so that the default action below, of wrapping everything inside        flag so that the default action below, of wrapping everything inside
5292        atomic brackets, does not happen. When the minimum is greater than 1,        atomic brackets, does not happen. When the minimum is greater than 1,
5293        there will be earlier copies of the group, and so we still have to wrap        there will be earlier copies of the group, and so we still have to wrap
5294        the whole thing. */        the whole thing. */
5295    
5296        else        else
# Line 5150  for (;; ptr++) Line 5299  for (;; ptr++)
5299          pcre_uchar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5300    
5301          /* Convert possessive ONCE brackets to non-capturing */          /* Convert possessive ONCE brackets to non-capturing */
5302    
5303          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5304              possessive_quantifier) *bracode = OP_BRA;              possessive_quantifier) *bracode = OP_BRA;
5305    
5306          /* For non-possessive ONCE brackets, all we need to do is to          /* For non-possessive ONCE brackets, all we need to do is to
5307          set the KET. */          set the KET. */
5308    
5309          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5310            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5311    
5312          /* Handle non-ONCE brackets and possessive ONCEs (which have been          /* Handle non-ONCE brackets and possessive ONCEs (which have been
5313          converted to non-capturing above). */          converted to non-capturing above). */
5314    
5315          else          else
5316            {            {
5317            /* In the compile phase, check for empty string matching. */            /* In the compile phase, check for empty string matching. */
5318    
5319            if (lengthptr == NULL)            if (lengthptr == NULL)
5320              {              {
5321              pcre_uchar *scode = bracode;              pcre_uchar *scode = bracode;
# Line 5181  for (;; ptr++) Line 5330  for (;; ptr++)
5330                }                }
5331              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5332              }              }
5333    
5334            /* Handle possessive quantifiers. */            /* Handle possessive quantifiers. */
5335    
5336            if (possessive_quantifier)            if (possessive_quantifier)
# Line 5190  for (;; ptr++) Line 5339  for (;; ptr++)
5339              repeated non-capturing bracket, because we have not invented POS              repeated non-capturing bracket, because we have not invented POS
5340              versions of the COND opcodes. Because we are moving code along, we              versions of the COND opcodes. Because we are moving code along, we
5341              must ensure that any pending recursive references are updated. */              must ensure that any pending recursive references are updated. */
5342    
5343              if (*bracode == OP_COND || *bracode == OP_SCOND)              if (*bracode == OP_COND || *bracode == OP_SCOND)
5344                {                {
5345                int nlen = (int)(code - bracode);                int nlen = (int)(code - bracode);
# Line 5203  for (;; ptr++) Line 5352  for (;; ptr++)
5352                *code++ = OP_KETRPOS;                *code++ = OP_KETRPOS;
5353                PUTINC(code, 0, nlen);                PUTINC(code, 0, nlen);
5354                PUT(bracode, 1, nlen);                PUT(bracode, 1, nlen);
5355                }                }
5356    
5357              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5358    
5359              else              else
5360                {                {
5361                *bracode += 1;              /* Switch to xxxPOS opcodes */                *bracode += 1;              /* Switch to xxxPOS opcodes */
5362                *ketcode = OP_KETRPOS;                *ketcode = OP_KETRPOS;
5363                }                }
5364    
5365              /* If the minimum is zero, mark it as possessive, then unset the              /* If the minimum is zero, mark it as possessive, then unset the
5366              possessive flag when the minimum is 0 or 1. */              possessive flag when the minimum is 0 or 1. */
5367    
5368              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5369              if (repeat_min < 2) possessive_quantifier = FALSE;              if (repeat_min < 2) possessive_quantifier = FALSE;
5370              }              }
5371    
5372            /* Non-possessive quantifier */            /* Non-possessive quantifier */
5373    
5374            else *ketcode = OP_KETRMAX + repeat_type;            else *ketcode = OP_KETRMAX + repeat_type;
5375            }            }
5376          }          }
# Line 5347  for (;; ptr++) Line 5496  for (;; ptr++)
5496    
5497      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
5498    
5499      if (*(++ptr) == CHAR_ASTERISK &&      ptr++;
5500           ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5501             || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5502        {        {
5503        int i, namelen;        int i, namelen;
5504        int arglen = 0;        int arglen = 0;
# Line 5356  for (;; ptr++) Line 5506  for (;; ptr++)
5506        const pcre_uchar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5507        const pcre_uchar *arg = NULL;        const pcre_uchar *arg = NULL;
5508        previous = NULL;        previous = NULL;
5509        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        ptr++;
5510          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5511        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5512    
5513        /* It appears that Perl allows any characters whatsoever, other than        /* It appears that Perl allows any characters whatsoever, other than
# Line 5542  for (;; ptr++) Line 5693  for (;; ptr++)
5693    
5695          /* We now expect to read a name; anything else is an error */          /* We now expect to read a name; anything else is an error */
5695    
5696          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5697            {            {
5698            ptr += 1;  /* To get the right offset */            ptr += 1;  /* To get the right offset */
5699            *errorcodeptr = ERR28;            *errorcodeptr = ERR28;
# Line 5553  for (;; ptr++) Line 5704  for (;; ptr++)
5704    
5705          recno = 0;          recno = 0;
5706          name = ++ptr;          name = ++ptr;
5707          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5708            {            {
5709            if (recno >= 0)            if (recno >= 0)
5710              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
# Line 5724  for (;; ptr++) Line 5875  for (;; ptr++)
5875            break;            break;
5876    
5877            default:                /* Could be name define, else bad */            default:                /* Could be name define, else bad */
5878            if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;            if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5879                goto DEFINE_NAME;
5880            ptr++;                  /* Correct offset for error */            ptr++;                  /* Correct offset for error */
5881            *errorcodeptr = ERR24;            *errorcodeptr = ERR24;
5882            goto FAILED;            goto FAILED;
# Line 5793  for (;; ptr++) Line 5945  for (;; ptr++)
5945              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5946            name = ++ptr;            name = ++ptr;
5947    
5948            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5949            namelen = (int)(ptr - name);            namelen = (int)(ptr - name);
5950    
5951            /* In the pre-compile phase, just do a syntax check. */            /* In the pre-compile phase, just do a syntax check. */
# Line 5923  for (;; ptr++) Line 6075  for (;; ptr++)
6075    
6076          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
6077          name = ++ptr;          name = ++ptr;
6078          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6079          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6080    
6081          /* In the pre-compile phase, do a syntax check. We used to just set          /* In the pre-compile phase, do a syntax check. We used to just set
# Line 6114  for (;; ptr++) Line 6266  for (;; ptr++)
6266                of the group. Then remember the forward reference. */                of the group. Then remember the forward reference. */
6267    
6268                called = cd->start_code + recno;                called = cd->start_code + recno;
6269                  if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6270                      WORK_SIZE_SAFETY_MARGIN)
6271                    {
6272                    *errorcodeptr = expand_workspace(cd);
6273                    if (*errorcodeptr != 0) goto FAILED;
6274                    }
6275                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6276                }                }
6277    
# Line 6134  for (;; ptr++) Line 6292  for (;; ptr++)
6292                }                }
6293              }              }
6294    
6295            /* Insert the recursion/subroutine item. */            /* Insert the recursion/subroutine item. It does not have a set first
6296              character (relevant if it is repeated, because it will then be
6297              wrapped with ONCE brackets). */
6298    
6299            *code = OP_RECURSE;            *code = OP_RECURSE;
6300            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
6301            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
6302              groupsetfirstchar = FALSE;
6303            }            }
6304    
6305          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 6500  for (;; ptr++) Line 6661  for (;; ptr++)
6661            BOOL isnumber = TRUE;            BOOL isnumber = TRUE;
6662            for (p = ptr + 1; *p != 0 && *p != terminator; p++)            for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6663              {              {
6664                if (!MAX_255(*p)) { isnumber = FALSE; break; }
6665              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
6666              if ((cd->ctypes[*p] & ctype_word) == 0) break;              if ((cd->ctypes[*p] & ctype_word) == 0) break;
6667              }              }
# Line 6622  for (;; ptr++) Line 6784  for (;; ptr++)
6784  #endif  #endif
6785          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6786          so that it works in DFA mode and in lookbehinds. */          so that it works in DFA mode and in lookbehinds. */
6787    
6788            {            {
6789            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6790            *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;            *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
6791            }            }
# Line 6635  for (;; ptr++) Line 6797  for (;; ptr++)
6797      a value > 127. We set its representation in the length/buffer, and then      a value > 127. We set its representation in the length/buffer, and then
6798      handle it as a data character. */      handle it as a data character. */
6799    
6800  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
6801      if (utf && c > 127)      if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
6802        mclength = PRIV(ord2utf)(c, mcbuffer);        mclength = PRIV(ord2utf)(c, mcbuffer);
6803      else      else
6804  #endif  #endif
# Line 7376  compile_data *cd = &compile_block; Line 7538  compile_data *cd = &compile_block;
7538  computing the amount of memory that is needed. Compiled items are thrown away  computing the amount of memory that is needed. Compiled items are thrown away
7539  as soon as possible, so that a fairly large buffer should be sufficient for  as soon as possible, so that a fairly large buffer should be sufficient for
7540  this purpose. The same space is used in the second phase for remembering where  this purpose. The same space is used in the second phase for remembering where
7541  to fill in forward references to subpatterns. */  to fill in forward references to subpatterns. That may overflow, in which case
7542    new memory is obtained from malloc(). */
7543    
7544  pcre_uchar cworkspace[COMPILE_WORK_SIZE];  pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7545    
# Line 7471  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 7634  while (ptr[skipatstart] == CHAR_LEFT_PAR
7634  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
7635  utf = (options & PCRE_UTF8) != 0;  utf = (options & PCRE_UTF8) != 0;
7636    
7637  /* Can't support UTF8 unless PCRE has been compiled to include the code. The  /* Can't support UTF unless PCRE has been compiled to include the code. The
7638  return of an error code from PRIV(valid_utf)() is a new feature, introduced in  return of an error code from PRIV(valid_utf)() is a new feature, introduced in
7639  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7640  not used here. */  not used here. */
7641    
7642  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
7643  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7644       (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)       (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
7645    {    {
# Line 7576  cd->bracount = cd->final_bracount = 0; Line 7739  cd->bracount = cd->final_bracount = 0;
7739  cd->names_found = 0;  cd->names_found = 0;
7740  cd->name_entry_size = 0;  cd->name_entry_size = 0;
7741  cd->name_table = NULL;  cd->name_table = NULL;
 cd->start_workspace = cworkspace;  
7742  cd->start_code = cworkspace;  cd->start_code = cworkspace;
7743  cd->hwm = cworkspace;  cd->hwm = cworkspace;
7744    cd->start_workspace = cworkspace;
7745    cd->workspace_size = COMPILE_WORK_SIZE;
7746  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
7747  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7748  cd->req_varyopt = 0;  cd->req_varyopt = 0;
# Line 7614  because nowadays we limit the maximum va Line 7778  because nowadays we limit the maximum va
7778  cd->name_entry_size. */  cd->name_entry_size. */
7779    
7780  size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);  size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
7781  re = (real_pcre *)(pcre_malloc)(size);  re = (real_pcre *)(PUBL(malloc))(size);
7782    
7783  if (re == NULL)  if (re == NULL)
7784    {    {
# Line 7656  cd->names_found = 0; Line 7820  cd->names_found = 0;
7820  cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7821  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
7822  cd->start_code = codestart;  cd->start_code = codestart;
7823  cd->hwm = cworkspace;  cd->hwm = (pcre_uchar *)(cd->start_workspace);
7824  cd->req_varyopt = 0;  cd->req_varyopt = 0;
7825  cd->had_accept = FALSE;  cd->had_accept = FALSE;
7826  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
# Line 7673  code = (pcre_uchar *)codestart; Line 7837  code = (pcre_uchar *)codestart;
7837    &firstchar, &reqchar, NULL, cd, NULL);    &firstchar, &reqchar, NULL, cd, NULL);
7838  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
7839  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
7840  re->flags = cd->external_flags;  re->flags = cd->external_flags | PCRE_MODE;
7841    
7842  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
7843    
# Line 7690  if debugging, leave the test till after Line 7854  if debugging, leave the test till after
7854  if (code - codestart > length) errorcode = ERR23;  if (code - codestart > length) errorcode = ERR23;
7855  #endif  #endif
7856    
7857  /* Fill in any forward references that are required. */  /* Fill in any forward references that are required. There may be repeated
7858    references; optimize for them, as searching a large regex takes time. */
7859    
7860  while (errorcode == 0 && cd->hwm > cworkspace)  if (cd->hwm > cd->start_workspace)
7861    {    {
7862    int offset, recno;    int prev_recno = -1;
7863    const pcre_uchar *groupptr;    const pcre_uchar *groupptr = NULL;
7864    cd->hwm -= LINK_SIZE;    while (errorcode == 0 && cd->hwm > cd->start_workspace)
7865    offset = GET(cd->hwm, 0);      {
7866    recno = GET(codestart, offset);      int offset, recno;
7867    groupptr = PRIV(find_bracket)(codestart, utf, recno);      cd->hwm -= LINK_SIZE;
7868    if (groupptr == NULL) errorcode = ERR53;      offset = GET(cd->hwm, 0);
7869      else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));      recno = GET(codestart, offset);
7870        if (recno != prev_recno)
7871          {
7872          groupptr = PRIV(find_bracket)(codestart, utf, recno);
7873          prev_recno = recno;
7874          }
7875        if (groupptr == NULL) errorcode = ERR53;
7876          else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
7877        }
7878    }    }
7879    
7880    /* If the workspace had to be expanded, free the new memory. */
7881    
7882    if (cd->workspace_size > COMPILE_WORK_SIZE)
7883      (PUBL(free))((void *)cd->start_workspace);
7884    
7885  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
7886  subpattern. */  subpattern. */
7887    
# Line 7756  if (cd->check_lookbehind) Line 7934  if (cd->check_lookbehind)
7934    
7935  if (errorcode != 0)  if (errorcode != 0)
7936    {    {
7937    (pcre_free)(re);    (PUBL(free))(re);
7938    PCRE_EARLY_ERROR_RETURN:    PCRE_EARLY_ERROR_RETURN:
7939    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
7940    PCRE_EARLY_ERROR_RETURN2:    PCRE_EARLY_ERROR_RETURN2:
# Line 7884  if ((re->flags & PCRE_REQCHSET) != 0) Line 8062  if ((re->flags & PCRE_REQCHSET) != 0)
8062      else printf("Req char = \\x%02x%s\n", ch, caseless);      else printf("Req char = \\x%02x%s\n", ch, caseless);
8063    }    }
8064    
8065    #ifdef COMPILE_PCRE8
8066  pcre_printint(re, stdout, TRUE);  pcre_printint(re, stdout, TRUE);
8067    #else
8068    pcre16_printint(re, stdout, TRUE);
8069    #endif
8070    
8071  /* This check is done here in the debugging case so that the code that  /* This check is done here in the debugging case so that the code that
8072  was compiled can be seen. */  was compiled can be seen. */
8073    
8074  if (code - codestart > length)  if (code - codestart > length)
8075    {    {
8076    (pcre_free)(re);    (PUBL(free))(re);
8077    *errorptr = find_error_text(ERR23);    *errorptr = find_error_text(ERR23);
8078    *erroroffset = ptr - (pcre_uchar *)pattern;    *erroroffset = ptr - (pcre_uchar *)pattern;
8079    if (errorcodeptr != NULL) *errorcodeptr = ERR23;    if (errorcodeptr != NULL) *errorcodeptr = ERR23;

Legend:
Removed from v.794  
changed lines
  Added in v.805

  ViewVC Help
Powered by ViewVC 1.1.5