/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 795 by zherczeg, Sat Dec 10 02:20:06 2011 UTC revision 812 by zherczeg, Mon Dec 19 21:23:42 2011 UTC
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is  /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57  also used by pcretest. PCRE_DEBUG is not defined when building a production  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58  library. */  library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
# Line 88  so this number is very generous. Line 92  so this number is very generous.
92  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
93  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
94  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
96    filled up by repetitions of forward references, for example patterns like
97    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98    that the workspace is expanded using malloc() in this situation. The value
99    below is therefore a minimum, and we put a maximum on it for safety. The
100    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101    kicks in at the same number of forward references in all cases. */
102    
103  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
104    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
105    
106  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
107  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
108    
109  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_SAFETY_MARGIN (100)
110    
111  /* Private flags added to firstchar and reqchar. */  /* Private flags added to firstchar and reqchar. */
112    
# Line 474  static const char error_texts[] = Line 485  static const char error_texts[] =
485    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486    /* 70 */    /* 70 */
487    "internal error: unknown opcode in find_fixedlength()\0"    "internal error: unknown opcode in find_fixedlength()\0"
488    "Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff)\0"    "\\N is not supported in a class\0"
489      "too many forward references\0"
490      "disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff)\0"
491    ;    ;
492    
493  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 649  return s; Line 662  return s;
662    
663    
664  /*************************************************  /*************************************************
665    *           Expand the workspace                 *
666    *************************************************/
667    
668    /* This function is called during the second compiling phase, if the number of
669    forward references fills the existing workspace, which is originally a block on
670    the stack. A larger block is obtained from malloc() unless the ultimate limit
671    has been reached or the increase will be rather small.
672    
673    Argument: pointer to the compile data block
674    Returns:  0 if all went well, else an error number
675    */
676    
677    static int
678    expand_workspace(compile_data *cd)
679    {
680    pcre_uchar *newspace;
681    int newsize = cd->workspace_size * 2;
682    
683    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
684    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
685        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
686     return ERR72;
687    
688    newspace = (PUBL(malloc))(newsize);
689    if (newspace == NULL) return ERR21;
690    
691    memcpy(newspace, cd->start_workspace, cd->workspace_size);
692    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
693    if (cd->workspace_size > COMPILE_WORK_SIZE)
694      (PUBL(free))((void *)cd->start_workspace);
695    cd->start_workspace = newspace;
696    cd->workspace_size = newsize;
697    return 0;
698    }
699    
700    
701    
702    /*************************************************
703  *            Check for counted repeat            *  *            Check for counted repeat            *
704  *************************************************/  *************************************************/
705    
# Line 1013  else Line 1064  else
1064    
1065        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1066          {          {
1067          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR71;          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1068          ptr = pt;          ptr = pt;
1069          break;          break;
1070          }          }
# Line 1144  if (c == CHAR_LEFT_CURLY_BRACKET) Line 1195  if (c == CHAR_LEFT_CURLY_BRACKET)
1195      *negptr = TRUE;      *negptr = TRUE;
1196      ptr++;      ptr++;
1197      }      }
1198    for (i = 0; i < (int)sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1199      {      {
1200      c = *(++ptr);      c = *(++ptr);
1201      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 1658  for (;;) Line 1709  for (;;)
1709    int d;    int d;
1710    pcre_uchar *ce, *cs;    pcre_uchar *ce, *cs;
1711    register int op = *cc;    register int op = *cc;
1712    
1713    switch (op)    switch (op)
1714      {      {
1715      /* We only need to continue for OP_CBRA (normal capturing bracket) and      /* We only need to continue for OP_CBRA (normal capturing bracket) and
# Line 1717  for (;;) Line 1769  for (;;)
1769      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1770      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1771      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1772      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1773        break;
1774    
1775      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1776    
# Line 1811  for (;;) Line 1864  for (;;)
1864      cc++;      cc++;
1865      break;      break;
1866    
1867      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1868      otherwise \C is coded as OP_ALLANY. */      otherwise \C is coded as OP_ALLANY. */
1869    
1870      case OP_ANYBYTE:      case OP_ANYBYTE:
# Line 2909  if ((options & PCRE_EXTENDED) != 0) Line 2962  if ((options & PCRE_EXTENDED) != 0)
2962    {    {
2963    for (;;)    for (;;)
2964      {      {
2965      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2966      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2967        {        {
2968        ptr++;        ptr++;
# Line 2951  if ((options & PCRE_EXTENDED) != 0) Line 3004  if ((options & PCRE_EXTENDED) != 0)
3004    {    {
3005    for (;;)    for (;;)
3006      {      {
3007      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3008      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3009        {        {
3010        ptr++;        ptr++;
# Line 3012  if (next >= 0) switch(op_code) Line 3065  if (next >= 0) switch(op_code)
3065      }      }
3066    else    else
3067  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
3068    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */
3069    
3070    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
3071    opcodes are not used for multi-byte characters, because they are coded using    opcodes are not used for multi-byte characters, because they are coded using
# Line 3037  if (next >= 0) switch(op_code) Line 3090  if (next >= 0) switch(op_code)
3090      }      }
3091    else    else
3092  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
3093    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == TABLE_GET(next, cd->fcc, next));  /* Non-UTF-8 mode */
3094    
3095    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3096    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
# Line 3413  for (;; ptr++) Line 3466  for (;; ptr++)
3466    BOOL is_quantifier;    BOOL is_quantifier;
3467    BOOL is_recurse;    BOOL is_recurse;
3468    BOOL reset_bracount;    BOOL reset_bracount;
3469    int class_charcount;    int class_has_8bitchar;
3470    int class_lastchar;    int class_single_char;
3471    int newoptions;    int newoptions;
3472    int recno;    int recno;
3473    int refsign;    int refsign;
# Line 3448  for (;; ptr++) Line 3501  for (;; ptr++)
3501  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3502      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3503  #endif  #endif
3504      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3505            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3506        {        {
3507        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3508        goto FAILED;        goto FAILED;
# Line 3473  for (;; ptr++) Line 3527  for (;; ptr++)
3527      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3528      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,      DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3529        (int)(code - last_code), c, c));        (int)(code - last_code), c, c));
3530    
3531      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3532      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
3533      if "previous" is NULL, reset the current code pointer to the start. */      if "previous" is NULL, reset the current code pointer to the start. */
# Line 3498  for (;; ptr++) Line 3552  for (;; ptr++)
3552    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3553    reference list. */    reference list. */
3554    
3555    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3556               WORK_SIZE_SAFETY_MARGIN)
3557      {      {
3558      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3559      goto FAILED;      goto FAILED;
# Line 3550  for (;; ptr++) Line 3605  for (;; ptr++)
3605    
3606    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3607      {      {
3608      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3609      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3610        {        {
3611        ptr++;        ptr++;
# Line 3710  for (;; ptr++) Line 3765  for (;; ptr++)
3765    
3766      should_flip_negation = FALSE;      should_flip_negation = FALSE;
3767    
3768      /* Keep a count of chars with values < 256 so that we can optimize the case      /* For optimization purposes, we track some properties of the class.
3769      of just a single character (as long as it's < 256). However, For higher      class_has_8bitchar will be non-zero, if the class contains at least one
3770      valued UTF-8 characters, we don't yet do any optimization. */      < 256 character. class_single_char will be 1 if the class contains only
3771        a single character. */
3772    
3773      class_charcount = 0;      class_has_8bitchar = 0;
3774      class_lastchar = -1;      class_single_char = 0;
3775    
3776      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3777      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
# Line 3870  for (;; ptr++) Line 3926  for (;; ptr++)
3926            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3927    
3928          ptr = tempptr + 1;          ptr = tempptr + 1;
3929          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          /* Every class contains at least one < 256 character. */
3930            class_has_8bitchar = 1;
3931            /* Every class contains at least two characters. */
3932            class_single_char = 2;
3933          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
3934          }          }
3935    
3936        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3937        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3938        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
3939        assume that other escapes have more than one character in them, so set        assume that other escapes have more than one character in them, so
3940        class_charcount bigger than one. Unrecognized escapes fall through and        speculatively set both class_has_8bitchar and class_single_char bigger
3941        are either treated as literal characters (by default), or are faulted if        than one. Unrecognized escapes fall through and are either treated
3942          as literal characters (by default), or are faulted if
3943        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
3944    
3945        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
# Line 3888  for (;; ptr++) Line 3948  for (;; ptr++)
3948          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3949    
3950          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3951            else if (-c == ESC_N)            /* \N is not supported in a class */
3952              {
3953              *errorcodeptr = ERR71;
3954              goto FAILED;
3955              }
3956          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3957            {            {
3958            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 3902  for (;; ptr++) Line 3967  for (;; ptr++)
3967          if (c < 0)          if (c < 0)
3968            {            {
3969            register const pcre_uint8 *cbits = cd->cbits;            register const pcre_uint8 *cbits = cd->cbits;
3970            class_charcount += 2;     /* Greater than 1 is what matters */            /* Every class contains at least two < 256 characters. */
3971              class_has_8bitchar++;
3972              /* Every class contains at least two characters. */
3973              class_single_char += 2;
3974    
3975            switch (-c)            switch (-c)
3976              {              {
# Line 3915  for (;; ptr++) Line 3983  for (;; ptr++)
3983              case ESC_SU:              case ESC_SU:
3984              nestptr = ptr;              nestptr = ptr;
3985              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3986              class_charcount -= 2;                /* Undo! */              class_has_8bitchar--;                /* Undo! */
3987              continue;              continue;
3988  #endif  #endif
3989              case ESC_d:              case ESC_d:
# Line 4081  for (;; ptr++) Line 4149  for (;; ptr++)
4149                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4150                *class_uchardata++ = ptype;                *class_uchardata++ = ptype;
4151                *class_uchardata++ = pdata;                *class_uchardata++ = pdata;
4152                class_charcount -= 2;   /* Not a < 256 character */                class_has_8bitchar--;                /* Undo! */
4153                continue;                continue;
4154                }                }
4155  #endif  #endif
# Line 4095  for (;; ptr++) Line 4163  for (;; ptr++)
4163                *errorcodeptr = ERR7;                *errorcodeptr = ERR7;
4164                goto FAILED;                goto FAILED;
4165                }                }
4166              class_charcount -= 2;  /* Undo the default count from above */              class_has_8bitchar--;    /* Undo the speculative increase. */
4167              c = *ptr;              /* Get the final character and fall through */              class_single_char -= 2;  /* Undo the speculative increase. */
4168                c = *ptr;                /* Get the final character and fall through */
4169              break;              break;
4170              }              }
4171            }            }
4172    
4173          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
4174          greater than 256 mode. */          greater than 256. */
4175    
4176          }   /* End of backslash handling */          }   /* End of backslash handling */
4177    
# Line 4195  for (;; ptr++) Line 4264  for (;; ptr++)
4264    
4265          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4266    
4267            /* Since we found a character range, single character optimizations
4268            cannot be done anymore. */
4269            class_single_char = 2;
4270    
4271          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4272          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4273          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 4323  for (;; ptr++) Line 4396  for (;; ptr++)
4396          /* We use the bit map for 8 bit mode, or when the characters fall          /* We use the bit map for 8 bit mode, or when the characters fall
4397          partially or entirely to [0-255] ([0-127] for UCP) ranges. */          partially or entirely to [0-255] ([0-127] for UCP) ranges. */
4398    
4399          class_charcount += d - c + 1;          class_has_8bitchar = 1;
         class_lastchar = d;  
4400    
4401          /* We can save a bit of time by skipping this in the pre-compile. */          /* We can save a bit of time by skipping this in the pre-compile. */
4402    
# Line 4333  for (;; ptr++) Line 4405  for (;; ptr++)
4405            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4406            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
4407              {              {
4408              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c]; /* flip case */
4409              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
4410              }              }
4411            }            }
# Line 4347  for (;; ptr++) Line 4419  for (;; ptr++)
4419    
4420        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4421    
4422        /* Handle a character that cannot go in the bit map */        /* Only the value of 1 matters for class_single_char. */
4423          if (class_single_char < 2) class_single_char++;
4424    
4425          /* If class_single_char is 1, we saw precisely one character. As long as
4426          there were no negated characters >= 128 and there was no use of \p or \P,
4427          in other words, no use of any XCLASS features, we can optimize.
4428    
4429          In UTF-8 mode, we can optimize the negative case only if there were no
4430          characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4431          operate on single-byte characters only. This is an historical hangover.
4432          Maybe one day we can tidy these opcodes to handle multi-byte characters.
4433    
4434          The optimization throws away the bit map. We turn the item into a
4435          1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4436          Note that OP_NOT[I] does not support multibyte characters. In the positive
4437          case, it can cause firstchar to be set. Otherwise, there can be no first
4438          char if this item is first, whatever repeat count may follow. In the case
4439          of reqchar, save the previous value for reinstating. */
4440    
4441    #ifdef SUPPORT_UTF
4442          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
4443            && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4444    #else
4445          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4446    #endif
4447            {
4448            ptr++;
4449            zeroreqchar = reqchar;
4450    
4451            /* The OP_NOT[I] opcodes work on single characters only. */
4452    
4453            if (negate_class)
4454              {
4455              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4456              zerofirstchar = firstchar;
4457              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4458              *code++ = c;
4459              goto NOT_CHAR;
4460              }
4461    
4462            /* For a single, positive character, get the value into mcbuffer, and
4463            then we can handle this with the normal one-character code. */
4464    
4465    #ifdef SUPPORT_UTF
4466            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4467              mclength = PRIV(ord2utf)(c, mcbuffer);
4468            else
4469    #endif
4470              {
4471              mcbuffer[0] = c;
4472              mclength = 1;
4473              }
4474            goto ONE_CHAR;
4475            }       /* End of 1-char optimization */
4476    
4477          /* Handle a character that cannot go in the bit map. */
4478    
4479  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4480        if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))        if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
# Line 4356  for (;; ptr++) Line 4483  for (;; ptr++)
4483  #elif !(defined COMPILE_PCRE8)  #elif !(defined COMPILE_PCRE8)
4484        if (c > 255)        if (c > 255)
4485  #endif  #endif
4486    
4487  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)  #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4488          {          {
4489          xclass = TRUE;          xclass = TRUE;
4490          *class_uchardata++ = XCL_SINGLE;          *class_uchardata++ = XCL_SINGLE;
4491  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4492  #ifndef COMPILE_PCRE8  #ifndef COMPILE_PCRE8
4493          /* In non 8 bit mode, we can get here even          /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4494          if we are not in UTF mode. */          if (!utf)
         if (!utf)  
4495            *class_uchardata++ = c;            *class_uchardata++ = c;
4496          else          else
4497  #endif  #endif
# Line 4377  for (;; ptr++) Line 4504  for (;; ptr++)
4504  #ifdef COMPILE_PCRE8  #ifdef COMPILE_PCRE8
4505          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4506  #else  #else
4507          /* In non 8 bit mode, we can get here even          /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
         if we are not in UTF mode. */  
4508          if (utf && (options & PCRE_CASELESS) != 0)          if (utf && (options & PCRE_CASELESS) != 0)
4509  #endif  #endif
4510            {            {
# Line 4394  for (;; ptr++) Line 4520  for (;; ptr++)
4520          }          }
4521        else        else
4522  #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */  #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
4523    
4524        /* Handle a single-byte character */        /* Handle a single-byte character */
4525          {          {
4526            class_has_8bitchar = 1;
4527          classbits[c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
4528          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4529            {            {
4530            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c]; /* flip case */
4531            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
4532            }            }
         class_charcount++;  
         class_lastchar = c;  
4533          }          }
   
4534        }        }
4535    
4536      /* Loop until ']' reached. This "while" is the end of the "do" far above.      /* Loop until ']' reached. This "while" is the end of the "do" far above.
# Line 4425  for (;; ptr++) Line 4550  for (;; ptr++)
4550        goto FAILED;        goto FAILED;
4551        }        }
4552    
4553      /* If class_charcount is 1, we saw precisely one character whose value is      /* If this is the first thing in the branch, there can be no first char
4554      less than 256. As long as there were no characters >= 128 and there was no      setting, whatever the repeat count. Any reqchar setting must remain
4555      use of \p or \P, in other words, no use of any XCLASS features, we can      unchanged after any kind of repeat. */
     optimize.  
   
     In UTF-8 mode, we can optimize the negative case only if there were no  
     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR  
     operate on single-bytes characters only. This is an historical hangover.  
     Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
     Note that OP_NOT[I] does not support multibyte characters. In the positive  
     case, it can cause firstchar to be set. Otherwise, there can be no first  
     char if this item is first, whatever repeat count may follow. In the case  
     of reqchar, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF  
     if (class_charcount == 1 && !xclass &&  
       (!utf || !negate_class || class_lastchar < 128))  
 #else  
     if (class_charcount == 1)  
 #endif  
       {  
       zeroreqchar = reqchar;  
   
       /* The OP_NOT[I] opcodes work on one-byte characters only. */  
   
       if (negate_class)  
         {  
         if (firstchar == REQ_UNSET) firstchar = REQ_NONE;  
         zerofirstchar = firstchar;  
         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF  
       if (utf && class_lastchar > 127)  
         mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqchar setting must remain unchanged after any kind of  
     repeat. */  
4556    
4557      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4558      zerofirstchar = firstchar;      zerofirstchar = firstchar;
# Line 4496  for (;; ptr++) Line 4568  for (;; ptr++)
4568    
4569  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4570      if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))      if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4571  #endif  #elif !defined COMPILE_PCRE8
 #ifndef COMPILE_PCRE8  
4572      if (xclass && !should_flip_negation)      if (xclass && !should_flip_negation)
4573  #endif  #endif
4574  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
# Line 4510  for (;; ptr++) Line 4581  for (;; ptr++)
4581        /* If the map is required, move up the extra data to make room for it;        /* If the map is required, move up the extra data to make room for it;
4582        otherwise just move the code pointer to the end of the extra data. */        otherwise just move the code pointer to the end of the extra data. */
4583    
4584        if (class_charcount > 0)        if (class_has_8bitchar > 0)
4585          {          {
4586          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
4587          memmove(code + (32 / sizeof(pcre_uchar)), code,          memmove(code + (32 / sizeof(pcre_uchar)), code,
# Line 4522  for (;; ptr++) Line 4593  for (;; ptr++)
4593    
4594        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4595    
4596        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4597        break;   /* End of class handling */        break;   /* End of class handling */
4598        }        }
4599  #endif  #endif
# Line 4541  for (;; ptr++) Line 4612  for (;; ptr++)
4612        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4613        }        }
4614      code += 32 / sizeof(pcre_uchar);      code += 32 / sizeof(pcre_uchar);
4615        NOT_CHAR:
4616      break;      break;
4617    
4618    
# Line 4617  for (;; ptr++) Line 4689  for (;; ptr++)
4689      past, but it no longer happens for non-repeated recursions. In fact, the      past, but it no longer happens for non-repeated recursions. In fact, the
4690      repeated ones could be re-implemented independently so as not to need this,      repeated ones could be re-implemented independently so as not to need this,
4691      but for the moment we rely on the code for repeating groups. */      but for the moment we rely on the code for repeating groups. */
4692    
4693      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4694        {        {
4695        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));        memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
# Line 4661  for (;; ptr++) Line 4733  for (;; ptr++)
4733          {          {
4734          pcre_uchar *lastchar = code - 1;          pcre_uchar *lastchar = code - 1;
4735          BACKCHAR(lastchar);          BACKCHAR(lastchar);
4736          c = code - lastchar;            /* Length of UTF-8 character */          c = (int)(code - lastchar);     /* Length of UTF-8 character */
4737          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */          memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4738          c |= UTF_LENGTH;                /* Flag c as a length */          c |= UTF_LENGTH;                /* Flag c as a length */
4739          }          }
# Line 5067  for (;; ptr++) Line 5139  for (;; ptr++)
5139              *lengthptr += delta;              *lengthptr += delta;
5140              }              }
5141    
5142            /* This is compiling for real */            /* This is compiling for real. If there is a set first byte for
5143              the group, and we have not yet set a "required byte", set it. Make
5144              sure there is enough workspace for copying forward references before
5145              doing the copy. */
5146    
5147            else            else
5148              {              {
5149              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5150    
5151              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5152                {                {
5153                pcre_uchar *hc;                pcre_uchar *hc;
5154                pcre_uchar *this_hwm = cd->hwm;                pcre_uchar *this_hwm = cd->hwm;
5155                memcpy(code, previous, IN_UCHARS(len));                memcpy(code, previous, IN_UCHARS(len));
5156    
5157                  while (cd->hwm > cd->start_workspace + cd->workspace_size -
5158                         WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5159                    {
5160                    int save_offset = save_hwm - cd->start_workspace;
5161                    int this_offset = this_hwm - cd->start_workspace;
5162                    *errorcodeptr = expand_workspace(cd);
5163                    if (*errorcodeptr != 0) goto FAILED;
5164                    save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5165                    this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5166                    }
5167    
5168                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5169                  {                  {
5170                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 5144  for (;; ptr++) Line 5232  for (;; ptr++)
5232              }              }
5233    
5234            memcpy(code, previous, IN_UCHARS(len));            memcpy(code, previous, IN_UCHARS(len));
5235    
5236              /* Ensure there is enough workspace for forward references before
5237              copying them. */
5238    
5239              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5240                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5241                {
5242                int save_offset = save_hwm - cd->start_workspace;
5243                int this_offset = this_hwm - cd->start_workspace;
5244                *errorcodeptr = expand_workspace(cd);
5245                if (*errorcodeptr != 0) goto FAILED;
5246                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5247                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5248                }
5249    
5250            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5251              {              {
5252              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 5174  for (;; ptr++) Line 5277  for (;; ptr++)
5277        ONCE brackets can be converted into non-capturing brackets, as the        ONCE brackets can be converted into non-capturing brackets, as the
5278        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5279        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
5280    
5281        Otherwise, when we are doing the actual compile phase, check to see        Otherwise, when we are doing the actual compile phase, check to see
5282        whether this group is one that could match an empty string. If so,        whether this group is one that could match an empty string. If so,
5283        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5284        that runtime checking can be done. [This check is also applied to ONCE        that runtime checking can be done. [This check is also applied to ONCE
5285        groups at runtime, but in a different way.]        groups at runtime, but in a different way.]
5286    
5287        Then, if the quantifier was possessive and the bracket is not a        Then, if the quantifier was possessive and the bracket is not a
5288        conditional, we convert the BRA code to the POS form, and the KET code to        conditional, we convert the BRA code to the POS form, and the KET code to
5289        KETRPOS. (It turns out to be convenient at runtime to detect this kind of        KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5290        subpattern at both the start and at the end.) The use of special opcodes        subpattern at both the start and at the end.) The use of special opcodes
5291        makes it possible to reduce greatly the stack usage in pcre_exec(). If        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5292        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5293    
5294        Then, if the minimum number of matches is 1 or 0, cancel the possessive        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5295        flag so that the default action below, of wrapping everything inside        flag so that the default action below, of wrapping everything inside
5296        atomic brackets, does not happen. When the minimum is greater than 1,        atomic brackets, does not happen. When the minimum is greater than 1,
5297        there will be earlier copies of the group, and so we still have to wrap        there will be earlier copies of the group, and so we still have to wrap
5298        the whole thing. */        the whole thing. */
5299    
5300        else        else
# Line 5200  for (;; ptr++) Line 5303  for (;; ptr++)
5303          pcre_uchar *bracode = ketcode - GET(ketcode, 1);          pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5304    
5305          /* Convert possessive ONCE brackets to non-capturing */          /* Convert possessive ONCE brackets to non-capturing */
5306    
5307          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5308              possessive_quantifier) *bracode = OP_BRA;              possessive_quantifier) *bracode = OP_BRA;
5309    
5310          /* For non-possessive ONCE brackets, all we need to do is to          /* For non-possessive ONCE brackets, all we need to do is to
5311          set the KET. */          set the KET. */
5312    
5313          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5314            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5315    
5316          /* Handle non-ONCE brackets and possessive ONCEs (which have been          /* Handle non-ONCE brackets and possessive ONCEs (which have been
5317          converted to non-capturing above). */          converted to non-capturing above). */
5318    
5319          else          else
5320            {            {
5321            /* In the compile phase, check for empty string matching. */            /* In the compile phase, check for empty string matching. */
5322    
5323            if (lengthptr == NULL)            if (lengthptr == NULL)
5324              {              {
5325              pcre_uchar *scode = bracode;              pcre_uchar *scode = bracode;
# Line 5231  for (;; ptr++) Line 5334  for (;; ptr++)
5334                }                }
5335              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5336              }              }
5337    
5338            /* Handle possessive quantifiers. */            /* Handle possessive quantifiers. */
5339    
5340            if (possessive_quantifier)            if (possessive_quantifier)
# Line 5240  for (;; ptr++) Line 5343  for (;; ptr++)
5343              repeated non-capturing bracket, because we have not invented POS              repeated non-capturing bracket, because we have not invented POS
5344              versions of the COND opcodes. Because we are moving code along, we              versions of the COND opcodes. Because we are moving code along, we
5345              must ensure that any pending recursive references are updated. */              must ensure that any pending recursive references are updated. */
5346    
5347              if (*bracode == OP_COND || *bracode == OP_SCOND)              if (*bracode == OP_COND || *bracode == OP_SCOND)
5348                {                {
5349                int nlen = (int)(code - bracode);                int nlen = (int)(code - bracode);
# Line 5253  for (;; ptr++) Line 5356  for (;; ptr++)
5356                *code++ = OP_KETRPOS;                *code++ = OP_KETRPOS;
5357                PUTINC(code, 0, nlen);                PUTINC(code, 0, nlen);
5358                PUT(bracode, 1, nlen);                PUT(bracode, 1, nlen);
5359                }                }
5360    
5361              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5362    
5363              else              else
5364                {                {
5365                *bracode += 1;              /* Switch to xxxPOS opcodes */                *bracode += 1;              /* Switch to xxxPOS opcodes */
5366                *ketcode = OP_KETRPOS;                *ketcode = OP_KETRPOS;
5367                }                }
5368    
5369              /* If the minimum is zero, mark it as possessive, then unset the              /* If the minimum is zero, mark it as possessive, then unset the
5370              possessive flag when the minimum is 0 or 1. */              possessive flag when the minimum is 0 or 1. */
5371    
5372              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5373              if (repeat_min < 2) possessive_quantifier = FALSE;              if (repeat_min < 2) possessive_quantifier = FALSE;
5374              }              }
5375    
5376            /* Non-possessive quantifier */            /* Non-possessive quantifier */
5377    
5378            else *ketcode = OP_KETRMAX + repeat_type;            else *ketcode = OP_KETRMAX + repeat_type;
5379            }            }
5380          }          }
# Line 5397  for (;; ptr++) Line 5500  for (;; ptr++)
5500    
5501      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
5502    
5503      if (*(++ptr) == CHAR_ASTERISK &&      ptr++;
5504           ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5505             || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5506        {        {
5507        int i, namelen;        int i, namelen;
5508        int arglen = 0;        int arglen = 0;
# Line 5406  for (;; ptr++) Line 5510  for (;; ptr++)
5510        const pcre_uchar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5511        const pcre_uchar *arg = NULL;        const pcre_uchar *arg = NULL;
5512        previous = NULL;        previous = NULL;
5513        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        ptr++;
5514          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5515        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5516    
5517        /* It appears that Perl allows any characters whatsoever, other than        /* It appears that Perl allows any characters whatsoever, other than
# Line 5592  for (;; ptr++) Line 5697  for (;; ptr++)
5697    
5699          /* We now expect to read a name; anything else is an error */          /* We now expect to read a name; anything else is an error */
5699    
5700          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5701            {            {
5702            ptr += 1;  /* To get the right offset */            ptr += 1;  /* To get the right offset */
5703            *errorcodeptr = ERR28;            *errorcodeptr = ERR28;
# Line 5603  for (;; ptr++) Line 5708  for (;; ptr++)
5708    
5709          recno = 0;          recno = 0;
5710          name = ++ptr;          name = ++ptr;
5711          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5712            {            {
5713            if (recno >= 0)            if (recno >= 0)
5714              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
# Line 5774  for (;; ptr++) Line 5879  for (;; ptr++)
5879            break;            break;
5880    
5881            default:                /* Could be name define, else bad */            default:                /* Could be name define, else bad */
5882            if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;            if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5883                goto DEFINE_NAME;
5884            ptr++;                  /* Correct offset for error */            ptr++;                  /* Correct offset for error */
5885            *errorcodeptr = ERR24;            *errorcodeptr = ERR24;
5886            goto FAILED;            goto FAILED;
# Line 5843  for (;; ptr++) Line 5949  for (;; ptr++)
5949              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5950            name = ++ptr;            name = ++ptr;
5951    
5952            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5953            namelen = (int)(ptr - name);            namelen = (int)(ptr - name);
5954    
5955            /* In the pre-compile phase, just do a syntax check. */            /* In the pre-compile phase, just do a syntax check. */
# Line 5973  for (;; ptr++) Line 6079  for (;; ptr++)
6079    
6080          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
6081          name = ++ptr;          name = ++ptr;
6082          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6083          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6084    
6085          /* In the pre-compile phase, do a syntax check. We used to just set          /* In the pre-compile phase, do a syntax check. We used to just set
# Line 6164  for (;; ptr++) Line 6270  for (;; ptr++)
6270                of the group. Then remember the forward reference. */                of the group. Then remember the forward reference. */
6271    
6272                called = cd->start_code + recno;                called = cd->start_code + recno;
6273                  if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6274                      WORK_SIZE_SAFETY_MARGIN)
6275                    {
6276                    *errorcodeptr = expand_workspace(cd);
6277                    if (*errorcodeptr != 0) goto FAILED;
6278                    }
6279                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6280                }                }
6281    
# Line 6184  for (;; ptr++) Line 6296  for (;; ptr++)
6296                }                }
6297              }              }
6298    
6299            /* Insert the recursion/subroutine item. */            /* Insert the recursion/subroutine item. It does not have a set first
6300              character (relevant if it is repeated, because it will then be
6301              wrapped with ONCE brackets). */
6302    
6303            *code = OP_RECURSE;            *code = OP_RECURSE;
6304            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
6305            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
6306              groupsetfirstchar = FALSE;
6307            }            }
6308    
6309          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 6547  for (;; ptr++) Line 6662  for (;; ptr++)
6662    
6663          if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)          if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
6664            {            {
6665            BOOL isnumber = TRUE;            BOOL is_a_number = TRUE;
6666            for (p = ptr + 1; *p != 0 && *p != terminator; p++)            for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6667              {              {
6668              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;              if (!MAX_255(*p)) { is_a_number = FALSE; break; }
6669                if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;
6670              if ((cd->ctypes[*p] & ctype_word) == 0) break;              if ((cd->ctypes[*p] & ctype_word) == 0) break;
6671              }              }
6672            if (*p != terminator)            if (*p != terminator)
# Line 6558  for (;; ptr++) Line 6674  for (;; ptr++)
6674              *errorcodeptr = ERR57;              *errorcodeptr = ERR57;
6675              break;              break;
6676              }              }
6677            if (isnumber)            if (is_a_number)
6678              {              {
6679              ptr++;              ptr++;
6680              goto HANDLE_NUMERICAL_RECURSION;              goto HANDLE_NUMERICAL_RECURSION;
# Line 6672  for (;; ptr++) Line 6788  for (;; ptr++)
6788  #endif  #endif
6789          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6790          so that it works in DFA mode and in lookbehinds. */          so that it works in DFA mode and in lookbehinds. */
6791    
6792            {            {
6793            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6794            *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;            *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
6795            }            }
# Line 6686  for (;; ptr++) Line 6802  for (;; ptr++)
6802      handle it as a data character. */      handle it as a data character. */
6803    
6804  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6805      if (utf && c > 127)      if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
6806        mclength = PRIV(ord2utf)(c, mcbuffer);        mclength = PRIV(ord2utf)(c, mcbuffer);
6807      else      else
6808  #endif  #endif
# Line 7426  compile_data *cd = &compile_block; Line 7542  compile_data *cd = &compile_block;
7542  computing the amount of memory that is needed. Compiled items are thrown away  computing the amount of memory that is needed. Compiled items are thrown away
7543  as soon as possible, so that a fairly large buffer should be sufficient for  as soon as possible, so that a fairly large buffer should be sufficient for
7544  this purpose. The same space is used in the second phase for remembering where  this purpose. The same space is used in the second phase for remembering where
7545  to fill in forward references to subpatterns. */  to fill in forward references to subpatterns. That may overflow, in which case
7546    new memory is obtained from malloc(). */
7547    
7548  pcre_uchar cworkspace[COMPILE_WORK_SIZE];  pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7549    
# Line 7626  cd->bracount = cd->final_bracount = 0; Line 7743  cd->bracount = cd->final_bracount = 0;
7743  cd->names_found = 0;  cd->names_found = 0;
7744  cd->name_entry_size = 0;  cd->name_entry_size = 0;
7745  cd->name_table = NULL;  cd->name_table = NULL;
 cd->start_workspace = cworkspace;  
7746  cd->start_code = cworkspace;  cd->start_code = cworkspace;
7747  cd->hwm = cworkspace;  cd->hwm = cworkspace;
7748    cd->start_workspace = cworkspace;
7749    cd->workspace_size = COMPILE_WORK_SIZE;
7750  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
7751  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7752  cd->req_varyopt = 0;  cd->req_varyopt = 0;
# Line 7664  because nowadays we limit the maximum va Line 7782  because nowadays we limit the maximum va
7782  cd->name_entry_size. */  cd->name_entry_size. */
7783    
7784  size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);  size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
7785  re = (real_pcre *)(pcre_malloc)(size);  re = (real_pcre *)(PUBL(malloc))(size);
7786    
7787  if (re == NULL)  if (re == NULL)
7788    {    {
# Line 7706  cd->names_found = 0; Line 7824  cd->names_found = 0;
7824  cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7825  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
7826  cd->start_code = codestart;  cd->start_code = codestart;
7827  cd->hwm = cworkspace;  cd->hwm = (pcre_uchar *)(cd->start_workspace);
7828  cd->req_varyopt = 0;  cd->req_varyopt = 0;
7829  cd->had_accept = FALSE;  cd->had_accept = FALSE;
7830  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
# Line 7740  if debugging, leave the test till after Line 7858  if debugging, leave the test till after
7858  if (code - codestart > length) errorcode = ERR23;  if (code - codestart > length) errorcode = ERR23;
7859  #endif  #endif
7860    
7861  /* Fill in any forward references that are required. */  /* Fill in any forward references that are required. There may be repeated
7862    references; optimize for them, as searching a large regex takes time. */
7863    
7864  while (errorcode == 0 && cd->hwm > cworkspace)  if (cd->hwm > cd->start_workspace)
7865    {    {
7866    int offset, recno;    int prev_recno = -1;
7867    const pcre_uchar *groupptr;    const pcre_uchar *groupptr = NULL;
7868    cd->hwm -= LINK_SIZE;    while (errorcode == 0 && cd->hwm > cd->start_workspace)
7869    offset = GET(cd->hwm, 0);      {
7870    recno = GET(codestart, offset);      int offset, recno;
7871    groupptr = PRIV(find_bracket)(codestart, utf, recno);      cd->hwm -= LINK_SIZE;
7872    if (groupptr == NULL) errorcode = ERR53;      offset = GET(cd->hwm, 0);
7873      else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));      recno = GET(codestart, offset);
7874        if (recno != prev_recno)
7875          {
7876          groupptr = PRIV(find_bracket)(codestart, utf, recno);
7877          prev_recno = recno;
7878          }
7879        if (groupptr == NULL) errorcode = ERR53;
7880          else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
7881        }
7882    }    }
7883    
7884    /* If the workspace had to be expanded, free the new memory. */
7885    
7886    if (cd->workspace_size > COMPILE_WORK_SIZE)
7887      (PUBL(free))((void *)cd->start_workspace);
7888    
7889  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
7890  subpattern. */  subpattern. */
7891    
# Line 7806  if (cd->check_lookbehind) Line 7938  if (cd->check_lookbehind)
7938    
7939  if (errorcode != 0)  if (errorcode != 0)
7940    {    {
7941    (pcre_free)(re);    (PUBL(free))(re);
7942    PCRE_EARLY_ERROR_RETURN:    PCRE_EARLY_ERROR_RETURN:
7943    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
7944    PCRE_EARLY_ERROR_RETURN2:    PCRE_EARLY_ERROR_RETURN2:
# Line 7934  if ((re->flags & PCRE_REQCHSET) != 0) Line 8066  if ((re->flags & PCRE_REQCHSET) != 0)
8066      else printf("Req char = \\x%02x%s\n", ch, caseless);      else printf("Req char = \\x%02x%s\n", ch, caseless);
8067    }    }
8068    
8069    #ifdef COMPILE_PCRE8
8070  pcre_printint(re, stdout, TRUE);  pcre_printint(re, stdout, TRUE);
8071    #else
8072    pcre16_printint(re, stdout, TRUE);
8073    #endif
8074    
8075  /* This check is done here in the debugging case so that the code that  /* This check is done here in the debugging case so that the code that
8076  was compiled can be seen. */  was compiled can be seen. */
8077    
8078  if (code - codestart > length)  if (code - codestart > length)
8079    {    {
8080    (pcre_free)(re);    (PUBL(free))(re);
8081    *errorptr = find_error_text(ERR23);    *errorptr = find_error_text(ERR23);
8082    *erroroffset = ptr - (pcre_uchar *)pattern;    *erroroffset = ptr - (pcre_uchar *)pattern;
8083    if (errorcodeptr != NULL) *errorcodeptr = ERR23;    if (errorcodeptr != NULL) *errorcodeptr = ERR23;

Legend:
Removed from v.795  
changed lines
  Added in v.812

  ViewVC Help
Powered by ViewVC 1.1.5