/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 779 by ph10, Fri Dec 2 10:39:32 2011 UTC revision 788 by ph10, Tue Dec 6 15:38:01 2011 UTC
# Line 88  so this number is very generous. Line 88  so this number is very generous.
88  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
89  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
90  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91  is 4 there is plenty of room for most patterns. However, the memory can get  is 4 there is plenty of room for most patterns. However, the memory can get
92  filled up by repetitions of forward references, for example patterns like  filled up by repetitions of forward references, for example patterns like
93  /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so  /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
94  that the workspace is expanded using malloc() in this situation. The value  that the workspace is expanded using malloc() in this situation. The value
95  below is therefore a minimum, and we put a maximum on it for safety. The  below is therefore a minimum, and we put a maximum on it for safety. The
96  minimum is now also defined in terms of LINK_SIZE so that the use of malloc()  minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
97  kicks in at the same number of forward references in all cases. */  kicks in at the same number of forward references in all cases. */
98    
99  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
# Line 419  static const char error_texts[] = Line 419  static const char error_texts[] =
419    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
420    /* 70 */    /* 70 */
421    "internal error: unknown opcode in find_fixedlength()\0"    "internal error: unknown opcode in find_fixedlength()\0"
422    "\\N is not supported in a class\0"    "\\N is not supported in a class\0"
423    "too many forward references\0"    "too many forward references\0"
424    ;    ;
425    
426  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 592  return s; Line 592  return s;
592  *           Expand the workspace                 *  *           Expand the workspace                 *
593  *************************************************/  *************************************************/
594    
595  /* This function is called during the second compiling phase, if the number of  /* This function is called during the second compiling phase, if the number of
596  forward references fills the existing workspace, which is originally a block on  forward references fills the existing workspace, which is originally a block on
597  the stack. A larger block is obtained from malloc() unless the ultimate limit  the stack. A larger block is obtained from malloc() unless the ultimate limit
598  has been reached or the increase will be rather small.  has been reached or the increase will be rather small.
599    
600  Argument: pointer to the compile data block  Argument: pointer to the compile data block
# Line 617  if (newspace == NULL) return ERR21; Line 617  if (newspace == NULL) return ERR21;
617    
618  memcpy(newspace, cd->start_workspace, cd->workspace_size);  memcpy(newspace, cd->start_workspace, cd->workspace_size);
619  cd->hwm = (uschar *)newspace + (cd->hwm - cd->start_workspace);  cd->hwm = (uschar *)newspace + (cd->hwm - cd->start_workspace);
620  if (cd->workspace_size > COMPILE_WORK_SIZE)  if (cd->workspace_size > COMPILE_WORK_SIZE)
621    (pcre_free)((void *)cd->start_workspace);    (pcre_free)((void *)cd->start_workspace);
622  cd->start_workspace = newspace;  cd->start_workspace = newspace;
623  cd->workspace_size = newsize;  cd->workspace_size = newsize;
# Line 1749  for (;;) Line 1749  for (;;)
1749      cc++;      cc++;
1750      break;      break;
1751    
1752      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1753      otherwise \C is coded as OP_ALLANY. */      otherwise \C is coded as OP_ALLANY. */
1754    
1755      case OP_ANYBYTE:      case OP_ANYBYTE:
# Line 3377  for (;; ptr++) Line 3377  for (;; ptr++)
3377  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
3378      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3379  #endif  #endif
3380      if (code > cd->start_workspace + cd->workspace_size -      if (code > cd->start_workspace + cd->workspace_size -
3381          WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */          WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3382        {        {
3383        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
# Line 3428  for (;; ptr++) Line 3428  for (;; ptr++)
3428    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3429    reference list. */    reference list. */
3430    
3431    else if (cd->hwm > cd->start_workspace + cd->workspace_size -    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3432             WORK_SIZE_SAFETY_MARGIN)             WORK_SIZE_SAFETY_MARGIN)
3433      {      {
3434      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
# Line 3822  for (;; ptr++) Line 3822  for (;; ptr++)
3822          else if (-c == ESC_N)            /* \N is not supported in a class */          else if (-c == ESC_N)            /* \N is not supported in a class */
3823            {            {
3824            *errorcodeptr = ERR71;            *errorcodeptr = ERR71;
3825            goto FAILED;            goto FAILED;
3826            }            }
3827          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3828            {            {
3829            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 4480  for (;; ptr++) Line 4480  for (;; ptr++)
4480      past, but it no longer happens for non-repeated recursions. In fact, the      past, but it no longer happens for non-repeated recursions. In fact, the
4481      repeated ones could be re-implemented independently so as not to need this,      repeated ones could be re-implemented independently so as not to need this,
4482      but for the moment we rely on the code for repeating groups. */      but for the moment we rely on the code for repeating groups. */
4483    
4484      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4485        {        {
4486        memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);        memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
# Line 4932  for (;; ptr++) Line 4932  for (;; ptr++)
4932              }              }
4933    
4934            /* This is compiling for real. If there is a set first byte for            /* This is compiling for real. If there is a set first byte for
4935            the group, and we have not yet set a "required byte", set it. Make            the group, and we have not yet set a "required byte", set it. Make
4936            sure there is enough workspace for copying forward references before            sure there is enough workspace for copying forward references before
4937            doing the copy. */            doing the copy. */
4938    
4939            else            else
# Line 4945  for (;; ptr++) Line 4945  for (;; ptr++)
4945                uschar *hc;                uschar *hc;
4946                uschar *this_hwm = cd->hwm;                uschar *this_hwm = cd->hwm;
4947                memcpy(code, previous, len);                memcpy(code, previous, len);
4948    
4949                while (cd->hwm > cd->start_workspace + cd->workspace_size -                while (cd->hwm > cd->start_workspace + cd->workspace_size -
4950                       WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))                       WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
4951                  {                  {
# Line 4954  for (;; ptr++) Line 4954  for (;; ptr++)
4954                  *errorcodeptr = expand_workspace(cd);                  *errorcodeptr = expand_workspace(cd);
4955                  if (*errorcodeptr != 0) goto FAILED;                  if (*errorcodeptr != 0) goto FAILED;
4956                  save_hwm = (uschar *)cd->start_workspace + save_offset;                  save_hwm = (uschar *)cd->start_workspace + save_offset;
4957                  this_hwm = (uschar *)cd->start_workspace + this_offset;                  this_hwm = (uschar *)cd->start_workspace + this_offset;
4958                  }                  }
4959    
4960                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)                for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4961                  {                  {
4962                  PUT(cd->hwm, 0, GET(hc, 0) + len);                  PUT(cd->hwm, 0, GET(hc, 0) + len);
# Line 4986  for (;; ptr++) Line 4986  for (;; ptr++)
4986          add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some          add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4987          paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is          paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4988          a 64-bit integer type when available, otherwise double. */          a 64-bit integer type when available, otherwise double. */
4989    
4990          if (lengthptr != NULL && repeat_max > 0)          if (lengthptr != NULL && repeat_max > 0)
4991            {            {
4992            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
# Line 5024  for (;; ptr++) Line 5024  for (;; ptr++)
5024              }              }
5025    
5026            memcpy(code, previous, len);            memcpy(code, previous, len);
5027    
5028            /* Ensure there is enough workspace for forward references before            /* Ensure there is enough workspace for forward references before
5029            copying them. */            copying them. */
5030    
5031            while (cd->hwm > cd->start_workspace + cd->workspace_size -            while (cd->hwm > cd->start_workspace + cd->workspace_size -
5032                   WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))                   WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5033              {              {
# Line 5036  for (;; ptr++) Line 5036  for (;; ptr++)
5036              *errorcodeptr = expand_workspace(cd);              *errorcodeptr = expand_workspace(cd);
5037              if (*errorcodeptr != 0) goto FAILED;              if (*errorcodeptr != 0) goto FAILED;
5038              save_hwm = (uschar *)cd->start_workspace + save_offset;              save_hwm = (uschar *)cd->start_workspace + save_offset;
5039              this_hwm = (uschar *)cd->start_workspace + this_offset;              this_hwm = (uschar *)cd->start_workspace + this_offset;
5040              }              }
5041    
5042            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)            for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5043              {              {
5044              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));              PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
# Line 5069  for (;; ptr++) Line 5069  for (;; ptr++)
5069        ONCE brackets can be converted into non-capturing brackets, as the        ONCE brackets can be converted into non-capturing brackets, as the
5070        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5071        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
5072    
5073        Otherwise, when we are doing the actual compile phase, check to see        Otherwise, when we are doing the actual compile phase, check to see
5074        whether this group is one that could match an empty string. If so,        whether this group is one that could match an empty string. If so,
5075        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5076        that runtime checking can be done. [This check is also applied to ONCE        that runtime checking can be done. [This check is also applied to ONCE
5077        groups at runtime, but in a different way.]        groups at runtime, but in a different way.]
5078    
5079        Then, if the quantifier was possessive and the bracket is not a        Then, if the quantifier was possessive and the bracket is not a
5080        conditional, we convert the BRA code to the POS form, and the KET code to        conditional, we convert the BRA code to the POS form, and the KET code to
5081        KETRPOS. (It turns out to be convenient at runtime to detect this kind of        KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5082        subpattern at both the start and at the end.) The use of special opcodes        subpattern at both the start and at the end.) The use of special opcodes
5083        makes it possible to reduce greatly the stack usage in pcre_exec(). If        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5084        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5085    
5086        Then, if the minimum number of matches is 1 or 0, cancel the possessive        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5087        flag so that the default action below, of wrapping everything inside        flag so that the default action below, of wrapping everything inside
5088        atomic brackets, does not happen. When the minimum is greater than 1,        atomic brackets, does not happen. When the minimum is greater than 1,
5089        there will be earlier copies of the group, and so we still have to wrap        there will be earlier copies of the group, and so we still have to wrap
5090        the whole thing. */        the whole thing. */
5091    
5092        else        else
# Line 5095  for (;; ptr++) Line 5095  for (;; ptr++)
5095          uschar *bracode = ketcode - GET(ketcode, 1);          uschar *bracode = ketcode - GET(ketcode, 1);
5096    
5097          /* Convert possessive ONCE brackets to non-capturing */          /* Convert possessive ONCE brackets to non-capturing */
5098    
5099          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5100              possessive_quantifier) *bracode = OP_BRA;              possessive_quantifier) *bracode = OP_BRA;
5101    
5102          /* For non-possessive ONCE brackets, all we need to do is to          /* For non-possessive ONCE brackets, all we need to do is to
5103          set the KET. */          set the KET. */
5104    
5105          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5106            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5107    
5108          /* Handle non-ONCE brackets and possessive ONCEs (which have been          /* Handle non-ONCE brackets and possessive ONCEs (which have been
5109          converted to non-capturing above). */          converted to non-capturing above). */
5110    
5111          else          else
5112            {            {
5113            /* In the compile phase, check for empty string matching. */            /* In the compile phase, check for empty string matching. */
5114    
5115            if (lengthptr == NULL)            if (lengthptr == NULL)
5116              {              {
5117              uschar *scode = bracode;              uschar *scode = bracode;
# Line 5126  for (;; ptr++) Line 5126  for (;; ptr++)
5126                }                }
5127              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5128              }              }
5129    
5130            /* Handle possessive quantifiers. */            /* Handle possessive quantifiers. */
5131    
5132            if (possessive_quantifier)            if (possessive_quantifier)
# Line 5135  for (;; ptr++) Line 5135  for (;; ptr++)
5135              repeated non-capturing bracket, because we have not invented POS              repeated non-capturing bracket, because we have not invented POS
5136              versions of the COND opcodes. Because we are moving code along, we              versions of the COND opcodes. Because we are moving code along, we
5137              must ensure that any pending recursive references are updated. */              must ensure that any pending recursive references are updated. */
5138    
5139              if (*bracode == OP_COND || *bracode == OP_SCOND)              if (*bracode == OP_COND || *bracode == OP_SCOND)
5140                {                {
5141                int nlen = (int)(code - bracode);                int nlen = (int)(code - bracode);
# Line 5148  for (;; ptr++) Line 5148  for (;; ptr++)
5148                *code++ = OP_KETRPOS;                *code++ = OP_KETRPOS;
5149                PUTINC(code, 0, nlen);                PUTINC(code, 0, nlen);
5150                PUT(bracode, 1, nlen);                PUT(bracode, 1, nlen);
5151                }                }
5152    
5153              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5154    
5155              else              else
5156                {                {
5157                *bracode += 1;              /* Switch to xxxPOS opcodes */                *bracode += 1;              /* Switch to xxxPOS opcodes */
5158                *ketcode = OP_KETRPOS;                *ketcode = OP_KETRPOS;
5159                }                }
5160    
5161              /* If the minimum is zero, mark it as possessive, then unset the              /* If the minimum is zero, mark it as possessive, then unset the
5162              possessive flag when the minimum is 0 or 1. */              possessive flag when the minimum is 0 or 1. */
5163    
5164              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5165              if (repeat_min < 2) possessive_quantifier = FALSE;              if (repeat_min < 2) possessive_quantifier = FALSE;
5166              }              }
5167    
5168            /* Non-possessive quantifier */            /* Non-possessive quantifier */
5169    
5170            else *ketcode = OP_KETRMAX + repeat_type;            else *ketcode = OP_KETRMAX + repeat_type;
5171            }            }
5172          }          }
# Line 6056  for (;; ptr++) Line 6056  for (;; ptr++)
6056                /* Fudge the value of "called" so that when it is inserted as an                /* Fudge the value of "called" so that when it is inserted as an
6057                offset below, what is actually inserted is the reference number                offset below, what is actually inserted is the reference number
6058                of the group. Then remember the forward reference. */                of the group. Then remember the forward reference. */
6059    
6060                called = cd->start_code + recno;                called = cd->start_code + recno;
6061                if (cd->hwm >= cd->start_workspace + cd->workspace_size -                if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6062                    WORK_SIZE_SAFETY_MARGIN)                    WORK_SIZE_SAFETY_MARGIN)
6063                  {                  {
6064                  *errorcodeptr = expand_workspace(cd);                  *errorcodeptr = expand_workspace(cd);
6065                  if (*errorcodeptr != 0) goto FAILED;                  if (*errorcodeptr != 0) goto FAILED;
6066                  }                  }
6067                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6068                }                }
6069    
# Line 6085  for (;; ptr++) Line 6085  for (;; ptr++)
6085              }              }
6086    
6087            /* Insert the recursion/subroutine item. It does not have a set first            /* Insert the recursion/subroutine item. It does not have a set first
6088            byte (relevant if it is repeated, because it will then be wrapped            byte (relevant if it is repeated, because it will then be wrapped
6089            with ONCE brackets). */            with ONCE brackets). */
6090    
6091            *code = OP_RECURSE;            *code = OP_RECURSE;
6092            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
6093            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
6094            groupsetfirstbyte = FALSE;            groupsetfirstbyte = FALSE;
6095            }            }
6096    
6097          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 6575  for (;; ptr++) Line 6575  for (;; ptr++)
6575  #endif  #endif
6576          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6577          so that it works in DFA mode and in lookbehinds. */          so that it works in DFA mode and in lookbehinds. */
6578    
6579            {            {
6580            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6581            *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;            *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;
6582            }            }
# Line 7314  compile_data *cd = &compile_block; Line 7314  compile_data *cd = &compile_block;
7314  computing the amount of memory that is needed. Compiled items are thrown away  computing the amount of memory that is needed. Compiled items are thrown away
7315  as soon as possible, so that a fairly large buffer should be sufficient for  as soon as possible, so that a fairly large buffer should be sufficient for
7316  this purpose. The same space is used in the second phase for remembering where  this purpose. The same space is used in the second phase for remembering where
7317  to fill in forward references to subpatterns. That may overflow, in which case  to fill in forward references to subpatterns. That may overflow, in which case
7318  new memory is obtained from malloc(). */  new memory is obtained from malloc(). */
7319    
7320  uschar cworkspace[COMPILE_WORK_SIZE];  uschar cworkspace[COMPILE_WORK_SIZE];
# Line 7620  if debugging, leave the test till after Line 7620  if debugging, leave the test till after
7620  if (code - codestart > length) errorcode = ERR23;  if (code - codestart > length) errorcode = ERR23;
7621  #endif  #endif
7622    
7623  /* Fill in any forward references that are required. There may be repeated  /* Fill in any forward references that are required. There may be repeated
7624  references; optimize for them, as searching a large regex takes time. */  references; optimize for them, as searching a large regex takes time. */
7625    
7626  if (cd->hwm > cd->start_workspace)  if (cd->hwm > cd->start_workspace)
7627    {    {
7628    int prev_recno = -1;    int prev_recno = -1;
7629    const uschar *groupptr = NULL;    const uschar *groupptr = NULL;
7630    while (errorcode == 0 && cd->hwm > cd->start_workspace)    while (errorcode == 0 && cd->hwm > cd->start_workspace)
7631      {      {
# Line 7634  if (cd->hwm > cd->start_workspace) Line 7634  if (cd->hwm > cd->start_workspace)
7634      offset = GET(cd->hwm, 0);      offset = GET(cd->hwm, 0);
7635      recno = GET(codestart, offset);      recno = GET(codestart, offset);
7636      if (recno != prev_recno)      if (recno != prev_recno)
7637        {        {
7638        groupptr = _pcre_find_bracket(codestart, utf8, recno);        groupptr = _pcre_find_bracket(codestart, utf8, recno);
7639        prev_recno = recno;        prev_recno = recno;
7640        }        }
7641      if (groupptr == NULL) errorcode = ERR53;      if (groupptr == NULL) errorcode = ERR53;
7642        else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));        else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
7643      }      }
7644    }    }
7645    
7646  /* If the workspace had to be expanded, free the new memory. */  /* If the workspace had to be expanded, free the new memory. */
7647    
7648  if (cd->workspace_size > COMPILE_WORK_SIZE)  if (cd->workspace_size > COMPILE_WORK_SIZE)
7649    (pcre_free)((void *)cd->start_workspace);    (pcre_free)((void *)cd->start_workspace);
7650    
7651  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
7652  subpattern. */  subpattern. */

Legend:
Removed from v.779  
changed lines
  Added in v.788

  ViewVC Help
Powered by ViewVC 1.1.5