/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log | View Patch

Left column: revision 180 by ph10, Wed Jun 13 10:59:18 2007 UTC | Right column: revision 210 by ph10, Wed Aug 8 14:24:50 2007 UTC
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49  #define NLBLOCK cd             /* Block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
50  #define PSSTART start_pattern  /* Field containing processed string start */  #define PSSTART start_pattern  /* Field containing processed string start */
51  #define PSEND   end_pattern    /* Field containing processed string end */  #define PSEND   end_pattern    /* Field containing processed string end */
52    
   
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 62  used by pcretest. DEBUG is not defined w Line 65  used by pcretest. DEBUG is not defined w
65    
66  #define SETBIT(a,b) a[b/8] |= (1 << (b%8))  #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
# Line 120  static const short int escapes[] = { Line 130  static const short int escapes[] = {
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
# Line 130  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143    /* Table of special "verbs" like (*PRUNE) */
144    
145    typedef struct verbitem {
146      const char *name;
147      int   len;
148      int   op;
149    } verbitem;
150    
151    static verbitem verbs[] = {
152      { "ACCEPT", 6, OP_ACCEPT },
153      { "COMMIT", 6, OP_COMMIT },
154      { "F",      1, OP_FAIL },
155      { "FAIL",   4, OP_FAIL },
156      { "PRUNE",  5, OP_PRUNE },
157      { "SKIP",   4, OP_SKIP  },
158      { "THEN",   4, OP_THEN  }
159    };
160    
161    static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
165  terminated by a zero length entry. The first three must be alpha, lower, upper,  terminated by a zero length entry. The first three must be alpha, lower, upper,
166  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
# Line 203  static const char *error_texts[] = { Line 234  static const char *error_texts[] = {
234    "missing ) after comment",    "missing ) after comment",
235    "parentheses nested too deeply",  /** DEAD **/    "parentheses nested too deeply",  /** DEAD **/
236    /* 20 */    /* 20 */
237    "regular expression too large",    "regular expression is too large",
238    "failed to get memory",    "failed to get memory",
239    "unmatched parentheses",    "unmatched parentheses",
240    "internal error: code overflow",    "internal error: code overflow",
# Line 239  static const char *error_texts[] = { Line 270  static const char *error_texts[] = {
270    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
271    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
272    /* 50 */    /* 50 */
273    "repeated subpattern is too long",    "repeated subpattern is too long",    /** DEAD **/
274    "octal value is greater than \\377 (not in UTF-8 mode)",    "octal value is greater than \\377 (not in UTF-8 mode)",
275    "internal error: overran compiling workspace",    "internal error: overran compiling workspace",
276    "internal error: previously-checked referenced subpattern not found",    "internal error: previously-checked referenced subpattern not found",
# Line 248  static const char *error_texts[] = { Line 279  static const char *error_texts[] = {
279    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed",
280    "inconsistent NEWLINE options",    "inconsistent NEWLINE options",
281    "\\g is not followed by a braced name or an optionally braced non-zero number",    "\\g is not followed by a braced name or an optionally braced non-zero number",
282    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
283      "(*VERB) with an argument is not supported",
284      /* 60 */
285      "(*VERB) not recognized"
286  };  };
287    
288    
# Line 701  if (c == '{') Line 735  if (c == '{')
735      *negptr = TRUE;      *negptr = TRUE;
736      ptr++;      ptr++;
737      }      }
738    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
739      {      {
740      c = *(++ptr);      c = *(++ptr);
741      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 931  for (; *ptr != 0; ptr++) Line 965  for (; *ptr != 0; ptr++)
965    /* An opening parens must now be a real metacharacter */    /* An opening parens must now be a real metacharacter */
966    
967    if (*ptr != '(') continue;    if (*ptr != '(') continue;
968    if (ptr[1] != '?')    if (ptr[1] != '?' && ptr[1] != '*')
969      {      {
970      count++;      count++;
971      if (name == NULL && count == lorn) return count;      if (name == NULL && count == lorn) return count;
# Line 1399  for (code = first_significant_code(code Line 1433  for (code = first_significant_code(code
1433    
1434    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
1435    
1436    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1437      {      {
1438      BOOL empty_branch;      BOOL empty_branch;
1439      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 2026  switch(op_code) Line 2060  switch(op_code)
2060    
2061      case ESC_W:      case ESC_W:
2062      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2063    
2064      case ESC_h:      case ESC_h:
2065      case ESC_H:      case ESC_H:
2066      switch(item)      switch(item)
# Line 2053  switch(op_code) Line 2087  switch(op_code)
2087        return -next != ESC_h;        return -next != ESC_h;
2088        default:        default:
2089        return -next == ESC_h;        return -next == ESC_h;
2090        }        }
2091    
2092      case ESC_v:      case ESC_v:
2093      case ESC_V:      case ESC_V:
2094      switch(item)      switch(item)
# Line 2069  switch(op_code) Line 2103  switch(op_code)
2103        return -next != ESC_v;        return -next != ESC_v;
2104        default:        default:
2105        return -next == ESC_v;        return -next == ESC_v;
2106        }        }
2107    
2108      default:      default:
2109      return FALSE;      return FALSE;
# Line 2093  switch(op_code) Line 2127  switch(op_code)
2127    
2128    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
2129    return next == -ESC_h;    return next == -ESC_h;
2130    
2131    /* Can't have \S in here because VT matches \S (Perl anomaly) */    /* Can't have \S in here because VT matches \S (Perl anomaly) */
2132    case OP_VSPACE:    case OP_VSPACE:
2133    return next == -ESC_V || next == -ESC_d || next == -ESC_w;    return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2134    
2135    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
2136    return next == -ESC_v;    return next == -ESC_v;
2137    
2138    case OP_WORDCHAR:    case OP_WORDCHAR:
2139    return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2140    
2141    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
2142    return next == -ESC_w || next == -ESC_d;    return next == -ESC_w || next == -ESC_d;
2143    
2144    default:    default:
2145    return FALSE;    return FALSE;
2146    }    }
# Line 2255  for (;; ptr++) Line 2289  for (;; ptr++)
2289      */      */
2290    
2291      if (code < last_code) code = last_code;      if (code < last_code) code = last_code;
2292    
2293        /* Paranoid check for integer overflow */
2294    
2295        if (OFLOW_MAX - *lengthptr < code - last_code)
2296          {
2297          *errorcodeptr = ERR20;
2298          goto FAILED;
2299          }
2300    
2301      *lengthptr += code - last_code;      *lengthptr += code - last_code;
2302      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2303    
# Line 2367  for (;; ptr++) Line 2410  for (;; ptr++)
2410      *ptrptr = ptr;      *ptrptr = ptr;
2411      if (lengthptr != NULL)      if (lengthptr != NULL)
2412        {        {
2413          if (OFLOW_MAX - *lengthptr < code - last_code)
2414            {
2415            *errorcodeptr = ERR20;
2416            goto FAILED;
2417            }
2418        *lengthptr += code - last_code;   /* To include callout length */        *lengthptr += code - last_code;   /* To include callout length */
2419        DPRINTF((">> end branch\n"));        DPRINTF((">> end branch\n"));
2420        }        }
# Line 2429  for (;; ptr++) Line 2477  for (;; ptr++)
2477        goto FAILED;        goto FAILED;
2478        }        }
2479    
2480      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2481        if the first few characters (either before or after ^) are \Q\E or \E we
2482        skip them too. This makes for compatibility with Perl. */
2483    
2484      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2485        for (;;)
2486        {        {
       negate_class = TRUE;  
2487        c = *(++ptr);        c = *(++ptr);
2488        }        if (c == '\\')
2489      else          {
2490        {          if (ptr[1] == 'E') ptr++;
2491        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2492                else break;
2493            }
2494          else if (!negate_class && c == '^')
2495            negate_class = TRUE;
2496          else break;
2497        }        }
2498    
2499      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
# Line 2579  for (;; ptr++) Line 2634  for (;; ptr++)
2634        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
2635        case. Inside a class (and only there) it is treated as backspace.        case. Inside a class (and only there) it is treated as backspace.
2636        Elsewhere it marks a word boundary. Other escapes have preset maps ready        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2637        to or into the one we are building. We assume they have more than one        to 'or' into the one we are building. We assume they have more than one
2638        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2639    
2640        if (c == '\\')        if (c == '\\')
# Line 3521  for (;; ptr++) Line 3576  for (;; ptr++)
3576          goto FAILED;          goto FAILED;
3577          }          }
3578    
       /* This is a paranoid check to stop integer overflow later on */  
   
       if (len > MAX_DUPLENGTH)  
         {  
         *errorcodeptr = ERR50;  
         goto FAILED;  
         }  
   
3579        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3580        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3581        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 3617  for (;; ptr++) Line 3664  for (;; ptr++)
3664          if (repeat_min > 1)          if (repeat_min > 1)
3665            {            {
3666            /* In the pre-compile phase, we don't actually do the replication. We            /* In the pre-compile phase, we don't actually do the replication. We
3667            just adjust the length as if we had. */            just adjust the length as if we had. Do some paranoid checks for
3668              potential integer overflow. */
3669    
3670            if (lengthptr != NULL)            if (lengthptr != NULL)
3671              *lengthptr += (repeat_min - 1)*length_prevgroup;              {
3672                int delta = (repeat_min - 1)*length_prevgroup;
3673                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3674                                                                (double)INT_MAX ||
3675                    OFLOW_MAX - *lengthptr < delta)
3676                  {
3677                  *errorcodeptr = ERR20;
3678                  goto FAILED;
3679                  }
3680                *lengthptr += delta;
3681                }
3682    
3683            /* This is compiling for real */            /* This is compiling for real */
3684    
# Line 3658  for (;; ptr++) Line 3716  for (;; ptr++)
3716          /* In the pre-compile phase, we don't actually do the replication. We          /* In the pre-compile phase, we don't actually do the replication. We
3717          just adjust the length as if we had. For each repetition we must add 1          just adjust the length as if we had. For each repetition we must add 1
3718          to the length for BRAZERO and for all but the last repetition we must          to the length for BRAZERO and for all but the last repetition we must
3719          add 2 + 2*LINKSIZE to allow for the nesting that occurs. */          add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
3720            paranoid checks to avoid integer overflow. */
3721    
3722          if (lengthptr != NULL && repeat_max > 0)          if (lengthptr != NULL && repeat_max > 0)
3723            *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -            {
3724              2 - 2*LINK_SIZE;  /* Last one doesn't nest */            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3725                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3726              if ((double)repeat_max *
3727                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3728                      > (double)INT_MAX ||
3729                  OFLOW_MAX - *lengthptr < delta)
3730                {
3731                *errorcodeptr = ERR20;
3732                goto FAILED;
3733                }
3734              *lengthptr += delta;
3735              }
3736    
3737          /* This is compiling for real */          /* This is compiling for real */
3738    
# Line 3814  for (;; ptr++) Line 3884  for (;; ptr++)
3884      /* ===================================================================*/      /* ===================================================================*/
3885      /* Start of nested parenthesized sub-expression, or comment or lookahead or      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3886      lookbehind or option setting or condition or all the other extended      lookbehind or option setting or condition or all the other extended
3887      parenthesis forms. First deal with the specials; all are introduced by ?,      parenthesis forms.  */
     and the appearance of any of them means that this is not a capturing  
     group. */  
3888    
3889      case '(':      case '(':
3890      newoptions = options;      newoptions = options;
# Line 3824  for (;; ptr++) Line 3892  for (;; ptr++)
3892      bravalue = OP_CBRA;      bravalue = OP_CBRA;
3893      save_hwm = cd->hwm;      save_hwm = cd->hwm;
3894      reset_bracount = FALSE;      reset_bracount = FALSE;
3895    
3896        /* First deal with various "verbs" that can be introduced by '*'. */
3897    
3898        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3899          {
3900          int i, namelen;
3901          const uschar *name = ++ptr;
3902          previous = NULL;
3903          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3904          if (*ptr == ':')
3905            {
3906            *errorcodeptr = ERR59;   /* Not supported */
3907            goto FAILED;
3908            }
3909          if (*ptr != ')')
3910            {
3911            *errorcodeptr = ERR60;
3912            goto FAILED;
3913            }
3914          namelen = ptr - name;
3915          for (i = 0; i < verbcount; i++)
3916            {
3917            if (namelen == verbs[i].len &&
3918                strncmp((char *)name, verbs[i].name, namelen) == 0)
3919              {
3920              *code = verbs[i].op;
3921              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3922              break;
3923              }
3924            }
3925          if (i < verbcount) continue;
3926          *errorcodeptr = ERR60;
3927          goto FAILED;
3928          }
3929    
3930        /* Deal with the extended parentheses; all are introduced by '?', and the
3931        appearance of any of them means that this is not a capturing group. */
3932    
3933      if (*(++ptr) == '?')      else if (*ptr == '?')
3934        {        {
3935        int i, set, unset, namelen;        int i, set, unset, namelen;
3936        int *optset;        int *optset;
# Line 4067  for (;; ptr++) Line 4172  for (;; ptr++)
4172    
4173          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4174          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
         bravalue = OP_ASSERT_NOT;  
4175          ptr++;          ptr++;
4176            if (*ptr == ')')          /* Optimize (?!) */
4177              {
4178              *code++ = OP_FAIL;
4179              previous = NULL;
4180              continue;
4181              }
4182            bravalue = OP_ASSERT_NOT;
4183          break;          break;
4184    
4185    
# Line 4624  for (;; ptr++) Line 4735  for (;; ptr++)
4735    
4736      if (lengthptr != NULL)      if (lengthptr != NULL)
4737        {        {
4738          if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4739            {
4740            *errorcodeptr = ERR20;
4741            goto FAILED;
4742            }
4743        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4744        code++;        code++;
4745        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
# Line 5119  for (;;) Line 5235  for (;;)
5235      *ptrptr = ptr;      *ptrptr = ptr;
5236      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
5237      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
5238      if (lengthptr != NULL) *lengthptr += length;      if (lengthptr != NULL)
5239          {
5240          if (OFLOW_MAX - *lengthptr < length)
5241            {
5242            *errorcodeptr = ERR20;
5243            return FALSE;
5244            }
5245          *lengthptr += length;
5246          }
5247      return TRUE;      return TRUE;
5248      }      }
5249    
# Line 5647  cd->start_code = codestart; Line 5771  cd->start_code = codestart;
5771  cd->hwm = cworkspace;  cd->hwm = cworkspace;
5772  cd->req_varyopt = 0;  cd->req_varyopt = 0;
5773  cd->nopartial = FALSE;  cd->nopartial = FALSE;
5774    cd->had_accept = FALSE;
5775    
5776  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
5777  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
# Line 5661  re->top_bracket = cd->bracount; Line 5786  re->top_bracket = cd->bracount;
5786  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
5787    
5788  if (cd->nopartial) re->options |= PCRE_NOPARTIAL;  if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5789    if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
5790    
5791  /* If not reached end of pattern on success, there's an excess bracket. */  /* If not reached end of pattern on success, there's an excess bracket. */
5792    

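Legend:
Removed from v.180 (unnumbered lines; text appears once, from the left-hand column)
Changed lines (numbered lines; left-hand and right-hand text differ)
Added in v.210 (numbered lines; text appears once, from the right-hand column)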

  ViewVC Help
Powered by ViewVC 1.1.5