/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 178 by ph10, Wed Jun 13 08:44:34 2007 UTC revision 210 by ph10, Wed Aug 8 14:24:50 2007 UTC
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49  #define NLBLOCK cd             /* Block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
50  #define PSSTART start_pattern  /* Field containing processed string start */  #define PSSTART start_pattern  /* Field containing processed string start */
51  #define PSEND   end_pattern    /* Field containing processed string end */  #define PSEND   end_pattern    /* Field containing processed string end */
52    
   
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 62  used by pcretest. DEBUG is not defined w Line 65  used by pcretest. DEBUG is not defined w
65    
66  #define SETBIT(a,b) a[b/8] |= (1 << (b%8))  #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
# Line 120  static const short int escapes[] = { Line 130  static const short int escapes[] = {
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
# Line 130  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143    /* Table of special "verbs" like (*PRUNE) */
144    
145    typedef struct verbitem {   /* one entry of the (*VERB) name table */
146      const char *name;         /* verb name as written in the pattern, e.g. "PRUNE" */
147      int   len;                /* length of name in characters */
148      int   op;                 /* OP_xxx opcode associated with this verb */
149    } verbitem;
150    
151    static verbitem verbs[] = {      /* each len field equals the length of its name string */
152      { "ACCEPT", 6, OP_ACCEPT },
153      { "COMMIT", 6, OP_COMMIT },
154      { "F",      1, OP_FAIL },      /* short form: same opcode as "FAIL" */
155      { "FAIL",   4, OP_FAIL },
156      { "PRUNE",  5, OP_PRUNE },
157      { "SKIP",   4, OP_SKIP  },
158      { "THEN",   4, OP_THEN  }
159    };
160    
161    static int verbcount = sizeof(verbs)/sizeof(verbitem);   /* number of entries in verbs[] */
162    
163    
164  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
165  terminated by a zero length entry. The first three must be alpha, lower, upper,  terminated by a zero length entry. The first three must be alpha, lower, upper,
166  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
# Line 203  static const char *error_texts[] = { Line 234  static const char *error_texts[] = {
234    "missing ) after comment",    "missing ) after comment",
235    "parentheses nested too deeply",  /** DEAD **/    "parentheses nested too deeply",  /** DEAD **/
236    /* 20 */    /* 20 */
237    "regular expression too large",    "regular expression is too large",
238    "failed to get memory",    "failed to get memory",
239    "unmatched parentheses",    "unmatched parentheses",
240    "internal error: code overflow",    "internal error: code overflow",
# Line 239  static const char *error_texts[] = { Line 270  static const char *error_texts[] = {
270    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
271    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
272    /* 50 */    /* 50 */
273    "repeated subpattern is too long",    "repeated subpattern is too long",    /** DEAD **/
274    "octal value is greater than \\377 (not in UTF-8 mode)",    "octal value is greater than \\377 (not in UTF-8 mode)",
275    "internal error: overran compiling workspace",    "internal error: overran compiling workspace",
276    "internal error: previously-checked referenced subpattern not found",    "internal error: previously-checked referenced subpattern not found",
# Line 248  static const char *error_texts[] = { Line 279  static const char *error_texts[] = {
279    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed",
280    "inconsistent NEWLINE options",    "inconsistent NEWLINE options",
281    "\\g is not followed by a braced name or an optionally braced non-zero number",    "\\g is not followed by a braced name or an optionally braced non-zero number",
282    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
283      "(*VERB) with an argument is not supported",
284      /* 60 */
285      "(*VERB) not recognized"
286  };  };
287    
288    
# Line 379  static const unsigned char ebcdic_charta Line 413  static const unsigned char ebcdic_charta
413  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
414    
415  static BOOL  static BOOL
416    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
417      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
418    
419    
# Line 701  if (c == '{') Line 735  if (c == '{')
735      *negptr = TRUE;      *negptr = TRUE;
736      ptr++;      ptr++;
737      }      }
738    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
739      {      {
740      c = *(++ptr);      c = *(++ptr);
741      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 931  for (; *ptr != 0; ptr++) Line 965  for (; *ptr != 0; ptr++)
965    /* An opening parens must now be a real metacharacter */    /* An opening parens must now be a real metacharacter */
966    
967    if (*ptr != '(') continue;    if (*ptr != '(') continue;
968    if (ptr[1] != '?')    if (ptr[1] != '?' && ptr[1] != '*')
969      {      {
970      count++;      count++;
971      if (name == NULL && count == lorn) return count;      if (name == NULL && count == lorn) return count;
# Line 1399  for (code = first_significant_code(code Line 1433  for (code = first_significant_code(code
1433    
1434    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
1435    
1436    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1437      {      {
1438      BOOL empty_branch;      BOOL empty_branch;
1439      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1949  if (next >= 0) switch(op_code) Line 1983  if (next >= 0) switch(op_code)
1983    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
1984    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1985    
1986      case OP_HSPACE:
1987      case OP_NOT_HSPACE:
1988      switch(next)
1989        {
1990        case 0x09:
1991        case 0x20:
1992        case 0xa0:
1993        case 0x1680:
1994        case 0x180e:
1995        case 0x2000:
1996        case 0x2001:
1997        case 0x2002:
1998        case 0x2003:
1999        case 0x2004:
2000        case 0x2005:
2001        case 0x2006:
2002        case 0x2007:
2003        case 0x2008:
2004        case 0x2009:
2005        case 0x200A:
2006        case 0x202f:
2007        case 0x205f:
2008        case 0x3000:
2009        return op_code != OP_HSPACE;
2010        default:
2011        return op_code == OP_HSPACE;
2012        }
2013    
2014      case OP_VSPACE:
2015      case OP_NOT_VSPACE:
2016      switch(next)
2017        {
2018        case 0x0a:
2019        case 0x0b:
2020        case 0x0c:
2021        case 0x0d:
2022        case 0x85:
2023        case 0x2028:
2024        case 0x2029:
2025        return op_code != OP_VSPACE;
2026        default:
2027        return op_code == OP_VSPACE;
2028        }
2029    
2030    default:    default:
2031    return FALSE;    return FALSE;
2032    }    }
# Line 1983  switch(op_code) Line 2061  switch(op_code)
2061      case ESC_W:      case ESC_W:
2062      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2063    
2064        case ESC_h:
2065        case ESC_H:
2066        switch(item)
2067          {
2068          case 0x09:
2069          case 0x20:
2070          case 0xa0:
2071          case 0x1680:
2072          case 0x180e:
2073          case 0x2000:
2074          case 0x2001:
2075          case 0x2002:
2076          case 0x2003:
2077          case 0x2004:
2078          case 0x2005:
2079          case 0x2006:
2080          case 0x2007:
2081          case 0x2008:
2082          case 0x2009:
2083          case 0x200A:
2084          case 0x202f:
2085          case 0x205f:
2086          case 0x3000:
2087          return -next != ESC_h;
2088          default:
2089          return -next == ESC_h;
2090          }
2091    
2092        case ESC_v:
2093        case ESC_V:
2094        switch(item)
2095          {
2096          case 0x0a:
2097          case 0x0b:
2098          case 0x0c:
2099          case 0x0d:
2100          case 0x85:
2101          case 0x2028:
2102          case 0x2029:
2103          return -next != ESC_v;
2104          default:
2105          return -next == ESC_v;
2106          }
2107    
2108      default:      default:
2109      return FALSE;      return FALSE;
2110      }      }
2111    
2112    case OP_DIGIT:    case OP_DIGIT:
2113    return next == -ESC_D || next == -ESC_s || next == -ESC_W;    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2114             next == -ESC_h || next == -ESC_v;
2115    
2116    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
2117    return next == -ESC_d;    return next == -ESC_d;
# Line 1997  switch(op_code) Line 2120  switch(op_code)
2120    return next == -ESC_S || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2121    
2122    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
2123    return next == -ESC_s;    return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2124    
2125      case OP_HSPACE:
2126      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2127    
2128      case OP_NOT_HSPACE:
2129      return next == -ESC_h;
2130    
2131      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2132      case OP_VSPACE:
2133      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2134    
2135      case OP_NOT_VSPACE:
2136      return next == -ESC_v;
2137    
2138    case OP_WORDCHAR:    case OP_WORDCHAR:
2139    return next == -ESC_W || next == -ESC_s;    return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2140    
2141    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
2142    return next == -ESC_w || next == -ESC_d;    return next == -ESC_w || next == -ESC_d;
# Line 2115  for (;; ptr++) Line 2251  for (;; ptr++)
2251    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2252    BOOL is_quantifier;    BOOL is_quantifier;
2253    BOOL is_recurse;    BOOL is_recurse;
2254    BOOL reset_bracount;    BOOL reset_bracount;
2255    int class_charcount;    int class_charcount;
2256    int class_lastchar;    int class_lastchar;
2257    int newoptions;    int newoptions;
# Line 2153  for (;; ptr++) Line 2289  for (;; ptr++)
2289      */      */
2290    
2291      if (code < last_code) code = last_code;      if (code < last_code) code = last_code;
2292    
2293        /* Paranoid check for integer overflow */
2294    
2295        if (OFLOW_MAX - *lengthptr < code - last_code)
2296          {
2297          *errorcodeptr = ERR20;
2298          goto FAILED;
2299          }
2300    
2301      *lengthptr += code - last_code;      *lengthptr += code - last_code;
2302      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2303    
# Line 2265  for (;; ptr++) Line 2410  for (;; ptr++)
2410      *ptrptr = ptr;      *ptrptr = ptr;
2411      if (lengthptr != NULL)      if (lengthptr != NULL)
2412        {        {
2413          if (OFLOW_MAX - *lengthptr < code - last_code)
2414            {
2415            *errorcodeptr = ERR20;
2416            goto FAILED;
2417            }
2418        *lengthptr += code - last_code;   /* To include callout length */        *lengthptr += code - last_code;   /* To include callout length */
2419        DPRINTF((">> end branch\n"));        DPRINTF((">> end branch\n"));
2420        }        }
# Line 2327  for (;; ptr++) Line 2477  for (;; ptr++)
2477        goto FAILED;        goto FAILED;
2478        }        }
2479    
2480      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2481        if the first few characters (either before or after ^) are \Q\E or \E we
2482        skip them too. This makes for compatibility with Perl. */
2483    
2484      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2485        for (;;)
2486        {        {
       negate_class = TRUE;  
2487        c = *(++ptr);        c = *(++ptr);
2488        }        if (c == '\\')
2489      else          {
2490        {          if (ptr[1] == 'E') ptr++;
2491        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2492                else break;
2493            }
2494          else if (!negate_class && c == '^')
2495            negate_class = TRUE;
2496          else break;
2497        }        }
2498    
2499      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
# Line 2477  for (;; ptr++) Line 2634  for (;; ptr++)
2634        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
2635        case. Inside a class (and only there) it is treated as backspace.        case. Inside a class (and only there) it is treated as backspace.
2636        Elsewhere it marks a word boundary. Other escapes have preset maps ready        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2637        to or into the one we are building. We assume they have more than one        to 'or' into the one we are building. We assume they have more than one
2638        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2639    
2640        if (c == '\\')        if (c == '\\')
# Line 2535  for (;; ptr++) Line 2692  for (;; ptr++)
2692    
2693              case ESC_E: /* Perl ignores an orphan \E */              case ESC_E: /* Perl ignores an orphan \E */
2694              continue;              continue;
2695    
2696              default:    /* Not recognized; fall through */              default:    /* Not recognized; fall through */
2697              break;      /* Need "default" setting to stop compiler warning. */              break;      /* Need "default" setting to stop compiler warning. */
2698              }              }
# Line 2544  for (;; ptr++) Line 2701  for (;; ptr++)
2701    
2702            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2703                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2704    
2705            /* We need to deal with \H, \h, \V, and \v in both phases because            /* We need to deal with \H, \h, \V, and \v in both phases because
2706            they use extra memory. */            they use extra memory. */
2707    
2708            if (-c == ESC_h)            if (-c == ESC_h)
2709              {              {
2710              SETBIT(classbits, 0x09); /* HT */              SETBIT(classbits, 0x09); /* HT */
2711              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
2712              SETBIT(classbits, 0xa0); /* NBSP */              SETBIT(classbits, 0xa0); /* NBSP */
2713  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2714              if (utf8)              if (utf8)
2715                {                {
2716                class_utf8 = TRUE;                class_utf8 = TRUE;
2717                *class_utf8data++ = XCL_SINGLE;                *class_utf8data++ = XCL_SINGLE;
2718                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2719                *class_utf8data++ = XCL_SINGLE;                *class_utf8data++ = XCL_SINGLE;
2720                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2721                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2722                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2723                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2724                *class_utf8data++ = XCL_SINGLE;                *class_utf8data++ = XCL_SINGLE;
2725                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2726                *class_utf8data++ = XCL_SINGLE;                *class_utf8data++ = XCL_SINGLE;
2727                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2728                *class_utf8data++ = XCL_SINGLE;                *class_utf8data++ = XCL_SINGLE;
2729                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2730                }                }
2731  #endif  #endif
2732              continue;              continue;
2733              }              }
2734    
2735            if (-c == ESC_H)            if (-c == ESC_H)
2736              {              {
# Line 2581  for (;; ptr++) Line 2738  for (;; ptr++)
2738                {                {
2739                int x = 0xff;                int x = 0xff;
2740                switch (c)                switch (c)
2741                  {                  {
2742                  case 0x09/8: x ^= 1 << (0x09%8); break;                  case 0x09/8: x ^= 1 << (0x09%8); break;
2743                  case 0x20/8: x ^= 1 << (0x20%8); break;                  case 0x20/8: x ^= 1 << (0x20%8); break;
2744                  case 0xa0/8: x ^= 1 << (0xa0%8); break;                  case 0xa0/8: x ^= 1 << (0xa0%8); break;
2745                  default: break;                  default: break;
2746                  }                  }
2747                classbits[c] |= x;                classbits[c] |= x;
2748                }                }
2749    
2750  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2751              if (utf8)              if (utf8)
2752                {                {
2753                class_utf8 = TRUE;                class_utf8 = TRUE;
2754                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2755                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2756                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2757                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2758                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2759                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2760                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2761                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2762                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2763                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2764                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2765                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2766                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2767                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2768                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2769                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2770                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2771                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2772                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2773                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2774                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2775                }                }
2776  #endif  #endif
2777              continue;              continue;
2778              }              }
2779    
2780            if (-c == ESC_v)            if (-c == ESC_v)
2781              {              {
2782              SETBIT(classbits, 0x0a); /* LF */              SETBIT(classbits, 0x0a); /* LF */
2783              SETBIT(classbits, 0x0b); /* VT */              SETBIT(classbits, 0x0b); /* VT */
2784              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, 0x0c); /* FF */
2785              SETBIT(classbits, 0x0d); /* CR */              SETBIT(classbits, 0x0d); /* CR */
2786              SETBIT(classbits, 0x85); /* NEL */              SETBIT(classbits, 0x85); /* NEL */
2787  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2788              if (utf8)              if (utf8)
2789                {                {
2790                class_utf8 = TRUE;                class_utf8 = TRUE;
2791                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2792                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2793                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2794                }                }
2795  #endif  #endif
2796              continue;              continue;
2797              }              }
2798    
2799            if (-c == ESC_V)            if (-c == ESC_V)
2800              {              {
# Line 2645  for (;; ptr++) Line 2802  for (;; ptr++)
2802                {                {
2803                int x = 0xff;                int x = 0xff;
2804                switch (c)                switch (c)
2805                  {                  {
2806                  case 0x0a/8: x ^= 1 << (0x0a%8);                  case 0x0a/8: x ^= 1 << (0x0a%8);
2807                               x ^= 1 << (0x0b%8);                               x ^= 1 << (0x0b%8);
2808                               x ^= 1 << (0x0c%8);                               x ^= 1 << (0x0c%8);
2809                               x ^= 1 << (0x0d%8);                               x ^= 1 << (0x0d%8);
2810                               break;                               break;
2811                  case 0x85/8: x ^= 1 << (0x85%8); break;                  case 0x85/8: x ^= 1 << (0x85%8); break;
2812                  default: break;                  default: break;
2813                  }                  }
2814                classbits[c] |= x;                classbits[c] |= x;
2815                }                }
2816    
2817  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2818              if (utf8)              if (utf8)
2819                {                {
2820                class_utf8 = TRUE;                class_utf8 = TRUE;
2821                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2822                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2823                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2824                *class_utf8data++ = XCL_RANGE;                *class_utf8data++ = XCL_RANGE;
2825                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2826                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);                class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2827                }                }
2828  #endif  #endif
2829              continue;              continue;
2830              }              }
2831    
2832            /* We need to deal with \P and \p in both phases. */            /* We need to deal with \P and \p in both phases. */
2833    
# Line 2812  for (;; ptr++) Line 2969  for (;; ptr++)
2969              unsigned int origd = d;              unsigned int origd = d;
2970              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2971                {                {
2972                if (occ >= (unsigned int)c &&                if (occ >= (unsigned int)c &&
2973                    ocd <= (unsigned int)d)                    ocd <= (unsigned int)d)
2974                  continue;                          /* Skip embedded ranges */                  continue;                          /* Skip embedded ranges */
2975    
2976                if (occ < (unsigned int)c  &&                if (occ < (unsigned int)c  &&
2977                    ocd >= (unsigned int)c - 1)      /* Extend the basic range */                    ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2978                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2979                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2980                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2981                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2982                if (ocd > (unsigned int)d &&                if (ocd > (unsigned int)d &&
2983                    occ <= (unsigned int)d + 1)      /* always shorter than    */                    occ <= (unsigned int)d + 1)      /* always shorter than    */
2984                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2985                  d = ocd;                  d = ocd;
# Line 3419  for (;; ptr++) Line 3576  for (;; ptr++)
3576          goto FAILED;          goto FAILED;
3577          }          }
3578    
       /* This is a paranoid check to stop integer overflow later on */  
   
       if (len > MAX_DUPLENGTH)  
         {  
         *errorcodeptr = ERR50;  
         goto FAILED;  
         }  
   
3579        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3580        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3581        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 3515  for (;; ptr++) Line 3664  for (;; ptr++)
3664          if (repeat_min > 1)          if (repeat_min > 1)
3665            {            {
3666            /* In the pre-compile phase, we don't actually do the replication. We            /* In the pre-compile phase, we don't actually do the replication. We
3667            just adjust the length as if we had. */            just adjust the length as if we had. Do some paranoid checks for
3668              potential integer overflow. */
3669    
3670            if (lengthptr != NULL)            if (lengthptr != NULL)
3671              *lengthptr += (repeat_min - 1)*length_prevgroup;              {
3672                int delta = (repeat_min - 1)*length_prevgroup;
3673                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3674                                                                (double)INT_MAX ||
3675                    OFLOW_MAX - *lengthptr < delta)
3676                  {
3677                  *errorcodeptr = ERR20;
3678                  goto FAILED;
3679                  }
3680                *lengthptr += delta;
3681                }
3682    
3683            /* This is compiling for real */            /* This is compiling for real */
3684    
# Line 3556  for (;; ptr++) Line 3716  for (;; ptr++)
3716          /* In the pre-compile phase, we don't actually do the replication. We          /* In the pre-compile phase, we don't actually do the replication. We
3717          just adjust the length as if we had. For each repetition we must add 1          just adjust the length as if we had. For each repetition we must add 1
3718          to the length for BRAZERO and for all but the last repetition we must          to the length for BRAZERO and for all but the last repetition we must
3719          add 2 + 2*LINKSIZE to allow for the nesting that occurs. */          add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
3720            paranoid checks to avoid integer overflow. */
3721    
3722          if (lengthptr != NULL && repeat_max > 0)          if (lengthptr != NULL && repeat_max > 0)
3723            *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -            {
3724              2 - 2*LINK_SIZE;  /* Last one doesn't nest */            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3725                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3726              if ((double)repeat_max *
3727                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3728                      > (double)INT_MAX ||
3729                  OFLOW_MAX - *lengthptr < delta)
3730                {
3731                *errorcodeptr = ERR20;
3732                goto FAILED;
3733                }
3734              *lengthptr += delta;
3735              }
3736    
3737          /* This is compiling for real */          /* This is compiling for real */
3738    
# Line 3712  for (;; ptr++) Line 3884  for (;; ptr++)
3884      /* ===================================================================*/      /* ===================================================================*/
3885      /* Start of nested parenthesized sub-expression, or comment or lookahead or      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3886      lookbehind or option setting or condition or all the other extended      lookbehind or option setting or condition or all the other extended
3887      parenthesis forms. First deal with the specials; all are introduced by ?,      parenthesis forms.  */
     and the appearance of any of them means that this is not a capturing  
     group. */  
3888    
3889      case '(':      case '(':
3890      newoptions = options;      newoptions = options;
3891      skipbytes = 0;      skipbytes = 0;
3892      bravalue = OP_CBRA;      bravalue = OP_CBRA;
3893      save_hwm = cd->hwm;      save_hwm = cd->hwm;
3894      reset_bracount = FALSE;      reset_bracount = FALSE;
3895    
3896        /* First deal with various "verbs" that can be introduced by '*'. */
3897    
3898        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3899          {
3900          int i, namelen;
3901          const uschar *name = ++ptr;
3902          previous = NULL;
3903          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3904          if (*ptr == ':')
3905            {
3906            *errorcodeptr = ERR59;   /* Not supported */
3907            goto FAILED;
3908            }
3909          if (*ptr != ')')
3910            {
3911            *errorcodeptr = ERR60;
3912            goto FAILED;
3913            }
3914          namelen = ptr - name;
3915          for (i = 0; i < verbcount; i++)
3916            {
3917            if (namelen == verbs[i].len &&
3918                strncmp((char *)name, verbs[i].name, namelen) == 0)
3919              {
3920              *code = verbs[i].op;
3921              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3922              break;
3923              }
3924            }
3925          if (i < verbcount) continue;
3926          *errorcodeptr = ERR60;
3927          goto FAILED;
3928          }
3929    
3930        /* Deal with the extended parentheses; all are introduced by '?', and the
3931        appearance of any of them means that this is not a capturing group. */
3932    
3933      if (*(++ptr) == '?')      else if (*ptr == '?')
3934        {        {
3935        int i, set, unset, namelen;        int i, set, unset, namelen;
3936        int *optset;        int *optset;
# Line 3746  for (;; ptr++) Line 3953  for (;; ptr++)
3953          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
3954          case '|':                 /* Reset capture count for each branch */          case '|':                 /* Reset capture count for each branch */
3955          reset_bracount = TRUE;          reset_bracount = TRUE;
3956          /* Fall through */          /* Fall through */
3957    
3958          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
3959          case ':':                 /* Non-capturing bracket */          case ':':                 /* Non-capturing bracket */
# Line 3965  for (;; ptr++) Line 4172  for (;; ptr++)
4172    
4173          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4174          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
         bravalue = OP_ASSERT_NOT;  
4175          ptr++;          ptr++;
4176            if (*ptr == ')')          /* Optimize (?!) */
4177              {
4178              *code++ = OP_FAIL;
4179              previous = NULL;
4180              continue;
4181              }
4182            bravalue = OP_ASSERT_NOT;
4183          break;          break;
4184    
4185    
# Line 4447  for (;; ptr++) Line 4660  for (;; ptr++)
4660           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4661           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4662            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4663           reset_bracount,               /* True if (?| group */           reset_bracount,               /* True if (?| group */
4664           skipbytes,                    /* Skip over bracket number */           skipbytes,                    /* Skip over bracket number */
4665           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4666           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
# Line 4522  for (;; ptr++) Line 4735  for (;; ptr++)
4735    
4736      if (lengthptr != NULL)      if (lengthptr != NULL)
4737        {        {
4738          if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4739            {
4740            *errorcodeptr = ERR20;
4741            goto FAILED;
4742            }
4743        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4744        code++;        code++;
4745        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
# Line 4807  Arguments: Line 5025  Arguments:
5025    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
5026    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
5027    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
5028    reset_bracount TRUE to reset the count for each branch    reset_bracount TRUE to reset the count for each branch
5029    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
5030    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
5031    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
# Line 4821  Returns:         TRUE on success Line 5039  Returns:         TRUE on success
5039    
5040  static BOOL  static BOOL
5041  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5042    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5043    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5044    int *lengthptr)    int *lengthptr)
5045  {  {
5046  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
# Line 4867  orig_bracount = max_bracount = cd->braco Line 5085  orig_bracount = max_bracount = cd->braco
5085  for (;;)  for (;;)
5086    {    {
5087    /* For a (?| group, reset the capturing bracket count so that each branch    /* For a (?| group, reset the capturing bracket count so that each branch
5088    uses the same numbers. */    uses the same numbers. */
5089    
5090    if (reset_bracount) cd->bracount = orig_bracount;    if (reset_bracount) cd->bracount = orig_bracount;
5091    
5092    /* Handle a change of ims options at the start of the branch */    /* Handle a change of ims options at the start of the branch */
5093    
5094    if ((options & PCRE_IMS) != oldims)    if ((options & PCRE_IMS) != oldims)
# Line 4899  for (;;) Line 5117  for (;;)
5117      *ptrptr = ptr;      *ptrptr = ptr;
5118      return FALSE;      return FALSE;
5119      }      }
5120    
5121    /* Keep the highest bracket count in case (?| was used and some branch    /* Keep the highest bracket count in case (?| was used and some branch
5122    has fewer than the rest. */    has fewer than the rest. */
5123    
5124    if (cd->bracount > max_bracount) max_bracount = cd->bracount;    if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5125    
5126    /* In the real compile phase, there is some post-processing to be done. */    /* In the real compile phase, there is some post-processing to be done. */
# Line 5006  for (;;) Line 5224  for (;;)
5224        *code++ = oldims;        *code++ = oldims;
5225        length += 2;        length += 2;
5226        }        }
5227    
5228      /* Retain the highest bracket number, in case resetting was used. */      /* Retain the highest bracket number, in case resetting was used. */
5229    
5230      cd->bracount = max_bracount;      cd->bracount = max_bracount;
5231    
5232      /* Set values to pass back */      /* Set values to pass back */
# Line 5017  for (;;) Line 5235  for (;;)
5235      *ptrptr = ptr;      *ptrptr = ptr;
5236      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
5237      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
5238      if (lengthptr != NULL) *lengthptr += length;      if (lengthptr != NULL)
5239          {
5240          if (OFLOW_MAX - *lengthptr < length)
5241            {
5242            *errorcodeptr = ERR20;
5243            return FALSE;
5244            }
5245          *lengthptr += length;
5246          }
5247      return TRUE;      return TRUE;
5248      }      }
5249    
# Line 5485  outside can help speed up starting point Line 5711  outside can help speed up starting point
5711  code = cworkspace;  code = cworkspace;
5712  *code = OP_BRA;  *code = OP_BRA;
5713  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5714    &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,    &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5715    &length);    &length);
5716  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5717    
# Line 5545  cd->start_code = codestart; Line 5771  cd->start_code = codestart;
5771  cd->hwm = cworkspace;  cd->hwm = cworkspace;
5772  cd->req_varyopt = 0;  cd->req_varyopt = 0;
5773  cd->nopartial = FALSE;  cd->nopartial = FALSE;
5774    cd->had_accept = FALSE;
5775    
5776  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
5777  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
# Line 5559  re->top_bracket = cd->bracount; Line 5786  re->top_bracket = cd->bracount;
5786  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
5787    
5788  if (cd->nopartial) re->options |= PCRE_NOPARTIAL;  if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5789    if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
5790    
5791  /* If not reached end of pattern on success, there's an excess bracket. */  /* If not reached end of pattern on success, there's an excess bracket. */
5792    

Legend:
Removed from v.178  
changed lines
  Added in v.210

  ViewVC Help
Powered by ViewVC 1.1.5