/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 167 by ph10, Wed May 9 15:53:54 2007 UTC revision 200 by ph10, Wed Aug 1 09:10:40 2007 UTC
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49  #define NLBLOCK cd             /* Block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
50  #define PSSTART start_pattern  /* Field containing processed string start */  #define PSSTART start_pattern  /* Field containing processed string start */
51  #define PSEND   end_pattern    /* Field containing processed string end */  #define PSEND   end_pattern    /* Field containing processed string end */
52    
   
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 58  used by pcretest. DEBUG is not defined w Line 61  used by pcretest. DEBUG is not defined w
61  #endif  #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    
69  /*************************************************  /*************************************************
70  *      Code parameters and static tables         *  *      Code parameters and static tables         *
71  *************************************************/  *************************************************/
# Line 87  static const short int escapes[] = { Line 95  static const short int escapes[] = {
95       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
96       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
97     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
98       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
99  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
100  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
101     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
102       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
103  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
104       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
105  };  };
106    
# Line 106  static const short int escapes[] = { Line 114  static const short int escapes[] = {
114  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
115  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
116  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
117  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
118  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
119  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
120  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
121  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
122  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
123  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
124  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
125  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
126  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
127  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
128  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
129  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
130  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
131  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 242  static const char *error_texts[] = { Line 250  static const char *error_texts[] = {
250    /* 55 */    /* 55 */
251    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed",
252    "inconsistent NEWLINE options",    "inconsistent NEWLINE options",
253    "\\g is not followed by an (optionally braced) non-zero number",    "\\g is not followed by a braced name or an optionally braced non-zero number",
254    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
255  };  };
256    
257    
# Line 374  static const unsigned char ebcdic_charta Line 382  static const unsigned char ebcdic_charta
382  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
383    
384  static BOOL  static BOOL
385    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
386      int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
387    
388    
389    
# Line 453  else Line 461  else
461    
462      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by a number, either plain or braced. If positive, it
463      is an absolute backreference. If negative, it is a relative backreference.      is an absolute backreference. If negative, it is a relative backreference.
464      This is a Perl 5.10 feature. */      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
465        reference to a named group. This is part of Perl's movement towards a
466        unified syntax for back references. As this is synonymous with \k{name}, we
467        fudge it up by pretending it really was \k. */
468    
469      case 'g':      case 'g':
470      if (ptr[1] == '{')      if (ptr[1] == '{')
471        {        {
472          const uschar *p;
473          for (p = ptr+2; *p != 0 && *p != '}'; p++)
474            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
475          if (*p != 0 && *p != '}')
476            {
477            c = -ESC_k;
478            break;
479            }
480        braced = TRUE;        braced = TRUE;
481        ptr++;        ptr++;
482        }        }
# Line 685  if (c == '{') Line 704  if (c == '{')
704      *negptr = TRUE;      *negptr = TRUE;
705      ptr++;      ptr++;
706      }      }
707    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
708      {      {
709      c = *(++ptr);      c = *(++ptr);
710      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 1371  for (code = first_significant_code(code Line 1390  for (code = first_significant_code(code
1390    
1391    c = *code;    c = *code;
1392    
1393      /* Groups with zero repeats can of course be empty; skip them. */
1394    
1395      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1396        {
1397        code += _pcre_OP_lengths[c];
1398        do code += GET(code, 1); while (*code == OP_ALT);
1399        c = *code;
1400        continue;
1401        }
1402    
1403      /* For other groups, scan the branches. */
1404    
1405    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1406      {      {
1407      BOOL empty_branch;      BOOL empty_branch;
# Line 1387  for (code = first_significant_code(code Line 1418  for (code = first_significant_code(code
1418        }        }
1419      while (*code == OP_ALT);      while (*code == OP_ALT);
1420      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1421        c = *code;
     /* Move past the KET and fudge things so that the increment in the "for"  
     above has no effect. */  
   
     c = OP_END;  
     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];  
1422      continue;      continue;
1423      }      }
1424    
# Line 1926  if (next >= 0) switch(op_code) Line 1952  if (next >= 0) switch(op_code)
1952    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
1953    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1954    
1955      case OP_HSPACE:
1956      case OP_NOT_HSPACE:
1957      switch(next)
1958        {
1959        case 0x09:
1960        case 0x20:
1961        case 0xa0:
1962        case 0x1680:
1963        case 0x180e:
1964        case 0x2000:
1965        case 0x2001:
1966        case 0x2002:
1967        case 0x2003:
1968        case 0x2004:
1969        case 0x2005:
1970        case 0x2006:
1971        case 0x2007:
1972        case 0x2008:
1973        case 0x2009:
1974        case 0x200A:
1975        case 0x202f:
1976        case 0x205f:
1977        case 0x3000:
1978        return op_code != OP_HSPACE;
1979        default:
1980        return op_code == OP_HSPACE;
1981        }
1982    
1983      case OP_VSPACE:
1984      case OP_NOT_VSPACE:
1985      switch(next)
1986        {
1987        case 0x0a:
1988        case 0x0b:
1989        case 0x0c:
1990        case 0x0d:
1991        case 0x85:
1992        case 0x2028:
1993        case 0x2029:
1994        return op_code != OP_VSPACE;
1995        default:
1996        return op_code == OP_VSPACE;
1997        }
1998    
1999    default:    default:
2000    return FALSE;    return FALSE;
2001    }    }
# Line 1960  switch(op_code) Line 2030  switch(op_code)
2030      case ESC_W:      case ESC_W:
2031      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2032    
2033        case ESC_h:
2034        case ESC_H:
2035        switch(item)
2036          {
2037          case 0x09:
2038          case 0x20:
2039          case 0xa0:
2040          case 0x1680:
2041          case 0x180e:
2042          case 0x2000:
2043          case 0x2001:
2044          case 0x2002:
2045          case 0x2003:
2046          case 0x2004:
2047          case 0x2005:
2048          case 0x2006:
2049          case 0x2007:
2050          case 0x2008:
2051          case 0x2009:
2052          case 0x200A:
2053          case 0x202f:
2054          case 0x205f:
2055          case 0x3000:
2056          return -next != ESC_h;
2057          default:
2058          return -next == ESC_h;
2059          }
2060    
2061        case ESC_v:
2062        case ESC_V:
2063        switch(item)
2064          {
2065          case 0x0a:
2066          case 0x0b:
2067          case 0x0c:
2068          case 0x0d:
2069          case 0x85:
2070          case 0x2028:
2071          case 0x2029:
2072          return -next != ESC_v;
2073          default:
2074          return -next == ESC_v;
2075          }
2076    
2077      default:      default:
2078      return FALSE;      return FALSE;
2079      }      }
2080    
2081    case OP_DIGIT:    case OP_DIGIT:
2082    return next == -ESC_D || next == -ESC_s || next == -ESC_W;    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2083             next == -ESC_h || next == -ESC_v;
2084    
2085    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
2086    return next == -ESC_d;    return next == -ESC_d;
# Line 1974  switch(op_code) Line 2089  switch(op_code)
2089    return next == -ESC_S || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2090    
2091    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
2092    return next == -ESC_s;    return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2093    
2094      case OP_HSPACE:
2095      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2096    
2097      case OP_NOT_HSPACE:
2098      return next == -ESC_h;
2099    
2100      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2101      case OP_VSPACE:
2102      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2103    
2104      case OP_NOT_VSPACE:
2105      return next == -ESC_v;
2106    
2107    case OP_WORDCHAR:    case OP_WORDCHAR:
2108    return next == -ESC_W || next == -ESC_s;    return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2109    
2110    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
2111    return next == -ESC_w || next == -ESC_d;    return next == -ESC_w || next == -ESC_d;
# Line 2092  for (;; ptr++) Line 2220  for (;; ptr++)
2220    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2221    BOOL is_quantifier;    BOOL is_quantifier;
2222    BOOL is_recurse;    BOOL is_recurse;
2223      BOOL reset_bracount;
2224    int class_charcount;    int class_charcount;
2225    int class_lastchar;    int class_lastchar;
2226    int newoptions;    int newoptions;
2227    int recno;    int recno;
2228    int refsign;    int refsign;
2229    int skipbytes;    int skipbytes;
2230    int subreqbyte;    int subreqbyte;
2231    int subfirstbyte;    int subfirstbyte;
# Line 2521  for (;; ptr++) Line 2650  for (;; ptr++)
2650            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2651                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2652    
2653              /* We need to deal with \H, \h, \V, and \v in both phases because
2654              they use extra memory. */
2655    
2656              if (-c == ESC_h)
2657                {
2658                SETBIT(classbits, 0x09); /* HT */
2659                SETBIT(classbits, 0x20); /* SPACE */
2660                SETBIT(classbits, 0xa0); /* NBSP */
2661    #ifdef SUPPORT_UTF8
2662                if (utf8)
2663                  {
2664                  class_utf8 = TRUE;
2665                  *class_utf8data++ = XCL_SINGLE;
2666                  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2667                  *class_utf8data++ = XCL_SINGLE;
2668                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2669                  *class_utf8data++ = XCL_RANGE;
2670                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2671                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2672                  *class_utf8data++ = XCL_SINGLE;
2673                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2674                  *class_utf8data++ = XCL_SINGLE;
2675                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2676                  *class_utf8data++ = XCL_SINGLE;
2677                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2678                  }
2679    #endif
2680                continue;
2681                }
2682    
2683              if (-c == ESC_H)
2684                {
2685                for (c = 0; c < 32; c++)
2686                  {
2687                  int x = 0xff;
2688                  switch (c)
2689                    {
2690                    case 0x09/8: x ^= 1 << (0x09%8); break;
2691                    case 0x20/8: x ^= 1 << (0x20%8); break;
2692                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2693                    default: break;
2694                    }
2695                  classbits[c] |= x;
2696                  }
2697    
2698    #ifdef SUPPORT_UTF8
2699                if (utf8)
2700                  {
2701                  class_utf8 = TRUE;
2702                  *class_utf8data++ = XCL_RANGE;
2703                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2704                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2705                  *class_utf8data++ = XCL_RANGE;
2706                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2707                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2708                  *class_utf8data++ = XCL_RANGE;
2709                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2710                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2711                  *class_utf8data++ = XCL_RANGE;
2712                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2713                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2714                  *class_utf8data++ = XCL_RANGE;
2715                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2716                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2717                  *class_utf8data++ = XCL_RANGE;
2718                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2719                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2720                  *class_utf8data++ = XCL_RANGE;
2721                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2722                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2723                  }
2724    #endif
2725                continue;
2726                }
2727    
2728              if (-c == ESC_v)
2729                {
2730                SETBIT(classbits, 0x0a); /* LF */
2731                SETBIT(classbits, 0x0b); /* VT */
2732                SETBIT(classbits, 0x0c); /* FF */
2733                SETBIT(classbits, 0x0d); /* CR */
2734                SETBIT(classbits, 0x85); /* NEL */
2735    #ifdef SUPPORT_UTF8
2736                if (utf8)
2737                  {
2738                  class_utf8 = TRUE;
2739                  *class_utf8data++ = XCL_RANGE;
2740                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2741                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2742                  }
2743    #endif
2744                continue;
2745                }
2746    
2747              if (-c == ESC_V)
2748                {
2749                for (c = 0; c < 32; c++)
2750                  {
2751                  int x = 0xff;
2752                  switch (c)
2753                    {
2754                    case 0x0a/8: x ^= 1 << (0x0a%8);
2755                                 x ^= 1 << (0x0b%8);
2756                                 x ^= 1 << (0x0c%8);
2757                                 x ^= 1 << (0x0d%8);
2758                                 break;
2759                    case 0x85/8: x ^= 1 << (0x85%8); break;
2760                    default: break;
2761                    }
2762                  classbits[c] |= x;
2763                  }
2764    
2765    #ifdef SUPPORT_UTF8
2766                if (utf8)
2767                  {
2768                  class_utf8 = TRUE;
2769                  *class_utf8data++ = XCL_RANGE;
2770                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2771                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2772                  *class_utf8data++ = XCL_RANGE;
2773                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2774                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2775                  }
2776    #endif
2777                continue;
2778                }
2779    
2780            /* We need to deal with \P and \p in both phases. */            /* We need to deal with \P and \p in both phases. */
2781    
2782  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
# Line 2661  for (;; ptr++) Line 2917  for (;; ptr++)
2917              unsigned int origd = d;              unsigned int origd = d;
2918              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2919                {                {
2920                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2921                      ocd <= (unsigned int)d)
2922                    continue;                          /* Skip embedded ranges */
2923    
2924                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2925                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2926                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2927                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2928                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2929                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2930                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2931                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2932                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2933                  d = ocd;                  d = ocd;
2934                  continue;                  continue;
# Line 3566  for (;; ptr++) Line 3826  for (;; ptr++)
3826      skipbytes = 0;      skipbytes = 0;
3827      bravalue = OP_CBRA;      bravalue = OP_CBRA;
3828      save_hwm = cd->hwm;      save_hwm = cd->hwm;
3829        reset_bracount = FALSE;
3830    
3831      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3832        {        {
# Line 3588  for (;; ptr++) Line 3849  for (;; ptr++)
3849    
3850    
3851          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
3852            case '|':                 /* Reset capture count for each branch */
3853            reset_bracount = TRUE;
3854            /* Fall through */
3855    
3856            /* ------------------------------------------------------------ */
3857          case ':':                 /* Non-capturing bracket */          case ':':                 /* Non-capturing bracket */
3858          bravalue = OP_BRA;          bravalue = OP_BRA;
3859          ptr++;          ptr++;
# Line 3623  for (;; ptr++) Line 3889  for (;; ptr++)
3889    
3890          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
3891          skipbytes = 3;          skipbytes = 3;
3892          refsign = -1;          refsign = -1;
3893    
3894          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
3895    
# Line 3647  for (;; ptr++) Line 3913  for (;; ptr++)
3913            terminator = '\'';            terminator = '\'';
3914            ptr++;            ptr++;
3915            }            }
3916          else          else
3917            {            {
3918            terminator = 0;            terminator = 0;
3919            if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);            if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3920            }            }
3921    
3922          /* We now expect to read a name; anything else is an error */          /* We now expect to read a name; anything else is an error */
3923    
# Line 3689  for (;; ptr++) Line 3955  for (;; ptr++)
3955          /* In the real compile we do the work of looking for the actual          /* In the real compile we do the work of looking for the actual
3956          reference. If the string started with "+" or "-" we require the rest to          reference. If the string started with "+" or "-" we require the rest to
3957          be digits, in which case recno will be set. */          be digits, in which case recno will be set. */
3958    
3959          if (refsign > 0)          if (refsign > 0)
3960            {            {
3961            if (recno <= 0)            if (recno <= 0)
3962              {              {
3963              *errorcodeptr = ERR58;              *errorcodeptr = ERR58;
3964              goto FAILED;              goto FAILED;
3965              }              }
3966            if (refsign == '-')            if (refsign == '-')
3967              {              {
3968              recno = cd->bracount - recno + 1;              recno = cd->bracount - recno + 1;
3969              if (recno <= 0)              if (recno <= 0)
3970                {                {
3971                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
3972                goto FAILED;                goto FAILED;
3973                }                }
3974              }              }
3975            else recno += cd->bracount;            else recno += cd->bracount;
3976            PUT2(code, 2+LINK_SIZE, recno);            PUT2(code, 2+LINK_SIZE, recno);
3977            break;            break;
3978            }            }
3979    
3980          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
3981          name. */          name. */
3982    
3983          slot = cd->name_table;          slot = cd->name_table;
3984          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
3985            {            {
# Line 4038  for (;; ptr++) Line 4304  for (;; ptr++)
4304            const uschar *called;            const uschar *called;
4305    
4306            if ((refsign = *ptr) == '+') ptr++;            if ((refsign = *ptr) == '+') ptr++;
4307            else if (refsign == '-')            else if (refsign == '-')
4308              {              {
4309              if ((digitab[ptr[1]] & ctype_digit) == 0)              if ((digitab[ptr[1]] & ctype_digit) == 0)
4310                goto OTHER_CHAR_AFTER_QUERY;                goto OTHER_CHAR_AFTER_QUERY;
4311              ptr++;              ptr++;
4312              }              }
4313    
4314            recno = 0;            recno = 0;
4315            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4316              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
# Line 4054  for (;; ptr++) Line 4320  for (;; ptr++)
4320              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4321              goto FAILED;              goto FAILED;
4322              }              }
4323    
4324            if (refsign == '-')            if (refsign == '-')
4325              {              {
4326              if (recno == 0)              if (recno == 0)
4327                {                {
4328                *errorcodeptr = ERR58;                *errorcodeptr = ERR58;
4329                goto FAILED;                goto FAILED;
4330                }                }
4331              recno = cd->bracount - recno + 1;              recno = cd->bracount - recno + 1;
4332              if (recno <= 0)              if (recno <= 0)
4333                {                {
4334                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
4335                goto FAILED;                goto FAILED;
4336                }                }
4337              }              }
4338            else if (refsign == '+')            else if (refsign == '+')
4339              {              {
# Line 4075  for (;; ptr++) Line 4341  for (;; ptr++)
4341                {                {
4342                *errorcodeptr = ERR58;                *errorcodeptr = ERR58;
4343                goto FAILED;                goto FAILED;
4344                }                }
4345              recno += cd->bracount;              recno += cd->bracount;
4346              }              }
4347    
4348            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4349    
# Line 4151  for (;; ptr++) Line 4417  for (;; ptr++)
4417    
4418          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4419          default:              /* Other characters: check option setting */          default:              /* Other characters: check option setting */
4420          OTHER_CHAR_AFTER_QUERY:          OTHER_CHAR_AFTER_QUERY:
4421          set = unset = 0;          set = unset = 0;
4422          optset = &set;          optset = &set;
4423    
# Line 4286  for (;; ptr++) Line 4552  for (;; ptr++)
4552           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4553           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4554            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4555             reset_bracount,               /* True if (?| group */
4556           skipbytes,                    /* Skip over bracket number */           skipbytes,                    /* Skip over bracket number */
4557           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4558           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
# Line 4302  for (;; ptr++) Line 4569  for (;; ptr++)
4569      is on the bracket. */      is on the bracket. */
4570    
4571      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4572      two branches in the group, or just one if it's a DEFINE group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4573        in the real compile phase, not in the pre-pass, where the whole group may
4574        not be available. */
4575    
4576      if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4577        {        {
4578        uschar *tc = code;        uschar *tc = code;
4579        int condcount = 0;        int condcount = 0;
# Line 4464  for (;; ptr++) Line 4733  for (;; ptr++)
4733        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4734        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4735    
4736        /* \k<name> or \k'name' is a back reference by name (Perl syntax) */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4737          We also support \k{name} (.NET syntax) */
4738    
4739        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4740          {          {
4741          is_recurse = FALSE;          is_recurse = FALSE;
4742          terminator = (*(++ptr) == '<')? '>' : '\'';          terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4743          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
4744          }          }
4745    
# Line 4635  This function is used during the pre-com Line 4905  This function is used during the pre-com
4905  out the amount of memory needed, as well as during the real compile phase. The  out the amount of memory needed, as well as during the real compile phase. The
4906  value of lengthptr distinguishes the two phases.  value of lengthptr distinguishes the two phases.
4907    
4908  Argument:  Arguments:
4909    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4910    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
4911    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4912    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4913    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4914    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4915      reset_bracount TRUE to reset the count for each branch
4916    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4917    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4918    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
# Line 4655  Returns:         TRUE on success Line 4926  Returns:         TRUE on success
4926    
4927  static BOOL  static BOOL
4928  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4929    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4930    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4931      int *lengthptr)
4932  {  {
4933  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4934  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 4666  uschar *reverse_count = NULL; Line 4938  uschar *reverse_count = NULL;
4938  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4939  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4940  int length;  int length;
4941    int orig_bracount;
4942    int max_bracount;
4943  branch_chain bc;  branch_chain bc;
4944    
4945  bc.outer = bcptr;  bc.outer = bcptr;
# Line 4694  code += 1 + LINK_SIZE + skipbytes; Line 4968  code += 1 + LINK_SIZE + skipbytes;
4968    
4969  /* Loop for each alternative branch */  /* Loop for each alternative branch */
4970    
4971    orig_bracount = max_bracount = cd->bracount;
4972  for (;;)  for (;;)
4973    {    {
4974      /* For a (?| group, reset the capturing bracket count so that each branch
4975      uses the same numbers. */
4976    
4977      if (reset_bracount) cd->bracount = orig_bracount;
4978    
4979    /* Handle a change of ims options at the start of the branch */    /* Handle a change of ims options at the start of the branch */
4980    
4981    if ((options & PCRE_IMS) != oldims)    if ((options & PCRE_IMS) != oldims)
# Line 4725  for (;;) Line 5005  for (;;)
5005      return FALSE;      return FALSE;
5006      }      }
5007    
5008      /* Keep the highest bracket count in case (?| was used and some branch
5009      has fewer than the rest. */
5010    
5011      if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5012    
5013    /* In the real compile phase, there is some post-processing to be done. */    /* In the real compile phase, there is some post-processing to be done. */
5014    
5015    if (lengthptr == NULL)    if (lengthptr == NULL)
# Line 4788  for (;;) Line 5073  for (;;)
5073        }        }
5074      }      }
5075    
5076    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. In the real
5077    the alternative branches and reverse the chain of offsets, with the field in    compile phase, go back through the alternative branches and reverse the chain
5078    the BRA item now becoming an offset to the first alternative. If there are    of offsets, with the field in the BRA item now becoming an offset to the
5079    no alternatives, it points to the end of the group. The length in the    first alternative. If there are no alternatives, it points to the end of the
5080    terminating ket is always the length of the whole bracketed item. If any of    group. The length in the terminating ket is always the length of the whole
5081    the ims options were changed inside the group, compile a resetting op-code    bracketed item. If any of the ims options were changed inside the group,
5082    following, except at the very end of the pattern. Return leaving the pointer    compile a resetting op-code following, except at the very end of the pattern.
5083    at the terminating char. */    Return leaving the pointer at the terminating char. */
5084    
5085    if (*ptr != '|')    if (*ptr != '|')
5086      {      {
5087      int branch_length = code - last_branch;      if (lengthptr == NULL)
     do  
5088        {        {
5089        int prev_length = GET(last_branch, 1);        int branch_length = code - last_branch;
5090        PUT(last_branch, 1, branch_length);        do
5091        branch_length = prev_length;          {
5092        last_branch -= branch_length;          int prev_length = GET(last_branch, 1);
5093            PUT(last_branch, 1, branch_length);
5094            branch_length = prev_length;
5095            last_branch -= branch_length;
5096            }
5097          while (branch_length > 0);
5098        }        }
     while (branch_length > 0);  
5099    
5100      /* Fill in the ket */      /* Fill in the ket */
5101    
# Line 4824  for (;;) Line 5112  for (;;)
5112        length += 2;        length += 2;
5113        }        }
5114    
5115        /* Retain the highest bracket number, in case resetting was used. */
5116    
5117        cd->bracount = max_bracount;
5118    
5119      /* Set values to pass back */      /* Set values to pass back */
5120    
5121      *codeptr = code;      *codeptr = code;
# Line 4834  for (;;) Line 5126  for (;;)
5126      return TRUE;      return TRUE;
5127      }      }
5128    
5129    /* Another branch follows; insert an "or" node. Its length field points back    /* Another branch follows. In the pre-compile phase, we can move the code
5130      pointer back to where it was for the start of the first branch. (That is,
5131      pretend that each branch is the only one.)
5132    
5133      In the real compile phase, insert an ALT node. Its length field points back
5134    to the previous branch while the bracket remains open. At the end the chain    to the previous branch while the bracket remains open. At the end the chain
5135    is reversed. It's done like this so that the start of the bracket has a    is reversed. It's done like this so that the start of the bracket has a
5136    zero offset until it is closed, making it possible to detect recursion. */    zero offset until it is closed, making it possible to detect recursion. */
5137    
5138    *code = OP_ALT;    if (lengthptr != NULL)
5139    PUT(code, 1, code - last_branch);      {
5140    bc.current = last_branch = code;      code = *codeptr + 1 + LINK_SIZE + skipbytes;
5141    code += 1 + LINK_SIZE;      length += 1 + LINK_SIZE;
5142        }
5143      else
5144        {
5145        *code = OP_ALT;
5146        PUT(code, 1, code - last_branch);
5147        bc.current = last_branch = code;
5148        code += 1 + LINK_SIZE;
5149        }
5150    
5151    ptr++;    ptr++;
   length += 1 + LINK_SIZE;  
5152    }    }
5153  /* Control never reaches here */  /* Control never reaches here */
5154  }  }
# Line 5286  outside can help speed up starting point Line 5590  outside can help speed up starting point
5590  code = cworkspace;  code = cworkspace;
5591  *code = OP_BRA;  *code = OP_BRA;
5592  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5593    &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);    &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5594      &length);
5595  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5596    
5597  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
# Line 5354  ptr = (const uschar *)pattern; Line 5659  ptr = (const uschar *)pattern;
5659  code = (uschar *)codestart;  code = (uschar *)codestart;
5660  *code = OP_BRA;  *code = OP_BRA;
5661  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5662    &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);    &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5663  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
5664  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
5665    

Legend:
Removed from v.167  
changed lines
  Added in v.200

  ViewVC Help
Powered by ViewVC 1.1.5