/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 47 by nigel, Sat Feb 24 21:39:29 2007 UTC revision 59 by nigel, Sat Feb 24 21:39:54 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-2000 University of Cambridge             Copyright (c) 1997-2001 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 60  the external pcre header. */ Line 60  the external pcre header. */
60  #endif  #endif
61    
62    
63  /* Number of items on the nested bracket stacks at compile time. This should  /* Maximum number of items on the nested bracket stacks at compile time. This
64  not be set greater than 200. */  applies to the nesting of all kinds of parentheses. It does not limit
65    un-nested, non-capturing parentheses. This number can be made bigger if
66    necessary - it is used to dimension one int and one unsigned char vector at
67    compile time. */
68    
69  #define BRASTACK_SIZE 200  #define BRASTACK_SIZE 200
70    
71    
72    /* MAXLIT is the length in bytes at which we stop adding to a literal character
73    string. It is lower when UTF-8 is supported - presumably because one further character can then add several bytes (TODO confirm). */
74    
75    #ifdef SUPPORT_UTF8
76    #define MAXLIT 250
77    #else
78    #define MAXLIT 255
79    #endif
80    
81    
82  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
83    
84  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
# Line 85  static const char *OP_names[] = { Line 98  static const char *OP_names[] = {
98    "class", "Ref", "Recurse",    "class", "Ref", "Recurse",
99    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
100    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
101    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Branumber", "Bra"
102  };  };
103  #endif  #endif
104    
# Line 101  static const short int escapes[] = { Line 114  static const short int escapes[] = {
114      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
115      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
116      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
117    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,  ESC_E,  ESC_F,      0,   /* ` - g */
118      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */      0,      0,      0,      0,      0,      0,  ESC_N,      0,   /* h - o */
119      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */      0,      0,  ESC_R, -ESC_s,  ESC_T,      0,      0, -ESC_w,   /* p - w */
120      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
121  };  };
122    
# Line 176  void  (*pcre_free)(void *) = free; Line 189  void  (*pcre_free)(void *) = free;
189    
190    
191    
192    /*************************************************
193    *    Macros and tables for character handling    *
194    *************************************************/
195    
196    /* When UTF-8 encoding is being used, a character is no longer just a single
197    byte. The macros for character handling generate simple sequences when used in
198    byte-mode, and more complicated ones for UTF-8 characters. Note that the byte-mode GETCHARLEN does not assign to len, and the byte-mode BACKCHAR expands to nothing. */
199    
200    #ifndef SUPPORT_UTF8
201    #define GETCHARINC(c, eptr) c = *eptr++;
202    #define GETCHARLEN(c, eptr, len) c = *eptr;
203    #define BACKCHAR(eptr)
204    
205    #else   /* SUPPORT_UTF8 */
206    
207    /* Get the next UTF-8 character, advancing the pointer. The expansion tests md->utf8, so a variable called md must be in scope wherever the macro is used; a multi-byte character is decoded only when that flag is set and the first byte has its top two bits set. */
208    
209    #define GETCHARINC(c, eptr) \
210      c = *eptr++; \
211      if (md->utf8 && (c & 0xc0) == 0xc0) \
212        { \
213        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
214        int s = 6*a; \
215        c = (c & utf8_table3[a]) << s; \
216        while (a-- > 0) \
217          { \
218          s -= 6; \
219          c |= (*eptr++ & 0x3f) << s; \
220          } \
221        }
222    
223    /* Get the next UTF-8 character, not advancing the pointer, setting length. len is set to the total number of bytes in the character (1 plus the number of additional bytes). The expansion tests md->utf8, so a variable called md must be in scope wherever the macro is used. */
224    
225    #define GETCHARLEN(c, eptr, len) \
226      c = *eptr; \
227      len = 1; \
228      if (md->utf8 && (c & 0xc0) == 0xc0) \
229        { \
230        int i; \
231        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
232        int s = 6*a; \
233        c = (c & utf8_table3[a]) << s; \
234        for (i = 1; i <= a; i++) \
235          { \
236          s -= 6; \
237          c |= (eptr[i] & 0x3f) << s; \
238          } \
239        len += a; \
240        }
241    
242    /* If the pointer is not at the start of a character, move it back until
243    it is. A byte whose top two bits are 10 is treated as not being the start of a character. */
244    
245    #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
246    
247    #endif
248    
249    
250    
251  /*************************************************  /*************************************************
252  *             Default character tables           *  *             Default character tables           *
# Line 191  tables. */ Line 262  tables. */
262    
263    
264    
265    #ifdef SUPPORT_UTF8
266    /*************************************************
267    *           Tables for UTF-8 support             *
268    *************************************************/
269    
270    /* These are the breakpoints for different numbers of bytes in a UTF-8
271    character: utf8_table1[i] is the largest value that ord2utf8() encodes in i+1 bytes. */
272    
273    static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
274    
275    /* These are the indicator bits and the mask for the data bits to set in the
276    first byte of a character, indexed by the number of additional bytes. */
277    
278    static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};  /* indicator bits, ORed into the first byte by ord2utf8() */
279    static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};  /* mask for the data bits of the first byte, used by GETCHARINC and GETCHARLEN */
280    
281    /* Table of the number of extra bytes, indexed by the first byte
282    masked with 0x3f. The highest number for a valid UTF-8 character is in fact
283    0x3d. */
284    
285    static uschar utf8_table4[] = {
286      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
287      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
288      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
289      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
290    
291    
292    /*************************************************
293    *       Convert character value to UTF-8         *
294    *************************************************/
295    
296    /* This function takes an integer value in the range 0 - 0x7fffffff
297    and encodes it as a UTF-8 character in 1 to 6 bytes.
298    
299    Arguments:
300      cvalue     the character value
301      buffer     pointer to buffer for result - at least 6 bytes long
302    
303    Returns:     number of bytes placed in the buffer
304    */
305    
306    static int
307    ord2utf8(int cvalue, uschar *buffer)
308    {
309    register int i, j;
310    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)   /* Find i = number of additional bytes */
311      if (cvalue <= utf8_table1[i]) break;
312    buffer += i;   /* Point at the last byte; the bytes are filled in backwards */
313    for (j = i; j > 0; j--)
314     {
315     *buffer-- = 0x80 | (cvalue & 0x3f);   /* 10xxxxxx byte carrying the low 6 bits */
316     cvalue >>= 6;
317     }
318    *buffer = utf8_table2[i] | cvalue;   /* First byte: indicator bits plus the remaining data bits */
319    return i + 1;
320    }
321    #endif
322    
323    
324    
325  /*************************************************  /*************************************************
326  *          Return version string                 *  *          Return version string                 *
327  *************************************************/  *************************************************/
# Line 349  while (length-- > 0) Line 480  while (length-- > 0)
480    
481  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
482  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
483  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
484  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
485  sequence.  the \. On exit, it is on the final character of the escape sequence.
486    
487  Arguments:  Arguments:
488    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 373  check_escape(const uschar **ptrptr, cons Line 504  check_escape(const uschar **ptrptr, cons
504  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
505  int c, i;  int c, i;
506    
507  c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  /* If backslash is at the end of the pattern, it's an error. */
508    
509    c = *(++ptr);
510  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
511    
512  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 433  else Line 566  else
566        }        }
567    
568      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
569      larger first octal digit */      larger first octal digit. */
570    
571      case '0':      case '0':
572      c -= '0';      c -= '0';
573      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
574        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
575          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
576        c &= 255;     /* Take least significant 8 bits */
577      break;      break;
578    
579      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
580        which can be greater than 0xff, but only if the ddd are hex digits. */
581    
582      case 'x':      case 'x':
583    #ifdef SUPPORT_UTF8
584        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
585          {
586          const uschar *pt = ptr + 2;
587          register int count = 0;
588          c = 0;
589          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
590            {
591            count++;
592            c = c * 16 + cd->lcc[*pt] -
593              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
594            pt++;
595            }
596          if (*pt == '}')
597            {
598            if (c < 0 || count > 8) *errorptr = ERR34;
599            ptr = pt;
600            break;
601            }
602          /* If the sequence of hex digits does not end with '}', then we don't
603          recognize this construct; fall through to the normal \x handling. */
604          }
605    #endif
606    
607        /* Read just a single hex char */
608    
609      c = 0;      c = 0;
610      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
611        {        {
# Line 454  else Line 615  else
615        }        }
616      break;      break;
617    
618        /* Other special escapes not starting with a digit are straightforward */
619    
620      case 'c':      case 'c':
621      c = *(++ptr);      c = *(++ptr);
622      if (c == 0)      if (c == 0)
# Line 591  if the length is fixed. This is needed f Line 754  if the length is fixed. This is needed f
754    
755  Arguments:  Arguments:
756    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
757      options  the compiling options
758    
759  Returns:   the fixed length, or -1 if there is no fixed length  Returns:   the fixed length, or -1 if there is no fixed length
760  */  */
761    
762  static int  static int
763  find_fixedlength(uschar *code)  find_fixedlength(uschar *code, int options)
764  {  {
765  int length = -1;  int length = -1;
766    
# Line 617  for (;;) Line 781  for (;;)
781      case OP_BRA:      case OP_BRA:
782      case OP_ONCE:      case OP_ONCE:
783      case OP_COND:      case OP_COND:
784      d = find_fixedlength(cc);      d = find_fixedlength(cc, options);
785      if (d < 0) return -1;      if (d < 0) return -1;
786      branchlength += d;      branchlength += d;
787      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
# Line 653  for (;;) Line 817  for (;;)
817      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
818    
819      case OP_REVERSE:      case OP_REVERSE:
820        case OP_BRANUMBER:
821        case OP_CREF:
822      cc++;      cc++;
823      /* Fall through */      /* Fall through */
824    
     case OP_CREF:  
825      case OP_OPT:      case OP_OPT:
826      cc++;      cc++;
827      /* Fall through */      /* Fall through */
# Line 671  for (;;) Line 836  for (;;)
836      cc++;      cc++;
837      break;      break;
838    
839      /* Handle char strings */      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
840        This requires a scan of the string, unfortunately. We assume valid UTF-8
841        strings, so all we do is reduce the length by one for each byte whose bits are
842        10xxxxxx. */
843    
844      case OP_CHARS:      case OP_CHARS:
845      branchlength += *(++cc);      branchlength += *(++cc);
846    #ifdef SUPPORT_UTF8
847        for (d = 1; d <= *cc; d++)
848          if ((cc[d] & 0xc0) == 0x80) branchlength--;
849    #endif
850      cc += *cc + 1;      cc += *cc + 1;
851      break;      break;
852    
# Line 703  for (;;) Line 875  for (;;)
875      /* Check a class for variable quantification */      /* Check a class for variable quantification */
876    
877      case OP_CLASS:      case OP_CLASS:
878      cc += (*cc == OP_REF)? 2 : 33;      cc += 33;
879    
880      switch (*cc)      switch (*cc)
881        {        {
# Line 810  return -1; Line 982  return -1;
982    
983  Arguments:  Arguments:
984    options      the option bits    options      the option bits
985    brackets     points to number of brackets used    brackets     points to number of extracting brackets used
986    code         points to the pointer to the current code point    code         points to the pointer to the current code point
987    ptrptr       points to the current pattern pointer    ptrptr       points to the current pattern pointer
988    errorptr     points to pointer to error message    errorptr     points to pointer to error message
# Line 861  for (;; ptr++) Line 1033  for (;; ptr++)
1033    int class_charcount;    int class_charcount;
1034    int class_lastchar;    int class_lastchar;
1035    int newoptions;    int newoptions;
1036    int condref;    int skipbytes;
1037    int subreqchar;    int subreqchar;
1038    
1039    c = *ptr;    c = *ptr;
# Line 872  for (;; ptr++) Line 1044  for (;; ptr++)
1044        {        {
1045        /* The space before the ; is to avoid a warning on a silly compiler        /* The space before the ; is to avoid a warning on a silly compiler
1046        on the Macintosh. */        on the Macintosh. */
1047        while ((c = *(++ptr)) != 0 && c != '\n') ;        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1048        continue;        continue;
1049        }        }
1050      }      }
# Line 1054  for (;; ptr++) Line 1226  for (;; ptr++)
1226              goto FAILED;              goto FAILED;
1227              }              }
1228            }            }
1229          /* Fall through if single character */  
1230            /* Fall through if single character, but don't at present allow
1231            chars > 255 in UTF-8 mode. */
1232    
1233    #ifdef SUPPORT_UTF8
1234            if (c > 255)
1235              {
1236              *errorptr = ERR33;
1237              goto FAILED;
1238              }
1239    #endif
1240          }          }
1241    
1242        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
# Line 1074  for (;; ptr++) Line 1256  for (;; ptr++)
1256            }            }
1257    
1258          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1259          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1260            in such circumstances. */
1261    
1262          if (d == '\\')          if (d == '\\')
1263            {            {
1264              const uschar *oldptr = ptr;
1265            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1266    
1267    #ifdef SUPPORT_UTF8
1268              if (d > 255)
1269                {
1270                *errorptr = ERR33;
1271                goto FAILED;
1272                }
1273    #endif
1274              /* \b is backspace; any other special means the '-' was literal */
1275    
1276            if (d < 0)            if (d < 0)
1277              {              {
1278              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1279                {                {
1280                *errorptr = ERR7;                ptr = oldptr - 2;
1281                goto FAILED;                goto SINGLE_CHARACTER;  /* A few lines below */
1282                }                }
1283              }              }
1284            }            }
# Line 1112  for (;; ptr++) Line 1306  for (;; ptr++)
1306        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1307        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1308    
1309          SINGLE_CHARACTER:
1310    
1311        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
1312        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
1313          {          {
# Line 1386  for (;; ptr++) Line 1582  for (;; ptr++)
1582        OP_BRAZERO in front of it, and because the group appears once in the        OP_BRAZERO in front of it, and because the group appears once in the
1583        data, whereas in other cases it appears the minimum number of times. For        data, whereas in other cases it appears the minimum number of times. For
1584        this reason, it is simplest to treat this case separately, as otherwise        this reason, it is simplest to treat this case separately, as otherwise
1585        the code gets far too mess. There are several special subcases when the        the code gets far too messy. There are several special subcases when the
1586        minimum is zero. */        minimum is zero. */
1587    
1588        if (repeat_min == 0)        if (repeat_min == 0)
# Line 1537  for (;; ptr++) Line 1733  for (;; ptr++)
1733    
1734      case '(':      case '(':
1735      newoptions = options;      newoptions = options;
1736      condref = -1;      skipbytes = 0;
1737    
1738      if (*(++ptr) == '?')      if (*(++ptr) == '?')
1739        {        {
# Line 1560  for (;; ptr++) Line 1756  for (;; ptr++)
1756          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
1757          if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)          if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1758            {            {
1759            condref = *ptr - '0';            int condref = *ptr - '0';
1760            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1761              if (condref == 0)
1762                {
1763                *errorptr = ERR35;
1764                goto FAILED;
1765                }
1766            ptr++;            ptr++;
1767              code[3] = OP_CREF;
1768              code[4] = condref >> 8;
1769              code[5] = condref & 255;
1770              skipbytes = 3;
1771            }            }
1772          else ptr--;          else ptr--;
1773          break;          break;
# Line 1665  for (;; ptr++) Line 1870  for (;; ptr++)
1870          }          }
1871        }        }
1872    
1873      /* Else we have a referencing group; adjust the opcode. */      /* Else we have a referencing group; adjust the opcode. If the bracket
1874        number is greater than EXTRACT_BASIC_MAX, we set the opcode to OP_BRA + EXTRACT_BASIC_MAX + 1, and
1875        arrange for the true number to follow it, in a 3-byte OP_BRANUMBER item placed at code[3]. */
1876    
1877      else      else
1878        {        {
1879        if (++(*brackets) > EXTRACT_MAX)        if (++(*brackets) > EXTRACT_BASIC_MAX)
1880          {          {
1881          *errorptr = ERR13;          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1882          goto FAILED;          code[3] = OP_BRANUMBER;
1883            code[4] = *brackets >> 8;
1884            code[5] = *brackets & 255;
1885            skipbytes = 3;
1886          }          }
1887        bravalue = OP_BRA + *brackets;        else bravalue = OP_BRA + *brackets;
1888        }        }
1889    
1890      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed re. Assertions may not be repeated, but other
# Line 1690  for (;; ptr++) Line 1900  for (;; ptr++)
1900           options | PCRE_INGROUP,       /* Set for all nested groups */           options | PCRE_INGROUP,       /* Set for all nested groups */
1901           ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?           ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1902             newoptions & PCRE_IMS : -1, /* Pass ims options if changed */             newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1903           brackets,                     /* Bracket level */           brackets,                     /* Extracting bracket count */
1904           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
1905           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
1906           errorptr,                     /* Where to put an error message */           errorptr,                     /* Where to put an error message */
1907           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1908            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1909           condref,                      /* Condition reference number */           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
1910           &subreqchar,                  /* For possible last char */           &subreqchar,                  /* For possible last char */
1911           &subcountlits,                /* For literal count */           &subcountlits,                /* For literal count */
1912           cd))                          /* Tables block */           cd))                          /* Tables block */
# Line 1710  for (;; ptr++) Line 1920  for (;; ptr++)
1920      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
1921      two branches in the group. */      two branches in the group. */
1922    
1923      if (bravalue == OP_COND)      else if (bravalue == OP_COND)
1924        {        {
1925        uschar *tc = code;        uschar *tc = code;
1926        condcount = 0;        condcount = 0;
# Line 1777  for (;; ptr++) Line 1987  for (;; ptr++)
1987        {        {
1988        if (-c >= ESC_REF)        if (-c >= ESC_REF)
1989          {          {
1990            int number = -c - ESC_REF;
1991          previous = code;          previous = code;
1992          *code++ = OP_REF;          *code++ = OP_REF;
1993          *code++ = -c - ESC_REF;          *code++ = number >> 8;
1994            *code++ = number & 255;
1995          }          }
1996        else        else
1997          {          {
# Line 1814  for (;; ptr++) Line 2026  for (;; ptr++)
2026            {            {
2027            /* The space before the ; is to avoid a warning on a silly compiler            /* The space before the ; is to avoid a warning on a silly compiler
2028            on the Macintosh. */            on the Macintosh. */
2029            while ((c = *(++ptr)) != 0 && c != '\n') ;            while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2030            if (c == 0) break;            if (c == 0) break;
2031            continue;            continue;
2032            }            }
# Line 1829  for (;; ptr++) Line 2041  for (;; ptr++)
2041          tempptr = ptr;          tempptr = ptr;
2042          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2043          if (c < 0) { ptr = tempptr; break; }          if (c < 0) { ptr = tempptr; break; }
2044    
2045            /* If a character is > 127 in UTF-8 mode, we have to turn it into
2046            two or more bytes in the UTF-8 encoding. */
2047    
2048    #ifdef SUPPORT_UTF8
2049            if (c > 127 && (options & PCRE_UTF8) != 0)
2050              {
2051              uschar buffer[8];
2052              int len = ord2utf8(c, buffer);
2053              for (c = 0; c < len; c++) *code++ = buffer[c];
2054              length += len;
2055              continue;
2056              }
2057    #endif
2058          }          }
2059    
2060        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1839  for (;; ptr++) Line 2065  for (;; ptr++)
2065    
2066      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2067    
2068      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2069    
2070      /* Update the last character and the count of literals */      /* Update the last character and the count of literals */
2071    
# Line 1851  for (;; ptr++) Line 2077  for (;; ptr++)
2077      the next state. */      the next state. */
2078    
2079      previous[1] = length;      previous[1] = length;
2080      if (length < 255) ptr--;      if (length < MAXLIT) ptr--;
2081      break;      break;
2082      }      }
2083    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1889  Argument: Line 2115  Argument:
2115    ptrptr      -> the address of the current pattern pointer    ptrptr      -> the address of the current pattern pointer
2116    errorptr    -> pointer to error message    errorptr    -> pointer to error message
2117    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
2118    condref     > 0 for OPT_CREF setting at start of conditional group    skipbytes   skip this many bytes at start (an OP_CREF or OP_BRANUMBER item already placed there)
2119    reqchar     -> place to put the last required character, or a negative number    reqchar     -> place to put the last required character, or a negative number
2120    countlits   -> place to put the shortest literal count of any branch    countlits   -> place to put the shortest literal count of any branch
2121    cd          points to the data block with tables pointers    cd          points to the data block with tables pointers
# Line 1899  Returns:      TRUE on success Line 2125  Returns:      TRUE on success
2125    
2126  static BOOL  static BOOL
2127  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2128    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2129    int *reqchar, int *countlits, compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
2130  {  {
2131  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
# Line 1912  int branchreqchar, branchcountlits; Line 2138  int branchreqchar, branchcountlits;
2138    
2139  *reqchar = -1;  *reqchar = -1;
2140  *countlits = INT_MAX;  *countlits = INT_MAX;
2141  code += 3;  code += 3 + skipbytes;
   
 /* At the start of a reference-based conditional group, insert the reference  
 number as an OP_CREF item. */  
   
 if (condref > 0)  
   {  
   *code++ = OP_CREF;  
   *code++ = condref;  
   }  
2142    
2143  /* Loop for each alternative branch */  /* Loop for each alternative branch */
2144    
# Line 1989  for (;;) Line 2206  for (;;)
2206    if (lookbehind)    if (lookbehind)
2207      {      {
2208      *code = OP_END;      *code = OP_END;
2209      length = find_fixedlength(last_branch);      length = find_fixedlength(last_branch, options);
2210      DPRINTF(("fixed length = %d\n", length));      DPRINTF(("fixed length = %d\n", length));
2211      if (length < 0)      if (length < 0)
2212        {        {
# Line 2073  for (;;) Line 2290  for (;;)
2290      break;      break;
2291    
2292      case OP_CREF:      case OP_CREF:
2293      code += 2;      case OP_BRANUMBER:
2294        code += 3;
2295      break;      break;
2296    
2297      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 2280  uschar bralenstack[BRASTACK_SIZE]; Line 2498  uschar bralenstack[BRASTACK_SIZE];
2498  uschar *code_base, *code_end;  uschar *code_base, *code_end;
2499  #endif  #endif
2500    
2501    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2502    
2503    #ifndef SUPPORT_UTF8
2504    if ((options & PCRE_UTF8) != 0)
2505      {
2506      *errorptr = ERR32;
2507      return NULL;
2508      }
2509    #endif
2510    
2511  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
2512  can do is just return NULL. */  can do is just return NULL. */
2513    
# Line 2326  while ((c = *(++ptr)) != 0) Line 2554  while ((c = *(++ptr)) != 0)
2554    {    {
2555    int min, max;    int min, max;
2556    int class_charcount;    int class_charcount;
2557      int bracket_length;
2558    
2559    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
2560      {      {
# Line 2334  while ((c = *(++ptr)) != 0) Line 2563  while ((c = *(++ptr)) != 0)
2563        {        {
2564        /* The space before the ; is to avoid a warning on a silly compiler        /* The space before the ; is to avoid a warning on a silly compiler
2565        on the Macintosh. */        on the Macintosh. */
2566        while ((c = *(++ptr)) != 0 && c != '\n') ;        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2567        continue;        continue;
2568        }        }
2569      }      }
# Line 2360  while ((c = *(++ptr)) != 0) Line 2589  while ((c = *(++ptr)) != 0)
2589        }        }
2590      length++;      length++;
2591    
2592      /* A back reference needs an additional char, plus either one or 5      /* A back reference needs an additional 2 bytes, plus either one or 5
2593      bytes for a repeat. We also need to keep the value of the highest      bytes for a repeat. We also need to keep the value of the highest
2594      back reference. */      back reference. */
2595    
# Line 2368  while ((c = *(++ptr)) != 0) Line 2597  while ((c = *(++ptr)) != 0)
2597        {        {
2598        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
2599        if (refnum > top_backref) top_backref = refnum;        if (refnum > top_backref) top_backref = refnum;
2600        length++;   /* For single back reference */        length += 2;   /* For single back reference */
2601        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2602          {          {
2603          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
# Line 2466  while ((c = *(++ptr)) != 0) Line 2695  while ((c = *(++ptr)) != 0)
2695    
2696      case '(':      case '(':
2697      branch_newextra = 0;      branch_newextra = 0;
2698        bracket_length = 3;
2699    
2700      /* Handle special forms of bracket, which all start (? */      /* Handle special forms of bracket, which all start (? */
2701    
# Line 2533  while ((c = *(++ptr)) != 0) Line 2763  while ((c = *(++ptr)) != 0)
2763          if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)          if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2764            {            {
2765            ptr += 4;            ptr += 4;
2766            length += 2;            length += 3;
2767            while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;            while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2768            if (*ptr != ')')            if (*ptr != ')')
2769              {              {
# Line 2660  while ((c = *(++ptr)) != 0) Line 2890  while ((c = *(++ptr)) != 0)
2890        }        }
2891    
2892      /* Extracting brackets must be counted so we can process escapes in a      /* Extracting brackets must be counted so we can process escapes in a
2893      Perlish way. */      Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2894        need an additional 3 bytes of store per extracting bracket. */
2895    
2896      else bracount++;      else
2897          {
2898          bracount++;
2899          if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2900          }
2901    
2902      /* Non-special forms of bracket. Save length for computing whole length      /* Save length for computing whole length at end if there's a repeat that
2903      at end if there's a repeat that requires duplication of the group. Also      requires duplication of the group. Also save the current value of
2904      save the current value of branch_extra, and start the new group with      branch_extra, and start the new group with the new value. If non-zero, this
2905      the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3      will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
     for a lookbehind assertion. */  
2906    
2907      if (brastackptr >= sizeof(brastack)/sizeof(int))      if (brastackptr >= sizeof(brastack)/sizeof(int))
2908        {        {
# Line 2680  while ((c = *(++ptr)) != 0) Line 2914  while ((c = *(++ptr)) != 0)
2914      branch_extra = branch_newextra;      branch_extra = branch_newextra;
2915    
2916      brastack[brastackptr++] = length;      brastack[brastackptr++] = length;
2917      length += 3;      length += bracket_length;
2918      continue;      continue;
2919    
2920      /* Handle ket. Look for subsequent max/min; for certain sets of values we      /* Handle ket. Look for subsequent max/min; for certain sets of values we
# Line 2760  while ((c = *(++ptr)) != 0) Line 2994  while ((c = *(++ptr)) != 0)
2994            {            {
2995            /* The space before the ; is to avoid a warning on a silly compiler            /* The space before the ; is to avoid a warning on a silly compiler
2996            on the Macintosh. */            on the Macintosh. */
2997            while ((c = *(++ptr)) != 0 && c != '\n') ;            while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2998            continue;            continue;
2999            }            }
3000          }          }
# Line 2775  while ((c = *(++ptr)) != 0) Line 3009  while ((c = *(++ptr)) != 0)
3009            &compile_block);            &compile_block);
3010          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3011          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
3012    
3013    #ifdef SUPPORT_UTF8
3014            if (c > 127 && (options & PCRE_UTF8) != 0)
3015              {
3016              int i;
3017              for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3018                if (c <= utf8_table1[i]) break;
3019              runlength += i;
3020              }
3021    #endif
3022          }          }
3023    
3024        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 2784  while ((c = *(++ptr)) != 0) Line 3028  while ((c = *(++ptr)) != 0)
3028    
3029      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
3030    
3031      while (runlength < 255 &&      while (runlength < MAXLIT &&
3032        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3033    
3034      ptr--;      ptr--;
# Line 2831  ptr = (const uschar *)pattern; Line 3075  ptr = (const uschar *)pattern;
3075  code = re->code;  code = re->code;
3076  *code = OP_BRA;  *code = OP_BRA;
3077  bracount = 0;  bracount = 0;
3078  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3079    &reqchar, &countlits, &compile_block);    &reqchar, &countlits, &compile_block);
3080  re->top_bracket = bracount;  re->top_bracket = bracount;
3081  re->top_backref = top_backref;  re->top_backref = top_backref;
# Line 2945  while (code < code_end) Line 3189  while (code < code_end)
3189    
3190    if (*code >= OP_BRA)    if (*code >= OP_BRA)
3191      {      {
3192      printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);      if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3193          printf("%3d Bra extra", (code[1] << 8) + code[2]);
3194        else
3195          printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3196      code += 2;      code += 2;
3197      }      }
3198    
# Line 2956  while (code < code_end) Line 3203  while (code < code_end)
3203      code++;      code++;
3204      break;      break;
3205    
     case OP_COND:  
     printf("%3d Cond", (code[1] << 8) + code[2]);  
     code += 2;  
     break;  
   
     case OP_CREF:  
     printf(" %.2d %s", code[1], OP_names[*code]);  
     code++;  
     break;  
   
3206      case OP_CHARS:      case OP_CHARS:
3207      charlength = *(++code);      charlength = *(++code);
3208      printf("%3d ", charlength);      printf("%3d ", charlength);
# Line 2982  while (code < code_end) Line 3219  while (code < code_end)
3219      case OP_ASSERTBACK:      case OP_ASSERTBACK:
3220      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
3221      case OP_ONCE:      case OP_ONCE:
     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);  
     code += 2;  
     break;  
   
3222      case OP_REVERSE:      case OP_REVERSE:
3223        case OP_BRANUMBER:
3224        case OP_COND:
3225        case OP_CREF:
3226      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3227      code += 2;      code += 2;
3228      break;      break;
# Line 3059  while (code < code_end) Line 3295  while (code < code_end)
3295      break;      break;
3296    
3297      case OP_REF:      case OP_REF:
3298      printf("    \\%d", *(++code));      printf("    \\%d", (code[1] << 8) | code[2]);
3299      code ++;      code += 3;
3300      goto CLASS_REF_REPEAT;      goto CLASS_REF_REPEAT;
3301    
3302      case OP_CLASS:      case OP_CLASS:
# Line 3273  for (;;) Line 3509  for (;;)
3509    
3510    if (op > OP_BRA)    if (op > OP_BRA)
3511      {      {
3512        int offset;
3513      int number = op - OP_BRA;      int number = op - OP_BRA;
3514      int offset = number << 1;  
3515        /* For extended extraction brackets (large number), we have to fish out the
3516        number from a dummy opcode at the start. */
3517    
3518        if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3519        offset = number << 1;
3520    
3521  #ifdef DEBUG  #ifdef DEBUG
3522      printf("start bracket %d subject=", number);      printf("start bracket %d subject=", number);
# Line 3304  for (;;) Line 3546  for (;;)
3546        md->offset_vector[offset] = save_offset1;        md->offset_vector[offset] = save_offset1;
3547        md->offset_vector[offset+1] = save_offset2;        md->offset_vector[offset+1] = save_offset2;
3548        md->offset_vector[md->offset_end - number] = save_offset3;        md->offset_vector[md->offset_end - number] = save_offset3;
3549    
3550        return FALSE;        return FALSE;
3551        }        }
3552    
# Line 3336  for (;;) Line 3579  for (;;)
3579      case OP_COND:      case OP_COND:
3580      if (ecode[3] == OP_CREF)         /* Condition is extraction test */      if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3581        {        {
3582        int offset = ecode[4] << 1;    /* Doubled reference number */        int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3583        return match(eptr,        return match(eptr,
3584          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585            5 : 3 + (ecode[1] << 8) + ecode[2]),            6 : 3 + (ecode[1] << 8) + ecode[2]),
3586          offset_top, md, ims, eptrb, match_isgroup);          offset_top, md, ims, eptrb, match_isgroup);
3587        }        }
3588    
# Line 3359  for (;;) Line 3602  for (;;)
3602        }        }
3603      /* Control never reaches here */      /* Control never reaches here */
3604    
3605      /* Skip over conditional reference data if encountered (should not be) */      /* Skip over conditional reference or large extraction number data if
3606        encountered. */
3607    
3608      case OP_CREF:      case OP_CREF:
3609      ecode += 2;      case OP_BRANUMBER:
3610        ecode += 3;
3611      break;      break;
3612    
3613      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
# Line 3429  for (;;) Line 3674  for (;;)
3674    
3675      /* Move the subject pointer back. This occurs only at the start of      /* Move the subject pointer back. This occurs only at the start of
3676      each branch of a lookbehind assertion. If we are too close to the start to      each branch of a lookbehind assertion. If we are too close to the start to
3677      move back, this match function fails. */      move back, this match function fails. When working with UTF-8 we move
3678        back a number of characters, not bytes. */
3679    
3680      case OP_REVERSE:      case OP_REVERSE:
3681    #ifdef SUPPORT_UTF8
3682        c = (ecode[1] << 8) + ecode[2];
3683        for (i = 0; i < c; i++)
3684          {
3685          eptr--;
3686          BACKCHAR(eptr)
3687          }
3688    #else
3689      eptr -= (ecode[1] << 8) + ecode[2];      eptr -= (ecode[1] << 8) + ecode[2];
3690    #endif
3691    
3692      if (eptr < md->start_subject) return FALSE;      if (eptr < md->start_subject) return FALSE;
3693      ecode += 3;      ecode += 3;
3694      break;      break;
# Line 3617  for (;;) Line 3873  for (;;)
3873    
3874        if (*prev != OP_COND)        if (*prev != OP_COND)
3875          {          {
3876            int offset;
3877          int number = *prev - OP_BRA;          int number = *prev - OP_BRA;
3878          int offset = number << 1;  
3879            /* For extended extraction brackets (large number), we have to fish out
3880            the number from a dummy opcode at the start. */
3881    
3882            if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3883            offset = number << 1;
3884    
3885  #ifdef DEBUG  #ifdef DEBUG
3886          printf("end bracket %d", number);          printf("end bracket %d", number);
# Line 3678  for (;;) Line 3940  for (;;)
3940      if (md->notbol && eptr == md->start_subject) return FALSE;      if (md->notbol && eptr == md->start_subject) return FALSE;
3941      if ((ims & PCRE_MULTILINE) != 0)      if ((ims & PCRE_MULTILINE) != 0)
3942        {        {
3943        if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;        if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3944        ecode++;        ecode++;
3945        break;        break;
3946        }        }
# Line 3697  for (;;) Line 3959  for (;;)
3959      case OP_DOLL:      case OP_DOLL:
3960      if ((ims & PCRE_MULTILINE) != 0)      if ((ims & PCRE_MULTILINE) != 0)
3961        {        {
3962        if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }        if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3963          else { if (md->noteol) return FALSE; }          else { if (md->noteol) return FALSE; }
3964        ecode++;        ecode++;
3965        break;        break;
# Line 3708  for (;;) Line 3970  for (;;)
3970        if (!md->endonly)        if (!md->endonly)
3971          {          {
3972          if (eptr < md->end_subject - 1 ||          if (eptr < md->end_subject - 1 ||
3973             (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;             (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3974    
3975          ecode++;          ecode++;
3976          break;          break;
# Line 3727  for (;;) Line 3989  for (;;)
3989    
3990      case OP_EODN:      case OP_EODN:
3991      if (eptr < md->end_subject - 1 ||      if (eptr < md->end_subject - 1 ||
3992         (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;         (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3993      ecode++;      ecode++;
3994      break;      break;
3995    
# Line 3749  for (;;) Line 4011  for (;;)
4011      /* Match a single character type; inline for speed */      /* Match a single character type; inline for speed */
4012    
4013      case OP_ANY:      case OP_ANY:
4014      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4015        return FALSE;        return FALSE;
4016      if (eptr++ >= md->end_subject) return FALSE;      if (eptr++ >= md->end_subject) return FALSE;
4017    #ifdef SUPPORT_UTF8
4018        if (md->utf8)
4019          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4020    #endif
4021      ecode++;      ecode++;
4022      break;      break;
4023    
# Line 3808  for (;;) Line 4074  for (;;)
4074      case OP_REF:      case OP_REF:
4075        {        {
4076        int length;        int length;
4077        int offset = ecode[1] << 1;                /* Doubled reference number */        int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4078        ecode += 2;                                /* Advance past the item */        ecode += 3;                                     /* Advance past item */
4079    
4080        /* If the reference is unset, set the length to be longer than the amount        /* If the reference is unset, set the length to be longer than the amount
4081        of subject left; this ensures that every attempt at a match fails. We        of subject left; this ensures that every attempt at a match fails. We
# Line 3953  for (;;) Line 4219  for (;;)
4219        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
4220          {          {
4221          if (eptr >= md->end_subject) return FALSE;          if (eptr >= md->end_subject) return FALSE;
4222          c = *eptr++;          GETCHARINC(c, eptr)         /* Get character; increment eptr */
4223    
4224    #ifdef SUPPORT_UTF8
4225            /* We do not yet support class members > 255 */
4226            if (c > 255) return FALSE;
4227    #endif
4228    
4229          if ((data[c/8] & (1 << (c&7))) != 0) continue;          if ((data[c/8] & (1 << (c&7))) != 0) continue;
4230          return FALSE;          return FALSE;
4231          }          }
# Line 3973  for (;;) Line 4245  for (;;)
4245            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4246              return TRUE;              return TRUE;
4247            if (i >= max || eptr >= md->end_subject) return FALSE;            if (i >= max || eptr >= md->end_subject) return FALSE;
4248            c = *eptr++;            GETCHARINC(c, eptr)       /* Get character; increment eptr */
4249    
4250    #ifdef SUPPORT_UTF8
4251              /* We do not yet support class members > 255 */
4252              if (c > 255) return FALSE;
4253    #endif
4254            if ((data[c/8] & (1 << (c&7))) != 0) continue;            if ((data[c/8] & (1 << (c&7))) != 0) continue;
4255            return FALSE;            return FALSE;
4256            }            }
# Line 3985  for (;;) Line 4262  for (;;)
4262        else        else
4263          {          {
4264          const uschar *pp = eptr;          const uschar *pp = eptr;
4265          for (i = min; i < max; eptr++, i++)          int len = 1;
4266            for (i = min; i < max; i++)
4267            {            {
4268            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
4269            c = *eptr;            GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
4270            if ((data[c/8] & (1 << (c&7))) != 0) continue;  
4271            break;  #ifdef SUPPORT_UTF8
4272              /* We do not yet support class members > 255 */
4273              if (c > 255) break;
4274    #endif
4275              if ((data[c/8] & (1 << (c&7))) == 0) break;
4276              eptr += len;
4277            }            }
4278    
4279          while (eptr >= pp)          while (eptr >= pp)
4280              {
4281            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4282              return TRUE;              return TRUE;
4283    
4284    #ifdef SUPPORT_UTF8
4285              BACKCHAR(eptr)
4286    #endif
4287              }
4288          return FALSE;          return FALSE;
4289          }          }
4290        }        }
# Line 4315  for (;;) Line 4604  for (;;)
4604    
4605      /* First, ensure the minimum number of matches are present. Use inline      /* First, ensure the minimum number of matches are present. Use inline
4606      code for maximizing the speed, and do the type test once at the start      code for maximizing the speed, and do the type test once at the start
4607      (i.e. keep it out of the loop). Also test that there are at least the      (i.e. keep it out of the loop). Also we can test that there are at least
4608      minimum number of characters before we start. */      the minimum number of bytes before we start, except when doing '.' in
4609        UTF8 mode. Leave the test in in all cases; in the special case we have
4610        to test after each character. */
4611    
4612      if (min > md->end_subject - eptr) return FALSE;      if (min > md->end_subject - eptr) return FALSE;
4613      if (min > 0) switch(ctype)      if (min > 0) switch(ctype)
4614        {        {
4615        case OP_ANY:        case OP_ANY:
4616    #ifdef SUPPORT_UTF8
4617          if (md->utf8)
4618            {
4619            for (i = 1; i <= min; i++)
4620              {
4621              if (eptr >= md->end_subject ||
4622                 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
4623                return FALSE;
4624              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4625              }
4626            break;
4627            }
4628    #endif
4629          /* Non-UTF8 can be faster */
4630        if ((ims & PCRE_DOTALL) == 0)        if ((ims & PCRE_DOTALL) == 0)
4631          { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }          { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
4632        else eptr += min;        else eptr += min;
4633        break;        break;
4634    
# Line 4378  for (;;) Line 4683  for (;;)
4683          switch(ctype)          switch(ctype)
4684            {            {
4685            case OP_ANY:            case OP_ANY:
4686            if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;            if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
4687    #ifdef SUPPORT_UTF8
4688              if (md->utf8)
4689                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4690    #endif
4691            break;            break;
4692    
4693            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
# Line 4418  for (;;) Line 4727  for (;;)
4727        switch(ctype)        switch(ctype)
4728          {          {
4729          case OP_ANY:          case OP_ANY:
4730    
4731            /* Special code is required for UTF8, but when the maximum is unlimited
4732            we don't need it. */
4733    
4734    #ifdef SUPPORT_UTF8
4735            if (md->utf8 && max < INT_MAX)
4736              {
4737              if ((ims & PCRE_DOTALL) == 0)
4738                {
4739                for (i = min; i < max; i++)
4740                  {
4741                  if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
4742                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4743                  }
4744                }
4745              else
4746                {
4747                for (i = min; i < max; i++)
4748                  {
4749                  eptr++;
4750                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4751                  }
4752                }
4753              break;
4754              }
4755    #endif
4756            /* Non-UTF8 can be faster */
4757          if ((ims & PCRE_DOTALL) == 0)          if ((ims & PCRE_DOTALL) == 0)
4758            {            {
4759            for (i = min; i < max; i++)            for (i = min; i < max; i++)
4760              {              {
4761              if (eptr >= md->end_subject || *eptr == '\n') break;              if (eptr >= md->end_subject || *eptr == NEWLINE) break;
4762              eptr++;              eptr++;
4763              }              }
4764            }            }
# Line 4490  for (;;) Line 4826  for (;;)
4826          }          }
4827    
4828        while (eptr >= pp)        while (eptr >= pp)
4829            {
4830          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4831            return TRUE;            return TRUE;
4832    #ifdef SUPPORT_UTF8
4833            if (md->utf8)
4834              while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4835    #endif
4836            }
4837        return FALSE;        return FALSE;
4838        }        }
4839      /* Control never gets here */      /* Control never gets here */
# Line 4557  const uschar *req_char_ptr = start_match Line 4899  const uschar *req_char_ptr = start_match
4899  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4900  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4901  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
4902  BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;  BOOL anchored;
4903  BOOL startline = (re->options & PCRE_STARTLINE) != 0;  BOOL startline;
4904    
4905  if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;  if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4906    
# Line 4566  if (re == NULL || subject == NULL || Line 4908  if (re == NULL || subject == NULL ||
4908     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4909  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4910    
4911    anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4912    startline = (re->options & PCRE_STARTLINE) != 0;
4913    
4914  match_block.start_pattern = re->code;  match_block.start_pattern = re->code;
4915  match_block.start_subject = (const uschar *)subject;  match_block.start_subject = (const uschar *)subject;
4916  match_block.end_subject = match_block.start_subject + length;  match_block.end_subject = match_block.start_subject + length;
4917  end_subject = match_block.end_subject;  end_subject = match_block.end_subject;
4918    
4919  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4920    match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4921    
4922  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4923  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
# Line 4693  do Line 5039  do
5039      {      {
5040      if (start_match > match_block.start_subject + start_offset)      if (start_match > match_block.start_subject + start_offset)
5041        {        {
5042        while (start_match < end_subject && start_match[-1] != '\n')        while (start_match < end_subject && start_match[-1] != NEWLINE)
5043          start_match++;          start_match++;
5044        }        }
5045      }      }
# Line 4798  do Line 5144  do
5144    
5145    rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;    rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5146    
5147    if (match_block.offset_end < 2) rc = 0; else    if (offsetcount < 2) rc = 0; else
5148      {      {
5149      offsets[0] = start_match - match_block.start_subject;      offsets[0] = start_match - match_block.start_subject;
5150      offsets[1] = match_block.end_match_ptr - match_block.start_subject;      offsets[1] = match_block.end_match_ptr - match_block.start_subject;

Legend:
Removed from v.47  
changed lines
  Added in v.59

  ViewVC Help
Powered by ViewVC 1.1.5