/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 23 by nigel, Sat Feb 24 21:38:41 2007 UTC revision 53 by nigel, Sat Feb 24 21:39:42 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1998 University of Cambridge             Copyright (c) 1997-2001 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 56  the external pcre header. */ Line 60  the external pcre header. */
60  #endif  #endif
61    
62    
63  /* Number of items on the nested bracket stacks at compile time. This should  /* Maximum number of items on the nested bracket stacks at compile time. This
64  not be set greater than 200. */  applies to the nesting of all kinds of parentheses. It does not limit
65    un-nested, non-capturing parentheses. This number can be made bigger if
66    necessary - it is used to dimension one int and one unsigned char vector at
67    compile time. */
68    
69  #define BRASTACK_SIZE 200  #define BRASTACK_SIZE 200
70    
71    
72    /* The number of bytes in a literal character string above which we can't add
73    any more is different when UTF-8 characters may be encountered. */
74    
75    #ifdef SUPPORT_UTF8
76    #define MAXLIT 250
77    #else
78    #define MAXLIT 255
79    #endif
80    
81    
82  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
83    
84  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
# Line 78  static const char *OP_names[] = { Line 95  static const char *OP_names[] = {
95    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
96    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
97    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
98    "class", "Ref",    "class", "Ref", "Recurse",
99    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
100    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
101    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Branumber", "Bra"
102  };  };
103  #endif  #endif
104    
# Line 97  static const short int escapes[] = { Line 114  static const short int escapes[] = {
114      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
115      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
116      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
117    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,  ESC_E,  ESC_F,      0,   /* ` - g */
118      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */      0,      0,      0,      0,      0,      0,  ESC_N,      0,   /* h - o */
119      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */      0,      0,  ESC_R, -ESC_s,  ESC_T,      0,      0, -ESC_w,   /* p - w */
120      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
121  };  };
122    
123    /* Tables of names of POSIX character classes and their lengths. The list is
124    terminated by a zero length entry. The first three must be alpha, lower, upper,
125    as this is assumed for handling case independence. */
126    
127    static const char *posix_names[] = {
128      "alpha", "lower", "upper",
129      "alnum", "ascii", "cntrl", "digit", "graph",
130      "print", "punct", "space", "word",  "xdigit" };
131    
132    static const uschar posix_name_lengths[] = {
133      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
134    
135    /* Table of class bit maps for each POSIX class; up to three may be combined
136    to form the class. */
137    
138    static const int posix_class_maps[] = {
139      cbit_lower, cbit_upper, -1,             /* alpha */
140      cbit_lower, -1,         -1,             /* lower */
141      cbit_upper, -1,         -1,             /* upper */
142      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
143      cbit_print, cbit_cntrl, -1,             /* ascii */
144      cbit_cntrl, -1,         -1,             /* cntrl */
145      cbit_digit, -1,         -1,             /* digit */
146      cbit_graph, -1,         -1,             /* graph */
147      cbit_print, -1,         -1,             /* print */
148      cbit_punct, -1,         -1,             /* punct */
149      cbit_space, -1,         -1,             /* space */
150      cbit_word,  -1,         -1,             /* word */
151      cbit_xdigit,-1,         -1              /* xdigit */
152    };
153    
154    
155  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
156    
157  static BOOL  static BOOL
158    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
159      BOOL, int);      BOOL, int, int *, int *, compile_data *);
160    
161    /* Structure for building a chain of data that actually lives on the
162    stack, for holding the values of the subject pointer at the start of each
163    subpattern, so as to detect when an empty string has been matched by a
164    subpattern - to break infinite loops. */
165    
166    typedef struct eptrblock {
167      struct eptrblock *prev;
168      const uschar *saved_eptr;
169    } eptrblock;
170    
171  /* Structure for passing "static" information around between the functions  /* Flag bits for the match() function */
 doing the matching, so that they are thread-safe. */  
172    
173  typedef struct match_data {  #define match_condassert   0x01    /* Called to check a condition assertion */
174    int    errorcode;             /* As it says */  #define match_isgroup      0x02    /* Set if start of bracketed group */
   int   *offset_vector;         /* Offset vector */  
   int    offset_end;            /* One past the end */  
   int    offset_max;            /* The maximum usable for return data */  
   BOOL   offset_overflow;       /* Set if too many extractions */  
   BOOL   notbol;                /* NOTBOL flag */  
   BOOL   noteol;                /* NOTEOL flag */  
   BOOL   endonly;               /* Dollar not before final \n */  
   const uschar *start_subject;  /* Start of the subject string */  
   const uschar *end_subject;    /* End of the subject string */  
   const uschar *end_match_ptr;  /* Subject position at end match */  
   int     end_offset_top;       /* Highwater mark at end of match */  
 } match_data;  
175    
176    
177    
# Line 143  void  (*pcre_free)(void *) = free; Line 189  void  (*pcre_free)(void *) = free;
189    
190    
191    
192    /*************************************************
193    *    Macros and tables for character handling    *
194    *************************************************/
195    
196    /* When UTF-8 encoding is being used, a character is no longer just a single
197    byte. The macros for character handling generate simple sequences when used in
198    byte-mode, and more complicated ones for UTF-8 characters. */
199    
200    #ifndef SUPPORT_UTF8
201    #define GETCHARINC(c, eptr) c = *eptr++;
202    #define GETCHARLEN(c, eptr, len) c = *eptr;
203    #define BACKCHAR(eptr)
204    
205    #else   /* SUPPORT_UTF8 */
206    
207    /* Get the next UTF-8 character, advancing the pointer */
208    
209    #define GETCHARINC(c, eptr) \
210      c = *eptr++; \
211      if (md->utf8 && (c & 0xc0) == 0xc0) \
212        { \
213        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
214        int s = 6 - a;                  /* Amount to shift next byte */  \
215        c &= utf8_table3[a];            /* Low order bits from first byte */ \
216        while (a-- > 0) \
217          { \
218          c |= (*eptr++ & 0x3f) << s; \
219          s += 6; \
220          } \
221        }
222    
223    /* Get the next UTF-8 character, not advancing the pointer, setting length */
224    
225    #define GETCHARLEN(c, eptr, len) \
226      c = *eptr; \
227      len = 1; \
228      if (md->utf8 && (c & 0xc0) == 0xc0) \
229        { \
230        int i; \
231        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
232        int s = 6 - a;                  /* Amount to shift next byte */  \
233        c &= utf8_table3[a];            /* Low order bits from first byte */ \
234        for (i = 1; i <= a; i++) \
235          { \
236          c |= (eptr[i] & 0x3f) << s; \
237          s += 6; \
238          } \
239        len += a; \
240        }
241    
242    /* If the pointer is not at the start of a character, move it back until
243    it is. */
244    
245    #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
246    
247    #endif
248    
249    
250    
251    /*************************************************
252    *             Default character tables           *
253    *************************************************/
254    
255    /* A default set of character tables is included in the PCRE binary. Its source
256    is built by the maketables auxiliary program, which uses the default C ctypes
257    functions, and put in the file chartables.c. These tables are used by PCRE
258    whenever the caller of pcre_compile() does not provide an alternate set of
259    tables. */
260    
261    #include "chartables.c"
262    
263    
264    
265    #ifdef SUPPORT_UTF8
266    /*************************************************
267    *           Tables for UTF-8 support             *
268    *************************************************/
269    
270    /* These are the breakpoints for different numbers of bytes in a UTF-8
271    character. */
272    
273    static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
274    
275    /* These are the indicator bits and the mask for the data bits to set in the
276    first byte of a character, indexed by the number of additional bytes. */
277    
278    static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
279    static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
280    
281    /* Table of the number of extra bytes, indexed by the first byte of the
282    character masked with 0x3f. The highest index for a valid UTF-8 character is
283    in fact 0x3d. */
284    
285    static uschar utf8_table4[] = {
286      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
287      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
288      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
289      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
290    
291    
292    /*************************************************
293    *       Convert character value to UTF-8         *
294    *************************************************/
295    
296    /* This function takes an integer value in the range 0 - 0x7fffffff
297    and encodes it as a UTF-8 character in 1 to 6 bytes.
298    
299    Arguments:
300      cvalue     the character value
301      buffer     pointer to buffer for result - at least 6 bytes long
302    
303    Returns:     number of bytes placed in the buffer
304    */
305    
306    static int
307    ord2utf8(int cvalue, uschar *buffer)
308    {
309    register int i, j;
310    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
311      if (cvalue <= utf8_table1[i]) break;
312    *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
313    cvalue >>= 6 - i;
314    for (j = 0; j < i; j++)
315      {
316      *buffer++ = 0x80 | (cvalue & 0x3f);
317      cvalue >>= 6;
318      }
319    return i + 1;
320    }
321    #endif
322    
323    
324    
325  /*************************************************  /*************************************************
326  *          Return version string                 *  *          Return version string                 *
327  *************************************************/  *************************************************/
328    
329    #define STRING(a)  # a
330    #define XSTRING(s) STRING(s)
331    
332  const char *  const char *
333  pcre_version(void)  pcre_version(void)
334  {  {
335  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
336  }  }
337    
338    
339    
340    
341  /*************************************************  /*************************************************
342  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
343  *************************************************/  *************************************************/
344    
345  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
346  structure.  of the private structure, but its interface was too rigid. It remains for
347    backwards compatibility. The public options are passed back in an int - though
348    the re->options field has been expanded to a long int, all the public options
349    are at the low end of it, and so even on 16-bit systems this will still be OK.
350    Therefore, I haven't changed the API for pcre_info().
351    
352  Arguments:  Arguments:
353    external_re   points to compiled code    external_re   points to compiled code
# Line 171  Arguments: Line 356  Arguments:
356                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
357                  or -2 otherwise                  or -2 otherwise
358    
359  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
360                  or negative values on error                  or negative values on error
361  */  */
362    
# Line 181  pcre_info(const pcre *external_re, int * Line 366  pcre_info(const pcre *external_re, int *
366  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
367  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
368  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
369  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
370  if (first_char != NULL)  if (first_char != NULL)
371    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
372       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 190  return re->top_bracket; Line 375  return re->top_bracket;
375    
376    
377    
378    /*************************************************
379    *        Return info about compiled pattern      *
380    *************************************************/
381    
382    /* This is a newer "info" function which has an extensible interface so
383    that additional items can be added compatibly.
384    
385    Arguments:
386      external_re      points to compiled code
387      external_study   points to study data, or NULL
388      what             what information is required
389      where            where to put the information
390    
391    Returns:           0 if data returned, negative on error
392    */
393    
394    int
395    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
396      void *where)
397    {
398    const real_pcre *re = (const real_pcre *)external_re;
399    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
400    
401    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
402    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
403    
404    switch (what)
405      {
406      case PCRE_INFO_OPTIONS:
407      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
408      break;
409    
410      case PCRE_INFO_SIZE:
411      *((size_t *)where) = re->size;
412      break;
413    
414      case PCRE_INFO_CAPTURECOUNT:
415      *((int *)where) = re->top_bracket;
416      break;
417    
418      case PCRE_INFO_BACKREFMAX:
419      *((int *)where) = re->top_backref;
420      break;
421    
422      case PCRE_INFO_FIRSTCHAR:
423      *((int *)where) =
424        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
425        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
426      break;
427    
428      case PCRE_INFO_FIRSTTABLE:
429      *((const uschar **)where) =
430        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
431          study->start_bits : NULL;
432      break;
433    
434      case PCRE_INFO_LASTLITERAL:
435      *((int *)where) =
436        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
437      break;
438    
439      default: return PCRE_ERROR_BADOPTION;
440      }
441    
442    return 0;
443    }
444    
445    
446    
447  #ifdef DEBUG  #ifdef DEBUG
448  /*************************************************  /*************************************************
# Line 227  while (length-- > 0) Line 480  while (length-- > 0)
480    
481  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
482  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
483  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
484  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
485  sequence.  the \. On exit, it is on the final character of the escape sequence.
486    
487  Arguments:  Arguments:
488    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 237  Arguments: Line 490  Arguments:
490    bracount   number of previous extracting brackets    bracount   number of previous extracting brackets
491    options    the options bits    options    the options bits
492    isclass    TRUE if inside a character class    isclass    TRUE if inside a character class
493      cd         pointer to char tables block
494    
495  Returns:     zero or positive => a data character  Returns:     zero or positive => a data character
496               negative => a special escape sequence               negative => a special escape sequence
# Line 245  Returns:     zero or positive => a data Line 499  Returns:     zero or positive => a data
499    
500  static int  static int
501  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
502    int options, BOOL isclass)    int options, BOOL isclass, compile_data *cd)
503  {  {
504  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
505  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
506    
507    /* If backslash is at the end of the pattern, it's an error. */
508    
509    c = *(++ptr);
510  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
511    
512  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 288  else Line 544  else
544        {        {
545        oldptr = ptr;        oldptr = ptr;
546        c -= '0';        c -= '0';
547        while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)        while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
548          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
549        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
550          {          {
# Line 310  else Line 566  else
566        }        }
567    
568      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
569      larger first octal digit */      larger first octal digit. */
570    
571      case '0':      case '0':
572      c -= '0';      c -= '0';
573      while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
574        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
575          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
576        c &= 255;     /* Take least significant 8 bits */
577      break;      break;
578    
579      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
580        which can be greater than 0xff, but only if the ddd are hex digits. */
581    
582      case 'x':      case 'x':
583    #ifdef SUPPORT_UTF8
584        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
585          {
586          const uschar *pt = ptr + 2;
587          register int count = 0;
588          c = 0;
589          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
590            {
591            count++;
592            c = c * 16 + cd->lcc[*pt] -
593              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
594            pt++;
595            }
596          if (*pt == '}')
597            {
598            if (c < 0 || count > 8) *errorptr = ERR34;
599            ptr = pt;
600            break;
601            }
602          /* If the sequence of hex digits does not end with '}', then we don't
603          recognize this construct; fall through to the normal \x handling. */
604          }
605    #endif
606    
607        /* Read just a single hex char */
608    
609      c = 0;      c = 0;
610      while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
611        {        {
612        ptr++;        ptr++;
613        c = c * 16 + pcre_lcc[*ptr] -        c = c * 16 + cd->lcc[*ptr] -
614          (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');          (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
615        }        }
616      break;      break;
617    
618        /* Other special escapes not starting with a digit are straightforward */
619    
620      case 'c':      case 'c':
621      c = *(++ptr);      c = *(++ptr);
622      if (c == 0)      if (c == 0)
# Line 341  else Line 627  else
627    
628      /* A letter is upper-cased; then the 0x40 bit is flipped */      /* A letter is upper-cased; then the 0x40 bit is flipped */
629    
630      if (c >= 'a' && c <= 'z') c = pcre_fcc[c];      if (c >= 'a' && c <= 'z') c = cd->fcc[c];
631      c ^= 0x40;      c ^= 0x40;
632      break;      break;
633    
634      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
635      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
636      for Perl compatibility, it is a literal. */      for Perl compatibility, it is a literal. This code looks a bit odd, but
637        there used to be some cases other than the default, and there may be again
638        in future, so I haven't "optimized" it. */
639    
640      default:      default:
641      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 377  where the ddds are digits. Line 665  where the ddds are digits.
665    
666  Arguments:  Arguments:
667    p         pointer to the first char after '{'    p         pointer to the first char after '{'
668      cd        pointer to char tables block
669    
670  Returns:    TRUE or FALSE  Returns:    TRUE or FALSE
671  */  */
672    
673  static BOOL  static BOOL
674  is_counted_repeat(const uschar *p)  is_counted_repeat(const uschar *p, compile_data *cd)
675  {  {
676  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
677  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
678  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
679    
680  if (*p++ != ',') return FALSE;  if (*p++ != ',') return FALSE;
681  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
682    
683  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
684  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
685  return (*p == '}');  return (*p == '}');
686  }  }
687    
# Line 412  Arguments: Line 701  Arguments:
701    maxp       pointer to int for max    maxp       pointer to int for max
702               returned as -1 if no max               returned as -1 if no max
703    errorptr   points to pointer to error message    errorptr   points to pointer to error message
704      cd         pointer to character tables block
705    
706  Returns:     pointer to '}' on success;  Returns:     pointer to '}' on success;
707               current ptr on error, with errorptr set               current ptr on error, with errorptr set
708  */  */
709    
710  static const uschar *  static const uschar *
711  read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)  read_repeat_counts(const uschar *p, int *minp, int *maxp,
712      const char **errorptr, compile_data *cd)
713  {  {
714  int min = 0;  int min = 0;
715  int max = -1;  int max = -1;
716    
717  while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
718    
719  if (*p == '}') max = min; else  if (*p == '}') max = min; else
720    {    {
721    if (*(++p) != '}')    if (*(++p) != '}')
722      {      {
723      max = 0;      max = 0;
724      while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
725      if (max < min)      if (max < min)
726        {        {
727        *errorptr = ERR4;        *errorptr = ERR4;
# Line 463  if the length is fixed. This is needed f Line 754  if the length is fixed. This is needed f
754    
755  Arguments:  Arguments:
756    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
757      options  the compiling options
758    
759  Returns:   the fixed length, or -1 if there is no fixed length  Returns:   the fixed length, or -1 if there is no fixed length
760  */  */
761    
762  static int  static int
763  find_fixedlength(uschar *code)  find_fixedlength(uschar *code, int options)
764  {  {
765  int length = -1;  int length = -1;
766    
# Line 489  for (;;) Line 781  for (;;)
781      case OP_BRA:      case OP_BRA:
782      case OP_ONCE:      case OP_ONCE:
783      case OP_COND:      case OP_COND:
784      d = find_fixedlength(cc);      d = find_fixedlength(cc, options);
785      if (d < 0) return -1;      if (d < 0) return -1;
786      branchlength += d;      branchlength += d;
787      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
# Line 525  for (;;) Line 817  for (;;)
817      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
818    
819      case OP_REVERSE:      case OP_REVERSE:
820        case OP_BRANUMBER:
821        case OP_CREF:
822      cc++;      cc++;
823        /* Fall through */
824    
     case OP_CREF:  
825      case OP_OPT:      case OP_OPT:
826      cc++;      cc++;
827      /* Fall through */      /* Fall through */
# Line 542  for (;;) Line 836  for (;;)
836      cc++;      cc++;
837      break;      break;
838    
839      /* Handle char strings */      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
840        This requires a scan of the string, unfortunately. We assume valid UTF-8
841      strings, so all we do is reduce the length by one for each byte whose
842      bits are 10xxxxxx. */
843    
844      case OP_CHARS:      case OP_CHARS:
845      branchlength += *(++cc);      branchlength += *(++cc);
846    #ifdef SUPPORT_UTF8
847        for (d = 1; d <= *cc; d++)
848          if ((cc[d] & 0xc0) == 0x80) branchlength--;
849    #endif
850      cc += *cc + 1;      cc += *cc + 1;
851      break;      break;
852    
# Line 574  for (;;) Line 875  for (;;)
875      /* Check a class for variable quantification */      /* Check a class for variable quantification */
876    
877      case OP_CLASS:      case OP_CLASS:
878      cc += (*cc == OP_REF)? 2 : 33;      cc += 33;
879    
880      switch (*cc)      switch (*cc)
881        {        {
# Line 609  for (;;) Line 910  for (;;)
910    
911    
912  /*************************************************  /*************************************************
913    *           Check for POSIX class syntax         *
914    *************************************************/
915    
916    /* This function is called when the sequence "[:" or "[." or "[=" is
917    encountered in a character class. It checks whether this is followed by an
918    optional ^ and then a sequence of letters, terminated by a matching ":]" or
919    ".]" or "=]".
920    
921    Arguments:
922      ptr      pointer to the initial [
923      endptr   where to return the end pointer
924      cd       pointer to compile data
925    
926    Returns:   TRUE or FALSE
927    */
928    
929    static BOOL
930    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
931    {
932    int terminator;          /* Don't combine these lines; the Solaris cc */
933    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
934    if (*(++ptr) == '^') ptr++;   /* Step past an optional '^' */
935    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;   /* Skip the run of letters */
936    if (*ptr == terminator && ptr[1] == ']')   /* Same char that followed '[', then ']' */
937      {
938      *endptr = ptr;   /* Points at the terminator character, not at the ']' */
939      return TRUE;
940      }
941    return FALSE;   /* *endptr is left unset when the syntax does not match */
942    }
943    
944    
945    
946    
947    /*************************************************
948    *          Check POSIX class name                *
949    *************************************************/
950    
951    /* This function is called to check the name given in a POSIX-style class entry
952    such as [:alnum:].
953    
954    Arguments:
955      ptr        points to the first letter
956      len        the length of the name
957    
958    Returns:     a value representing the name, or -1 if unknown
959    */
960    
961    static int
962    check_posix_name(const uschar *ptr, int len)
963    {
964    register int yield = 0;   /* Index into posix_name_lengths/posix_names */
965    while (posix_name_lengths[yield] != 0)   /* A zero length ends the scan */
966      {
967      if (len == posix_name_lengths[yield] &&   /* Length must match exactly... */
968        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;   /* ...and so must the first len bytes */
969      yield++;
970      }
971    return -1;   /* No entry matched */
972    }
973    
974    
975    
976    
977    /*************************************************
978  *           Compile one branch                   *  *           Compile one branch                   *
979  *************************************************/  *************************************************/
980    
981  /* Scan the pattern, compiling it into the code vector.  /* Scan the pattern, compiling it into the code vector.
982    
983  Arguments:  Arguments:
984    options     the option bits    options      the option bits
985    brackets    points to number of brackets used    brackets     points to number of extracting brackets used
986    code        points to the pointer to the current code point    code         points to the pointer to the current code point
987    ptrptr      points to the current pattern pointer    ptrptr       points to the current pattern pointer
988    errorptr    points to pointer to error message    errorptr     points to pointer to error message
989    optchanged  set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
990      reqchar      set to the last literal character required, else -1
991      countlits    set to count of mandatory literal characters
992      cd           contains pointers to tables
993    
994  Returns:      TRUE on success  Returns:       TRUE on success
995                FALSE, with *errorptr set on error                 FALSE, with *errorptr set on error
996  */  */
997    
998  static BOOL  static BOOL
999  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
1000    const uschar **ptrptr, const char **errorptr, int *optchanged)    const uschar **ptrptr, const char **errorptr, int *optchanged,
1001      int *reqchar, int *countlits, compile_data *cd)
1002  {  {
1003  int repeat_type, op_type;  int repeat_type, op_type;
1004  int repeat_min, repeat_max;  int repeat_min, repeat_max;
1005  int bravalue, length;  int bravalue, length;
1006  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
1007    int prevreqchar;
1008    int condcount = 0;
1009    int subcountlits = 0;
1010  register int c;  register int c;
1011  register uschar *code = *codeptr;  register uschar *code = *codeptr;
1012  uschar *tempcode;  uschar *tempcode;
# Line 647  uschar class[32]; Line 1020  uschar class[32];
1020  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
1021  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
1022    
1023    /* Initialize no required char, and count of literals */
1024    
1025    *reqchar = prevreqchar = -1;
1026    *countlits = 0;
1027    
1028  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
1029    
1030  for (;; ptr++)  for (;; ptr++)
# Line 655  for (;; ptr++) Line 1033  for (;; ptr++)
1033    int class_charcount;    int class_charcount;
1034    int class_lastchar;    int class_lastchar;
1035    int newoptions;    int newoptions;
1036    int condref;    int skipbytes;
1037      int subreqchar;
1038    
1039    c = *ptr;    c = *ptr;
1040    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
1041      {      {
1042      if ((pcre_ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
1043      if (c == '#')      if (c == '#')
1044        {        {
1045        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
1046          on the Macintosh. */
1047          while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1048        continue;        continue;
1049        }        }
1050      }      }
# Line 738  for (;; ptr++) Line 1119  for (;; ptr++)
1119          goto FAILED;          goto FAILED;
1120          }          }
1121    
1122          /* Handle POSIX class names. Perl allows a negation extension of the
1123          form [:^name]. A square bracket that doesn't match the syntax is
1124          treated as a literal. We also recognize the POSIX constructions
1125          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1126          5.6 does. */
1127    
1128          if (c == '[' &&
1129              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1130              check_posix_syntax(ptr, &tempptr, cd))
1131            {
1132            BOOL local_negate = FALSE;
1133            int posix_class, i;
1134            register const uschar *cbits = cd->cbits;
1135    
1136            if (ptr[1] != ':')
1137              {
1138              *errorptr = ERR31;
1139              goto FAILED;
1140              }
1141    
1142            ptr += 2;
1143            if (*ptr == '^')
1144              {
1145              local_negate = TRUE;
1146              ptr++;
1147              }
1148    
1149            posix_class = check_posix_name(ptr, tempptr - ptr);
1150            if (posix_class < 0)
1151              {
1152              *errorptr = ERR30;
1153              goto FAILED;
1154              }
1155    
1156            /* If matching is caseless, upper and lower are converted to
1157            alpha. This relies on the fact that the class table starts with
1158            alpha, lower, upper as the first 3 entries. */
1159    
1160            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1161              posix_class = 0;
1162    
1163            /* Or into the map we are building up to 3 of the static class
1164            tables, or their negations. */
1165    
1166            posix_class *= 3;
1167            for (i = 0; i < 3; i++)
1168              {
1169              int taboffset = posix_class_maps[posix_class + i];
1170              if (taboffset < 0) break;
1171              if (local_negate)
1172                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1173              else
1174                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1175              }
1176    
1177            ptr = tempptr + 1;
1178            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1179            continue;
1180            }
1181    
1182        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
1183        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
1184        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 748  for (;; ptr++) Line 1189  for (;; ptr++)
1189    
1190        if (c == '\\')        if (c == '\\')
1191          {          {
1192          c = check_escape(&ptr, errorptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1193          if (-c == ESC_b) c = '\b';          if (-c == ESC_b) c = '\b';
1194          else if (c < 0)          else if (c < 0)
1195            {            {
1196              register const uschar *cbits = cd->cbits;
1197            class_charcount = 10;            class_charcount = 10;
1198            switch (-c)            switch (-c)
1199              {              {
1200              case ESC_d:              case ESC_d:
1201              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1202              continue;              continue;
1203    
1204              case ESC_D:              case ESC_D:
1205              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1206              continue;              continue;
1207    
1208              case ESC_w:              case ESC_w:
1209              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);  
1210              continue;              continue;
1211    
1212              case ESC_W:              case ESC_W:
1213              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);  
1214              continue;              continue;
1215    
1216              case ESC_s:              case ESC_s:
1217              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1218              continue;              continue;
1219    
1220              case ESC_S:              case ESC_S:
1221              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1222              continue;              continue;
1223    
1224              default:              default:
# Line 786  for (;; ptr++) Line 1226  for (;; ptr++)
1226              goto FAILED;              goto FAILED;
1227              }              }
1228            }            }
1229          /* Fall through if single character */  
1230            /* Fall through if single character, but don't at present allow
1231            chars > 255 in UTF-8 mode. */
1232    
1233    #ifdef SUPPORT_UTF8
1234            if (c > 255)
1235              {
1236              *errorptr = ERR33;
1237              goto FAILED;
1238              }
1239    #endif
1240          }          }
1241    
1242        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
# Line 806  for (;; ptr++) Line 1256  for (;; ptr++)
1256            }            }
1257    
1258          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1259          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1260            in such circumstances. */
1261    
1262          if (d == '\\')          if (d == '\\')
1263            {            {
1264            d = check_escape(&ptr, errorptr, *brackets, options, TRUE);            const uschar *oldptr = ptr;
1265              d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1266    
1267    #ifdef SUPPORT_UTF8
1268              if (d > 255)
1269                {
1270                *errorptr = ERR33;
1271                goto FAILED;
1272                }
1273    #endif
1274              /* \b is backspace; any other special means the '-' was literal */
1275    
1276            if (d < 0)            if (d < 0)
1277              {              {
1278              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1279                {                {
1280                *errorptr = ERR7;                ptr = oldptr - 2;
1281                goto FAILED;                goto SINGLE_CHARACTER;  /* A few lines below */
1282                }                }
1283              }              }
1284            }            }
# Line 832  for (;; ptr++) Line 1294  for (;; ptr++)
1294            class[c/8] |= (1 << (c&7));            class[c/8] |= (1 << (c&7));
1295            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
1296              {              {
1297              int uc = pcre_fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
1298              class[uc/8] |= (1 << (uc&7));              class[uc/8] |= (1 << (uc&7));
1299              }              }
1300            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
# Line 844  for (;; ptr++) Line 1306  for (;; ptr++)
1306        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1307        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1308    
1309          SINGLE_CHARACTER:
1310    
1311        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
1312        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
1313          {          {
1314          c = pcre_fcc[c];   /* flip case */          c = cd->fcc[c];   /* flip case */
1315          class[c/8] |= (1 << (c&7));          class[c/8] |= (1 << (c&7));
1316          }          }
1317        class_charcount++;        class_charcount++;
# Line 894  for (;; ptr++) Line 1358  for (;; ptr++)
1358      /* Various kinds of repeat */      /* Various kinds of repeat */
1359    
1360      case '{':      case '{':
1361      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1362      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1363      if (*errorptr != NULL) goto FAILED;      if (*errorptr != NULL) goto FAILED;
1364      goto REPEAT;      goto REPEAT;
1365    
# Line 928  for (;; ptr++) Line 1392  for (;; ptr++)
1392        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
1393      else repeat_type = greedy_default;      else repeat_type = greedy_default;
1394    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
1395      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1396      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1397      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1398        out any reqchar setting, backing up to the previous value. We must also
1399        adjust the countlits value. */
1400    
1401      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1402        {        {
1403        int len = previous[1];        int len = previous[1];
1404    
1405          if (repeat_min == 0) *reqchar = prevreqchar;
1406          *countlits += repeat_min - 1;
1407    
1408        if (len == 1)        if (len == 1)
1409          {          {
1410          c = previous[2];          c = previous[2];
# Line 978  for (;; ptr++) Line 1443  for (;; ptr++)
1443        code = previous;        code = previous;
1444    
1445        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1446        repeat_type += op_type;      /* Combine both values for many cases */  
1447          /* If the maximum is zero then the minimum must also be zero; Perl allows
1448          this case, so we do too - by simply omitting the item altogether. */
1449    
1450          if (repeat_max == 0) goto END_REPEAT;
1451    
1452          /* Combine the op_type with the repeat_type */
1453    
1454          repeat_type += op_type;
1455    
1456        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1457        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1055  for (;; ptr++) Line 1528  for (;; ptr++)
1528        }        }
1529    
1530      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1531      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1532    
1533      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1534        {        {
1535          if (repeat_max == 0)
1536            {
1537            code = previous;
1538            goto END_REPEAT;
1539            }
1540        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1541          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1542        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1082  for (;; ptr++) Line 1560  for (;; ptr++)
1560      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1561               (int)*previous == OP_COND)               (int)*previous == OP_COND)
1562        {        {
1563        int i, ketoffset = 0;        register int i;
1564          int ketoffset = 0;
1565        int len = code - previous;        int len = code - previous;
1566          uschar *bralink = NULL;
1567    
1568        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
1569        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
# Line 1098  for (;; ptr++) Line 1578  for (;; ptr++)
1578          ketoffset = code - ket;          ketoffset = code - ket;
1579          }          }
1580    
1581        /* If the minimum is greater than zero, and the maximum is unlimited or        /* The case of a zero minimum is special because of the need to stick
1582        equal to the minimum, the first copy remains where it is, and is        OP_BRAZERO in front of it, and because the group appears once in the
1583        replicated up to the minimum number of times. This case includes the +        data, whereas in other cases it appears the minimum number of times. For
1584        repeat, but of course no replication is needed in that case. */        this reason, it is simplest to treat this case separately, as otherwise
1585          the code gets far too messy. There are several special subcases when the
1586          minimum is zero. */
1587    
1588        if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))        if (repeat_min == 0)
1589          {          {
1590          for (i = 1; i < repeat_min; i++)          /* If we set up a required char from the bracket, we must back off
1591            to the previous value and reset the countlits value too. */
1592    
1593            if (subcountlits > 0)
1594            {            {
1595            memcpy(code, previous, len);            *reqchar = prevreqchar;
1596            code += len;            *countlits -= subcountlits;
1597            }            }
         }  
1598    
1599        /* If the minimum is zero, stick BRAZERO in front of the first copy.          /* If the maximum is also zero, we just omit the group from the output
1600        Then, if there is a fixed upper limit, replicated up to that many times,          altogether. */
       sticking BRAZERO in front of all the optional ones. */  
1601    
1602        else          if (repeat_max == 0)
1603          {            {
1604          if (repeat_min == 0)            code = previous;
1605              goto END_REPEAT;
1606              }
1607    
1608            /* If the maximum is 1 or unlimited, we just have to stick in the
1609            BRAZERO and do no more at this point. */
1610    
1611            if (repeat_max <= 1)
1612            {            {
1613            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
1614            code++;            code++;
1615            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
1616            }            }
1617    
1618            /* If the maximum is greater than 1 and limited, we have to replicate
1619            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1620            The first one has to be handled carefully because it's the original
1621            copy, which has to be moved up. The remainder can be handled by code
1622            that is common with the non-zero minimum case below. We just have to
1623            adjust the value of repeat_max, since one less copy is required. */
1624    
1625            else
1626              {
1627              int offset;
1628              memmove(previous+4, previous, len);
1629              code += 4;
1630              *previous++ = OP_BRAZERO + repeat_type;
1631              *previous++ = OP_BRA;
1632    
1633              /* We chain together the bracket offset fields that have to be
1634              filled in later when the ends of the brackets are reached. */
1635    
1636              offset = (bralink == NULL)? 0 : previous - bralink;
1637              bralink = previous;
1638              *previous++ = offset >> 8;
1639              *previous++ = offset & 255;
1640              }
1641    
1642            repeat_max--;
1643            }
1644    
1645          /* If the minimum is greater than zero, replicate the group as many
1646          times as necessary, and adjust the maximum to the number of subsequent
1647          copies that we need. */
1648    
1649          else
1650            {
1651          for (i = 1; i < repeat_min; i++)          for (i = 1; i < repeat_min; i++)
1652            {            {
1653            memcpy(code, previous, len);            memcpy(code, previous, len);
1654            code += len;            code += len;
1655            }            }
1656            if (repeat_max > 0) repeat_max -= repeat_min;
1657            }
1658    
1659          for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)        /* This code is common to both the zero and non-zero minimum cases. If
1660          the maximum is limited, it replicates the group in a nested fashion,
1661          remembering the bracket starts on a stack. In the case of a zero minimum,
1662          the first one was set up above. In all cases the repeat_max now specifies
1663          the number of additional copies needed. */
1664    
1665          if (repeat_max >= 0)
1666            {
1667            for (i = repeat_max - 1; i >= 0; i--)
1668            {            {
1669            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
1670    
1671              /* All but the final copy start a new nesting, maintaining the
1672              chain of brackets outstanding. */
1673    
1674              if (i != 0)
1675                {
1676                int offset;
1677                *code++ = OP_BRA;
1678                offset = (bralink == NULL)? 0 : code - bralink;
1679                bralink = code;
1680                *code++ = offset >> 8;
1681                *code++ = offset & 255;
1682                }
1683    
1684            memcpy(code, previous, len);            memcpy(code, previous, len);
1685            code += len;            code += len;
1686            }            }
1687    
1688            /* Now chain through the pending brackets, and fill in their length
1689            fields (which are holding the chain links pro tem). */
1690    
1691            while (bralink != NULL)
1692              {
1693              int oldlinkoffset;
1694              int offset = code - bralink + 1;
1695              uschar *bra = code - offset;
1696              oldlinkoffset = (bra[1] << 8) + bra[2];
1697              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1698              *code++ = OP_KET;
1699              *code++ = bra[1] = offset >> 8;
1700              *code++ = bra[2] = (offset & 255);
1701              }
1702          }          }
1703    
1704        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
# Line 1144  for (;; ptr++) Line 1706  for (;; ptr++)
1706        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
1707        correct offset was computed above. */        correct offset was computed above. */
1708    
1709        if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
1710        }        }
1711    
1712      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1157  for (;; ptr++) Line 1719  for (;; ptr++)
1719    
1720      /* In all case we no longer have a previous item. */      /* In all case we no longer have a previous item. */
1721    
1722        END_REPEAT:
1723      previous = NULL;      previous = NULL;
1724      break;      break;
1725    
# Line 1170  for (;; ptr++) Line 1733  for (;; ptr++)
1733    
1734      case '(':      case '(':
1735      newoptions = options;      newoptions = options;
1736      condref = -1;      skipbytes = 0;
1737    
1738      if (*(++ptr) == '?')      if (*(++ptr) == '?')
1739        {        {
# Line 1191  for (;; ptr++) Line 1754  for (;; ptr++)
1754    
1755          case '(':          case '(':
1756          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
1757          if ((pcre_ctypes[*(++ptr)] & ctype_digit) != 0)          if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1758            {            {
1759            condref = *ptr - '0';            int condref = *ptr - '0';
1760            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1761              if (condref == 0)
1762                {
1763                *errorptr = ERR35;
1764                goto FAILED;
1765                }
1766            ptr++;            ptr++;
1767              code[3] = OP_CREF;
1768              code[4] = condref >> 8;
1769              code[5] = condref & 255;
1770              skipbytes = 3;
1771            }            }
1772          else ptr--;          else ptr--;
1773          break;          break;
# Line 1234  for (;; ptr++) Line 1806  for (;; ptr++)
1806          ptr++;          ptr++;
1807          break;          break;
1808    
1809            case 'R':                 /* Pattern recursion */
1810            *code++ = OP_RECURSE;
1811            ptr++;
1812            continue;
1813    
1814          default:                  /* Option setting */          default:                  /* Option setting */
1815          set = unset = 0;          set = unset = 0;
1816          optset = &set;          optset = &set;
# Line 1293  for (;; ptr++) Line 1870  for (;; ptr++)
1870          }          }
1871        }        }
1872    
1873      /* Else we have a referencing group; adjust the opcode. */      /* Else we have a referencing group; adjust the opcode. If the bracket
1874        number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1875        arrange for the true number to follow later, in an OP_BRANUMBER item. */
1876    
1877      else      else
1878        {        {
1879        if (++(*brackets) > EXTRACT_MAX)        if (++(*brackets) > EXTRACT_BASIC_MAX)
1880          {          {
1881          *errorptr = ERR13;          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1882          goto FAILED;          code[3] = OP_BRANUMBER;
1883            code[4] = *brackets >> 8;
1884            code[5] = *brackets & 255;
1885            skipbytes = 3;
1886          }          }
1887        bravalue = OP_BRA + *brackets;        else bravalue = OP_BRA + *brackets;
1888        }        }
1889    
1890      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed re. Assertions may not be repeated, but other
# Line 1318  for (;; ptr++) Line 1900  for (;; ptr++)
1900           options | PCRE_INGROUP,       /* Set for all nested groups */           options | PCRE_INGROUP,       /* Set for all nested groups */
1901           ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?           ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1902             newoptions & PCRE_IMS : -1, /* Pass ims options if changed */             newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1903           brackets,                     /* Bracket level */           brackets,                     /* Extracting bracket count */
1904           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
1905           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
1906           errorptr,                     /* Where to put an error message */           errorptr,                     /* Where to put an error message */
1907           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1908            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1909           condref))                     /* Condition reference number */           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
1910             &subreqchar,                  /* For possible last char */
1911             &subcountlits,                /* For literal count */
1912             cd))                          /* Tables block */
1913        goto FAILED;        goto FAILED;
1914    
1915      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 1335  for (;; ptr++) Line 1920  for (;; ptr++)
1920      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
1921      two branches in the group. */      two branches in the group. */
1922    
1923      if (bravalue == OP_COND)      else if (bravalue == OP_COND)
1924        {        {
       int branchcount = 0;  
1925        uschar *tc = code;        uschar *tc = code;
1926          condcount = 0;
1927    
1928        do {        do {
1929           branchcount++;           condcount++;
1930           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1931           }           }
1932        while (*tc != OP_KET);        while (*tc != OP_KET);
1933    
1934        if (branchcount > 2)        if (condcount > 2)
1935          {          {
1936          *errorptr = ERR27;          *errorptr = ERR27;
1937          goto FAILED;          goto FAILED;
1938          }          }
1939        }        }
1940    
1941        /* Handle updating of the required character. If the subpattern didn't
1942        set one, leave it as it was. Otherwise, update it for normal brackets of
1943        all kinds, forward assertions, and conditions with two branches. Don't
1944        update the literal count for forward assertions, however. If the bracket
1945        is followed by a quantifier with zero repeat, we have to back off. Hence
1946        the definition of prevreqchar and subcountlits outside the main loop so
1947        that they can be accessed for the back off. */
1948    
1949        if (subreqchar > 0 &&
1950             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1951             (bravalue == OP_COND && condcount == 2)))
1952          {
1953          prevreqchar = *reqchar;
1954          *reqchar = subreqchar;
1955          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1956          }
1957    
1958      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1959    
1960      code = tempcode;      code = tempcode;
# Line 1372  for (;; ptr++) Line 1974  for (;; ptr++)
1974    
1975      case '\\':      case '\\':
1976      tempptr = ptr;      tempptr = ptr;
1977      c = check_escape(&ptr, errorptr, *brackets, options, FALSE);      c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1978    
1979      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1980      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
# Line 1385  for (;; ptr++) Line 1987  for (;; ptr++)
1987        {        {
1988        if (-c >= ESC_REF)        if (-c >= ESC_REF)
1989          {          {
1990            int number = -c - ESC_REF;
1991          previous = code;          previous = code;
1992          *code++ = OP_REF;          *code++ = OP_REF;
1993          *code++ = -c - ESC_REF;          *code++ = number >> 8;
1994            *code++ = number & 255;
1995          }          }
1996        else        else
1997          {          {
# Line 1417  for (;; ptr++) Line 2021  for (;; ptr++)
2021        {        {
2022        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
2023          {          {
2024          if ((pcre_ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
2025          if (c == '#')          if (c == '#')
2026            {            {
2027            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2028              on the Macintosh. */
2029              while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2030            if (c == 0) break;            if (c == 0) break;
2031            continue;            continue;
2032            }            }
# Line 1433  for (;; ptr++) Line 2039  for (;; ptr++)
2039        if (c == '\\')        if (c == '\\')
2040          {          {
2041          tempptr = ptr;          tempptr = ptr;
2042          c = check_escape(&ptr, errorptr, *brackets, options, FALSE);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2043          if (c < 0) { ptr = tempptr; break; }          if (c < 0) { ptr = tempptr; break; }
2044    
2045            /* If a character is > 127 in UTF-8 mode, we have to turn it into
2046            two or more characters in the UTF-8 encoding. */
2047    
2048    #ifdef SUPPORT_UTF8
2049            if (c > 127 && (options & PCRE_UTF8) != 0)
2050              {
2051              uschar buffer[8];
2052              int len = ord2utf8(c, buffer);
2053              for (c = 0; c < len; c++) *code++ = buffer[c];
2054              length += len;
2055              continue;
2056              }
2057    #endif
2058          }          }
2059    
2060        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1445  for (;; ptr++) Line 2065  for (;; ptr++)
2065    
2066      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2067    
2068      while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2069    
2070        /* Update the last character and the count of literals */
2071    
2072        prevreqchar = (length > 1)? code[-2] : *reqchar;
2073        *reqchar = code[-1];
2074        *countlits += length;
2075    
2076      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
2077      the next state. */      the next state. */
2078    
2079      previous[1] = length;      previous[1] = length;
2080      if (length < 255) ptr--;      if (length < MAXLIT) ptr--;
2081      break;      break;
2082      }      }
2083    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1489  Argument: Line 2115  Argument:
2115    ptrptr      -> the address of the current pattern pointer    ptrptr      -> the address of the current pattern pointer
2116    errorptr    -> pointer to error message    errorptr    -> pointer to error message
2117    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
2118    condref     > 0 for OP_CREF setting at start of conditional group    skipbytes   skip this many bytes at start (for OP_COND, OP_BRANUMBER)
2119      reqchar     -> place to put the last required character, or a negative number
2120      countlits   -> place to put the shortest literal count of any branch
2121      cd          points to the data block with tables pointers
2122    
2123  Returns:      TRUE on success  Returns:      TRUE on success
2124  */  */
2125    
2126  static BOOL  static BOOL
2127  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2128    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref)    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2129      int *reqchar, int *countlits, compile_data *cd)
2130  {  {
2131  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
2132  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1504  uschar *last_branch = code; Line 2134  uschar *last_branch = code;
2134  uschar *start_bracket = code;  uschar *start_bracket = code;
2135  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
2136  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
2137    int branchreqchar, branchcountlits;
2138    
2139  code += 3;  *reqchar = -1;
2140    *countlits = INT_MAX;
2141  /* At the start of a reference-based conditional group, insert the reference  code += 3 + skipbytes;
 number as an OP_CREF item. */  
   
 if (condref > 0)  
   {  
   *code++ = OP_CREF;  
   *code++ = condref;  
   }  
2142    
2143  /* Loop for each alternative branch */  /* Loop for each alternative branch */
2144    
# Line 1543  for (;;) Line 2167  for (;;)
2167    
2168    /* Now compile the branch */    /* Now compile the branch */
2169    
2170    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2171          &branchreqchar, &branchcountlits, cd))
2172      {      {
2173      *ptrptr = ptr;      *ptrptr = ptr;
2174      return FALSE;      return FALSE;
# Line 1555  for (;;) Line 2180  for (;;)
2180    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
2181    last_branch[2] = length & 255;    last_branch[2] = length & 255;
2182    
2183      /* Save the last required character if all branches have the same; a current
2184      value of -1 means unset, while -2 means "previous branch had no last required
2185      char".  */
2186    
2187      if (*reqchar != -2)
2188        {
2189        if (branchreqchar >= 0)
2190          {
2191          if (*reqchar == -1) *reqchar = branchreqchar;
2192          else if (*reqchar != branchreqchar) *reqchar = -2;
2193          }
2194        else *reqchar = -2;
2195        }
2196    
2197      /* Keep the shortest literal count */
2198    
2199      if (branchcountlits < *countlits) *countlits = branchcountlits;
2200      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2201    
2202    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
2203    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
2204    the branch with OP_END. */    the branch with OP_END. */
# Line 1562  for (;;) Line 2206  for (;;)
2206    if (lookbehind)    if (lookbehind)
2207      {      {
2208      *code = OP_END;      *code = OP_END;
2209      length = find_fixedlength(last_branch);      length = find_fixedlength(last_branch, options);
2210      DPRINTF(("fixed length = %d\n", length));      DPRINTF(("fixed length = %d\n", length));
2211      if (length < 0)      if (length < 0)
2212        {        {
# Line 1646  for (;;) Line 2290  for (;;)
2290      break;      break;
2291    
2292      case OP_CREF:      case OP_CREF:
2293      code += 2;      case OP_BRANUMBER:
2294        code += 3;
2295        break;
2296    
2297        case OP_WORD_BOUNDARY:
2298        case OP_NOT_WORD_BOUNDARY:
2299        code++;
2300      break;      break;
2301    
2302      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
# Line 1676  all of whose alternatives start with OP_ Line 2326  all of whose alternatives start with OP_
2326  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
2327  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
2328    
2329  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2330  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
2331  trying them again.  so there is no point trying them again.
2332    
2333  Arguments:  Arguments:
2334    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1696  do { Line 2346  do {
2346     register int op = *scode;     register int op = *scode;
2347     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2348       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
2349     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2350                (*options & PCRE_DOTALL) != 0)
2351       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
2352     else if (op != OP_SOD &&     else if (op != OP_SOD &&
2353             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1710  return TRUE; Line 2361  return TRUE;
2361    
2362    
2363  /*************************************************  /*************************************************
2364  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
2365  *************************************************/  *************************************************/
2366    
2367  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
2368  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
2369    matching and for non-DOTALL patterns that start with .* (which must start at
2370    the beginning or after \n).
2371    
2372  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
2373  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1728  do { Line 2381  do {
2381     register int op = *scode;     register int op = *scode;
2382     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2383       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
2384       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2385         { if (scode[1] != OP_ANY) return FALSE; }
2386     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
2387     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2388     }     }
# Line 1813  Arguments: Line 2468  Arguments:
2468    options      various option bits    options      various option bits
2469    errorptr     pointer to pointer to error text    errorptr     pointer to pointer to error text
2470    erroroffset  ptr offset in pattern where error was detected    erroroffset  ptr offset in pattern where error was detected
2471      tables       pointer to character tables or NULL
2472    
2473  Returns:       pointer to compiled data block, or NULL on error,  Returns:       pointer to compiled data block, or NULL on error,
2474                 with errorptr and erroroffset set                 with errorptr and erroroffset set
# Line 1820  Returns:       pointer to compiled data Line 2476  Returns:       pointer to compiled data
2476    
2477  pcre *  pcre *
2478  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
2479    int *erroroffset)    int *erroroffset, const unsigned char *tables)
2480  {  {
2481  real_pcre *re;  real_pcre *re;
2482  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2483  int runlength;  int runlength;
2484  int c, size;  int c, reqchar, countlits;
2485  int bracount = 0;  int bracount = 0;
2486  int top_backref = 0;  int top_backref = 0;
2487  int branch_extra = 0;  int branch_extra = 0;
2488  int branch_newextra;  int branch_newextra;
2489  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2490    size_t size;
2491  uschar *code;  uschar *code;
2492  const uschar *ptr;  const uschar *ptr;
2493    compile_data compile_block;
2494  int brastack[BRASTACK_SIZE];  int brastack[BRASTACK_SIZE];
2495  uschar bralenstack[BRASTACK_SIZE];  uschar bralenstack[BRASTACK_SIZE];
2496    
# Line 1840  uschar bralenstack[BRASTACK_SIZE]; Line 2498  uschar bralenstack[BRASTACK_SIZE];
2498  uschar *code_base, *code_end;  uschar *code_base, *code_end;
2499  #endif  #endif
2500    
2501    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2502    
2503    #ifndef SUPPORT_UTF8
2504    if ((options & PCRE_UTF8) != 0)
2505      {
2506      *errorptr = ERR32;
2507      return NULL;
2508      }
2509    #endif
2510    
2511  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
2512  can do is just return NULL. */  can do is just return NULL. */
2513    
# Line 1861  if ((options & ~PUBLIC_OPTIONS) != 0) Line 2529  if ((options & ~PUBLIC_OPTIONS) != 0)
2529    return NULL;    return NULL;
2530    }    }
2531    
2532    /* Set up pointers to the individual character tables */
2533    
2534    if (tables == NULL) tables = pcre_default_tables;
2535    compile_block.lcc = tables + lcc_offset;
2536    compile_block.fcc = tables + fcc_offset;
2537    compile_block.cbits = tables + cbits_offset;
2538    compile_block.ctypes = tables + ctypes_offset;
2539    
2540    /* Reflect pattern for debugging output */
2541    
2542  DPRINTF(("------------------------------------------------------------------\n"));  DPRINTF(("------------------------------------------------------------------\n"));
2543  DPRINTF(("%s\n", pattern));  DPRINTF(("%s\n", pattern));
2544    
# Line 1876  while ((c = *(++ptr)) != 0) Line 2554  while ((c = *(++ptr)) != 0)
2554    {    {
2555    int min, max;    int min, max;
2556    int class_charcount;    int class_charcount;
2557      int bracket_length;
2558    
2559    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
2560      {      {
2561      if ((pcre_ctypes[c] & ctype_space) != 0) continue;      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2562      if (c == '#')      if (c == '#')
2563        {        {
2564        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
2565          on the Macintosh. */
2566          while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2567        continue;        continue;
2568        }        }
2569      }      }
# Line 1897  while ((c = *(++ptr)) != 0) Line 2578  while ((c = *(++ptr)) != 0)
2578      case '\\':      case '\\':
2579        {        {
2580        const uschar *save_ptr = ptr;        const uschar *save_ptr = ptr;
2581        c = check_escape(&ptr, errorptr, bracount, options, FALSE);        c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2582        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2583        if (c >= 0)        if (c >= 0)
2584          {          {
# Line 1908  while ((c = *(++ptr)) != 0) Line 2589  while ((c = *(++ptr)) != 0)
2589        }        }
2590      length++;      length++;
2591    
2592      /* A back reference needs an additional char, plus either one or 5      /* A back reference needs an additional 2 bytes, plus either one or 5
2593      bytes for a repeat. We also need to keep the value of the highest      bytes for a repeat. We also need to keep the value of the highest
2594      back reference. */      back reference. */
2595    
# Line 1916  while ((c = *(++ptr)) != 0) Line 2597  while ((c = *(++ptr)) != 0)
2597        {        {
2598        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
2599        if (refnum > top_backref) top_backref = refnum;        if (refnum > top_backref) top_backref = refnum;
2600        length++;   /* For single back reference */        length += 2;   /* For single back reference */
2601        if (ptr[1] == '{' && is_counted_repeat(ptr+2))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2602          {          {
2603          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2604          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2605          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2606            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1943  while ((c = *(++ptr)) != 0) Line 2624  while ((c = *(++ptr)) != 0)
2624      or back reference. */      or back reference. */
2625    
2626      case '{':      case '{':
2627      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2628      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2629      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630      if ((min == 0 && (max == 1 || max == -1)) ||      if ((min == 0 && (max == 1 || max == -1)) ||
2631        (min == 1 && max == -1))        (min == 1 && max == -1))
# Line 1979  while ((c = *(++ptr)) != 0) Line 2660  while ((c = *(++ptr)) != 0)
2660        {        {
2661        if (*ptr == '\\')        if (*ptr == '\\')
2662          {          {
2663          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2664              &compile_block);
2665          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2666          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2667          }          }
# Line 1996  while ((c = *(++ptr)) != 0) Line 2678  while ((c = *(++ptr)) != 0)
2678    
2679        /* A repeat needs either 1 or 5 bytes. */        /* A repeat needs either 1 or 5 bytes. */
2680    
2681        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2682          {          {
2683          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2684          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2685          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2686            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 2013  while ((c = *(++ptr)) != 0) Line 2695  while ((c = *(++ptr)) != 0)
2695    
2696      case '(':      case '(':
2697      branch_newextra = 0;      branch_newextra = 0;
2698        bracket_length = 3;
2699    
2700      /* Handle special forms of bracket, which all start (? */      /* Handle special forms of bracket, which all start (? */
2701    
# Line 2046  while ((c = *(++ptr)) != 0) Line 2729  while ((c = *(++ptr)) != 0)
2729          ptr += 2;          ptr += 2;
2730          break;          break;
2731    
2732            /* A recursive call to the regex is an extension, to provide the
2733            facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2734    
2735            case 'R':
2736            if (ptr[3] != ')')
2737              {
2738              *errorptr = ERR29;
2739              goto PCRE_ERROR_RETURN;
2740              }
2741            ptr += 3;
2742            length += 1;
2743            break;
2744    
2745          /* Lookbehinds are in Perl from version 5.005 */          /* Lookbehinds are in Perl from version 5.005 */
2746    
2747          case '<':          case '<':
# Line 2064  while ((c = *(++ptr)) != 0) Line 2760  while ((c = *(++ptr)) != 0)
2760          group. */          group. */
2761    
2762          case '(':          case '(':
2763          if ((pcre_ctypes[ptr[3]] & ctype_digit) != 0)          if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2764            {            {
2765            ptr += 4;            ptr += 4;
2766            length += 2;            length += 3;
2767            while ((pcre_ctypes[*ptr] & ctype_digit) != 0) ptr++;            while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2768            if (*ptr != ')')            if (*ptr != ')')
2769              {              {
2770              *errorptr = ERR26;              *errorptr = ERR26;
# Line 2078  while ((c = *(++ptr)) != 0) Line 2774  while ((c = *(++ptr)) != 0)
2774          else   /* An assertion must follow */          else   /* An assertion must follow */
2775            {            {
2776            ptr++;   /* Can treat like ':' as far as spacing is concerned */            ptr++;   /* Can treat like ':' as far as spacing is concerned */
2777              if (ptr[2] != '?' ||
2778            if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)               (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2779              {              {
2780              ptr += 2;    /* To get right offset in message */              ptr += 2;    /* To get right offset in message */
2781              *errorptr = ERR28;              *errorptr = ERR28;
# Line 2153  while ((c = *(++ptr)) != 0) Line 2849  while ((c = *(++ptr)) != 0)
2849              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2850              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2851              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2852              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2853                flag ever changes within the regex. This is used by the "required
2854                character" code. */
2855    
2856              case ':':              case ':':
2857              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2858                {                {
2859                length += 4;                length += 4;
2860                branch_newextra = 2;                branch_newextra = 2;
2861                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2862                }                }
2863              goto END_OPTIONS;              goto END_OPTIONS;
2864    
# Line 2191  while ((c = *(++ptr)) != 0) Line 2890  while ((c = *(++ptr)) != 0)
2890        }        }
2891    
2892      /* Extracting brackets must be counted so we can process escapes in a      /* Extracting brackets must be counted so we can process escapes in a
2893      Perlish way. */      Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2894        need an additional 3 bytes of store per extracting bracket. */
2895    
2896      else bracount++;      else
2897          {
2898          bracount++;
2899          if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2900          }
2901    
2902      /* Non-special forms of bracket. Save length for computing whole length      /* Save length for computing whole length at end if there's a repeat that
2903      at end if there's a repeat that requires duplication of the group. Also      requires duplication of the group. Also save the current value of
2904      save the current value of branch_extra, and start the new group with      branch_extra, and start the new group with the new value. If non-zero, this
2905      the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3      will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
     for a lookbehind assertion. */  
2906    
2907      if (brastackptr >= sizeof(brastack)/sizeof(int))      if (brastackptr >= sizeof(brastack)/sizeof(int))
2908        {        {
# Line 2211  while ((c = *(++ptr)) != 0) Line 2914  while ((c = *(++ptr)) != 0)
2914      branch_extra = branch_newextra;      branch_extra = branch_newextra;
2915    
2916      brastack[brastackptr++] = length;      brastack[brastackptr++] = length;
2917      length += 3;      length += bracket_length;
2918      continue;      continue;
2919    
2920      /* Handle ket. Look for subsequent max/min; for certain sets of values we      /* Handle ket. Look for subsequent max/min; for certain sets of values we
# Line 2237  while ((c = *(++ptr)) != 0) Line 2940  while ((c = *(++ptr)) != 0)
2940        /* Leave ptr at the final char; for read_repeat_counts this happens        /* Leave ptr at the final char; for read_repeat_counts this happens
2941        automatically; for the others we need an increment. */        automatically; for the others we need an increment. */
2942    
2943        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2944          {          {
2945          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2946              &compile_block);
2947          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2948          }          }
2949        else if (c == '*') { minval = 0; maxval = -1; ptr++; }        else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2950        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2951        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2952    
2953        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2954        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2955        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2956        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2957    
2958        if (minval == 0) length++;        if (minval == 0)
2959          else if (minval > 1) length += (minval - 1) * duplength;          {
2960        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2961            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2962            }
2963    
2964          /* When the minimum is greater than zero, we have to replicate up to
2965          minval-1 times, with no additions required in the copies. Then, if
2966          there is a limited maximum we have to replicate up to maxval-1 times
2967          allowing for a BRAZERO item before each optional copy and nesting
2968          brackets for all but one of the optional copies. */
2969    
2970          else
2971            {
2972            length += (minval - 1) * duplength;
2973            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2974              length += (maxval - minval) * (duplength + 7) - 6;
2975            }
2976        }        }
2977      continue;      continue;
2978    
# Line 2270  while ((c = *(++ptr)) != 0) Line 2989  while ((c = *(++ptr)) != 0)
2989        {        {
2990        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
2991          {          {
2992          if ((pcre_ctypes[c] & ctype_space) != 0) continue;          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2993          if (c == '#')          if (c == '#')
2994            {            {
2995            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2996              on the Macintosh. */
2997              while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2998            continue;            continue;
2999            }            }
3000          }          }
# Line 2284  while ((c = *(++ptr)) != 0) Line 3005  while ((c = *(++ptr)) != 0)
3005        if (c == '\\')        if (c == '\\')
3006          {          {
3007          const uschar *saveptr = ptr;          const uschar *saveptr = ptr;
3008          c = check_escape(&ptr, errorptr, bracount, options, FALSE);          c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3009              &compile_block);
3010          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3011          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
3012    
3013    #ifdef SUPPORT_UTF8
3014            if (c > 127 && (options & PCRE_UTF8) != 0)
3015              {
3016              int i;
3017              for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3018                if (c <= utf8_table1[i]) break;
3019              runlength += i;
3020              }
3021    #endif
3022          }          }
3023    
3024        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 2296  while ((c = *(++ptr)) != 0) Line 3028  while ((c = *(++ptr)) != 0)
3028    
3029      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
3030    
3031      while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (runlength < MAXLIT &&
3032          (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3033    
3034      ptr--;      ptr--;
3035      length += runlength;      length += runlength;
# Line 2327  if (re == NULL) Line 3060  if (re == NULL)
3060    return NULL;    return NULL;
3061    }    }
3062    
3063  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
3064    
3065  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
3066    re->size = size;
3067  re->options = options;  re->options = options;
3068    re->tables = tables;
3069    
3070  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
3071  error, *errorptr will be set non-NULL, so we don't need to look at the result  error, *errorptr will be set non-NULL, so we don't need to look at the result
# Line 2340  ptr = (const uschar *)pattern; Line 3075  ptr = (const uschar *)pattern;
3075  code = re->code;  code = re->code;
3076  *code = OP_BRA;  *code = OP_BRA;
3077  bracount = 0;  bracount = 0;
3078  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1);  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3079      &reqchar, &countlits, &compile_block);
3080  re->top_bracket = bracount;  re->top_bracket = bracount;
3081  re->top_backref = top_backref;  re->top_backref = top_backref;
3082    
# Line 2372  if (*errorptr != NULL) Line 3108  if (*errorptr != NULL)
3108    return NULL;    return NULL;
3109    }    }
3110    
3111  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
3112  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
3113  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
3114  unanchored matches no end. In the case of multiline matches, an alternative is  
3115  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
3116    that speeds up unanchored matches no end. If not, see if we can set the
3117    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3118    start with ^, and also when all branches start with .* for non-DOTALL matches.
3119    */
3120    
3121  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
3122    {    {
# Line 2396  if ((options & PCRE_ANCHORED) == 0) Line 3136  if ((options & PCRE_ANCHORED) == 0)
3136      }      }
3137    }    }
3138    
3139    /* Save the last required character if there are at least two literal
3140    characters on all paths, or if there is no first character setting. */
3141    
3142    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3143      {
3144      re->req_char = reqchar;
3145      re->options |= PCRE_REQCHSET;
3146      }
3147    
3148  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
3149    
3150  #ifdef DEBUG  #ifdef DEBUG
# Line 2405  printf("Length = %d top_bracket = %d top Line 3154  printf("Length = %d top_bracket = %d top
3154    
3155  if (re->options != 0)  if (re->options != 0)
3156    {    {
3157    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
3158      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3159      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3160        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3161      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3162      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3163      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2422  if ((re->options & PCRE_FIRSTSET) != 0) Line 3172  if ((re->options & PCRE_FIRSTSET) != 0)
3172      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
3173    }    }
3174    
3175    if ((re->options & PCRE_REQCHSET) != 0)
3176      {
3177      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3178        else printf("Req char = \\x%02x\n", re->req_char);
3179      }
3180    
3181  code_end = code;  code_end = code;
3182  code_base = code = re->code;  code_base = code = re->code;
3183    
# Line 2433  while (code < code_end) Line 3189  while (code < code_end)
3189    
3190    if (*code >= OP_BRA)    if (*code >= OP_BRA)
3191      {      {
3192      printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);      if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3193          printf("%3d Bra extra", (code[1] << 8) + code[2]);
3194        else
3195          printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3196      code += 2;      code += 2;
3197      }      }
3198    
# Line 2444  while (code < code_end) Line 3203  while (code < code_end)
3203      code++;      code++;
3204      break;      break;
3205    
     case OP_COND:  
     printf("%3d Cond", (code[1] << 8) + code[2]);  
     code += 2;  
     break;  
   
     case OP_CREF:  
     printf(" %.2d %s", code[1], OP_names[*code]);  
     code++;  
     break;  
   
3206      case OP_CHARS:      case OP_CHARS:
3207      charlength = *(++code);      charlength = *(++code);
3208      printf("%3d ", charlength);      printf("%3d ", charlength);
# Line 2470  while (code < code_end) Line 3219  while (code < code_end)
3219      case OP_ASSERTBACK:      case OP_ASSERTBACK:
3220      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
3221      case OP_ONCE:      case OP_ONCE:
     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);  
     code += 2;  
     break;  
   
3222      case OP_REVERSE:      case OP_REVERSE:
3223        case OP_BRANUMBER:
3224        case OP_COND:
3225        case OP_CREF:
3226      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3227      code += 2;      code += 2;
3228      break;      break;
# Line 2547  while (code < code_end) Line 3295  while (code < code_end)
3295      break;      break;
3296    
3297      case OP_REF:      case OP_REF:
3298      printf("    \\%d", *(++code));      printf("    \\%d", (code[1] << 8) | code[2]);
3299      code ++;      code += 3;
3300      goto CLASS_REF_REPEAT;      goto CLASS_REF_REPEAT;
3301    
3302      case OP_CLASS:      case OP_CLASS:
# Line 2637  return (pcre *)re; Line 3385  return (pcre *)re;
3385    
3386    
3387  /*************************************************  /*************************************************
 *        Match a character type                  *  
 *************************************************/  
   
 /* Not used in all the places it might be as it's sometimes faster  
 to put the code inline.  
   
 Arguments:  
   type        the character type  
   c           the character  
   dotall      the dotall flag  
   
 Returns:      TRUE if character is of the type  
 */  
   
 static BOOL  
 match_type(int type, int c, BOOL dotall)  
 {  
   
 #ifdef DEBUG  
 if (isprint(c)) printf("matching subject %c against ", c);  
   else printf("matching subject \\x%02x against ", c);  
 printf("%s\n", OP_names[type]);  
 #endif  
   
 switch(type)  
   {  
   case OP_ANY:            return dotall || c != '\n';  
   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;  
   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;  
   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;  
   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;  
   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;  
   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;  
   }  
 return FALSE;  
 }  
   
   
   
 /*************************************************  
3388  *          Match a back-reference                *  *          Match a back-reference                *
3389  *************************************************/  *************************************************/
3390    
# Line 2695  Returns:      TRUE if matched Line 3403  Returns:      TRUE if matched
3403    
3404  static BOOL  static BOOL
3405  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3406    int ims)    unsigned long int ims)
3407  {  {
3408  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
3409    
# Line 2719  if (length > md->end_subject - eptr) ret Line 3427  if (length > md->end_subject - eptr) ret
3427  /* Separate the caselesss case for speed */  /* Separate the caselesss case for speed */
3428    
3429  if ((ims & PCRE_CASELESS) != 0)  if ((ims & PCRE_CASELESS) != 0)
3430    { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }    {
3431      while (length-- > 0)
3432        if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3433      }
3434  else  else
3435    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3436    
# Line 2743  Arguments: Line 3454  Arguments:
3454     offset_top  current top pointer     offset_top  current top pointer
3455     md          pointer to "static" info for the match     md          pointer to "static" info for the match
3456     ims         current /i, /m, and /s options     ims         current /i, /m, and /s options
3457     condassert  TRUE if called to check a condition assertion     eptrb       pointer to chain of blocks containing eptr at start of
3458     eptrb       eptr at start of last bracket                   brackets - for testing for empty matches
3459       flags       can contain
3460                     match_condassert - this is an assertion condition
3461                     match_isgroup - this is the start of a bracketed group
3462    
3463  Returns:       TRUE if matched  Returns:       TRUE if matched
3464  */  */
3465    
3466  static BOOL  static BOOL
3467  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
3468    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3469      int flags)
3470  {  {
3471  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3472    eptrblock newptrb;
3473    
3474    /* At the start of a bracketed group, add the current subject pointer to the
3475    stack of such pointers, to be re-instated at the end of the group when we hit
3476    the closing ket. When match() is called in other circumstances, we don't add to
3477    the stack. */
3478    
3479    if ((flags & match_isgroup) != 0)
3480      {
3481      newptrb.prev = eptrb;
3482      newptrb.saved_eptr = eptr;
3483      eptrb = &newptrb;
3484      }
3485    
3486    /* Now start processing the operations. */
3487    
3488  for (;;)  for (;;)
3489    {    {
# Line 2779  for (;;) Line 3509  for (;;)
3509    
3510    if (op > OP_BRA)    if (op > OP_BRA)
3511      {      {
3512        int offset;
3513      int number = op - OP_BRA;      int number = op - OP_BRA;
     int offset = number << 1;  
3514    
3515      DPRINTF(("start bracket %d\n", number));      /* For extended extraction brackets (large number), we have to fish out the
3516        number from a dummy opcode at the start. */
3517    
3518        if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3519        offset = number << 1;
3520    
3521    #ifdef DEBUG
3522        printf("start bracket %d subject=", number);
3523        pchars(eptr, 16, TRUE, md);
3524        printf("\n");
3525    #endif
3526    
3527      if (offset < md->offset_max)      if (offset < md->offset_max)
3528        {        {
# Line 2795  for (;;) Line 3535  for (;;)
3535    
3536        do        do
3537          {          {
3538          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3539              return TRUE;
3540          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3541          }          }
3542        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 2805  for (;;) Line 3546  for (;;)
3546        md->offset_vector[offset] = save_offset1;        md->offset_vector[offset] = save_offset1;
3547        md->offset_vector[offset+1] = save_offset2;        md->offset_vector[offset+1] = save_offset2;
3548        md->offset_vector[md->offset_end - number] = save_offset3;        md->offset_vector[md->offset_end - number] = save_offset3;
3549    
3550        return FALSE;        return FALSE;
3551        }        }
3552    
# Line 2821  for (;;) Line 3563  for (;;)
3563      DPRINTF(("start bracket 0\n"));      DPRINTF(("start bracket 0\n"));
3564      do      do
3565        {        {
3566        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3567            return TRUE;
3568        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3569        }        }
3570      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 2836  for (;;) Line 3579  for (;;)
3579      case OP_COND:      case OP_COND:
3580      if (ecode[3] == OP_CREF)         /* Condition is extraction test */      if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3581        {        {
3582        int offset = ecode[4] << 1;    /* Doubled reference number */        int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3583        return match(eptr,        return match(eptr,
3584          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585            5 : 3 + (ecode[1] << 8) + ecode[2]),            6 : 3 + (ecode[1] << 8) + ecode[2]),
3586          offset_top, md, ims, FALSE, eptr);          offset_top, md, ims, eptrb, match_isgroup);
3587        }        }
3588    
3589      /* The condition is an assertion. Call match() to evaluate it - setting      /* The condition is an assertion. Call match() to evaluate it - setting
# Line 2848  for (;;) Line 3591  for (;;)
3591    
3592      else      else
3593        {        {
3594        if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))        if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3595              match_condassert | match_isgroup))
3596          {          {
3597          ecode += 3 + (ecode[4] << 8) + ecode[5];          ecode += 3 + (ecode[4] << 8) + ecode[5];
3598          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3599          }          }
3600        else ecode += (ecode[1] << 8) + ecode[2];        else ecode += (ecode[1] << 8) + ecode[2];
3601        return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);        return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3602        }        }
3603      /* Control never reaches here */      /* Control never reaches here */
3604    
3605      /* Skip over conditional reference data if encountered (should not be) */      /* Skip over conditional reference or large extraction number data if
3606        encountered. */
3607    
3608      case OP_CREF:      case OP_CREF:
3609      ecode += 2;      case OP_BRANUMBER:
3610        ecode += 3;
3611      break;      break;
3612    
3613      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3614        an empty string - recursion will then try other alternatives, if any. */
3615    
3616      case OP_END:      case OP_END:
3617        if (md->notempty && eptr == md->start_match) return FALSE;
3618      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3619      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3620      return TRUE;      return TRUE;
# Line 2876  for (;;) Line 3624  for (;;)
3624      case OP_OPT:      case OP_OPT:
3625      ims = ecode[1];      ims = ecode[1];
3626      ecode += 2;      ecode += 2;
3627      DPRINTF(("ims set to %02x\n", ims));      DPRINTF(("ims set to %02lx\n", ims));
3628      break;      break;
3629    
3630      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
# Line 2889  for (;;) Line 3637  for (;;)
3637      case OP_ASSERTBACK:      case OP_ASSERTBACK:
3638      do      do
3639        {        {
3640        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3641        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3642        }        }
3643      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 2897  for (;;) Line 3645  for (;;)
3645    
3646      /* If checking an assertion for a condition, return TRUE. */      /* If checking an assertion for a condition, return TRUE. */
3647    
3648      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3649    
3650      /* Continue from after the assertion, updating the offsets high water      /* Continue from after the assertion, updating the offsets high water
3651      mark, since extracts may have been taken during the assertion. */      mark, since extracts may have been taken during the assertion. */
# Line 2913  for (;;) Line 3661  for (;;)
3661      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
3662      do      do
3663        {        {
3664        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3665            return FALSE;
3666        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3667        }        }
3668      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3669    
3670      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3671    
3672      ecode += 3;      ecode += 3;
3673      continue;      continue;
3674    
3675      /* Move the subject pointer back. This occurs only at the start of      /* Move the subject pointer back. This occurs only at the start of
3676      each branch of a lookbehind assertion. If we are too close to the start to      each branch of a lookbehind assertion. If we are too close to the start to
3677      move back, this match function fails. */      move back, this match function fails. When working with UTF-8 we move
3678        back a number of characters, not bytes. */
3679    
3680      case OP_REVERSE:      case OP_REVERSE:
3681    #ifdef SUPPORT_UTF8
3682        c = (ecode[1] << 8) + ecode[2];
3683        for (i = 0; i < c; i++)
3684          {
3685          eptr--;
3686          BACKCHAR(eptr)
3687          }
3688    #else
3689      eptr -= (ecode[1] << 8) + ecode[2];      eptr -= (ecode[1] << 8) + ecode[2];
3690    #endif
3691    
3692      if (eptr < md->start_subject) return FALSE;      if (eptr < md->start_subject) return FALSE;
3693      ecode += 3;      ecode += 3;
3694      break;      break;
3695    
3696        /* Recursion matches the current regex, nested. If there are any capturing
3697        brackets started but not finished, we have to save their starting points
3698        and reinstate them after the recursion. However, we don't know how many
3699        such there are (offset_top records the completed total) so we just have
3700        to save all the potential data. There may be up to 99 such values, which
3701        is a bit large to put on the stack, but using malloc for small numbers
3702        seems expensive. As a compromise, the stack is used when there are fewer
3703        than 16 values to store; otherwise malloc is used. A problem is what to do
3704        if the malloc fails ... there is no way of returning to the top level with
3705        an error. Save the top 15 values on the stack, and accept that the rest
3706        may be wrong. */
3707    
3708        case OP_RECURSE:
3709          {
3710          BOOL rc;
3711          int *save;
3712          int stacksave[15];
3713    
3714          c = md->offset_max;
3715    
3716          if (c < 16) save = stacksave; else
3717            {
3718            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3719            if (save == NULL)
3720              {
3721              save = stacksave;
3722              c = 15;
3723              }
3724            }
3725    
3726          for (i = 1; i <= c; i++)
3727            save[i] = md->offset_vector[md->offset_end - i];
3728          rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3729            match_isgroup);
3730          for (i = 1; i <= c; i++)
3731            md->offset_vector[md->offset_end - i] = save[i];
3732          if (save != stacksave) (pcre_free)(save);
3733          if (!rc) return FALSE;
3734    
3735          /* In case the recursion has set more capturing values, save the final
3736          number, then move along the subject till after the recursive match,
3737          and advance one byte in the pattern code. */
3738    
3739          offset_top = md->end_offset_top;
3740          eptr = md->end_match_ptr;
3741          ecode++;
3742          }
3743        break;
3744    
3745      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3746      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
# Line 2943  for (;;) Line 3752  for (;;)
3752      case OP_ONCE:      case OP_ONCE:
3753        {        {
3754        const uschar *prev = ecode;        const uschar *prev = ecode;
3755          const uschar *saved_eptr = eptr;
3756    
3757        do        do
3758          {          {
3759          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3760              break;
3761          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3762          }          }
3763        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 2969  for (;;) Line 3780  for (;;)
3780        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3781        course of events. */        course of events. */
3782    
3783        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3784          {          {
3785          ecode += 3;          ecode += 3;
3786          break;          break;
# Line 2983  for (;;) Line 3794  for (;;)
3794        if (ecode[3] == OP_OPT)        if (ecode[3] == OP_OPT)
3795          {          {
3796          ims = (ims & ~PCRE_IMS) | ecode[4];          ims = (ims & ~PCRE_IMS) | ecode[4];
3797          DPRINTF(("ims set to %02x at group repeat\n", ims));          DPRINTF(("ims set to %02lx at group repeat\n", ims));
3798          }          }
3799    
3800        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3801          {          {
3802          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3803              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3804                  return TRUE;
3805          }          }
3806        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3807          {          {
3808          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3809              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3810          }          }
3811        }        }
3812      return FALSE;      return FALSE;
# Line 3015  for (;;) Line 3827  for (;;)
3827      case OP_BRAZERO:      case OP_BRAZERO:
3828        {        {
3829        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3830        if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3831            return TRUE;
3832        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3833        ecode = next + 3;        ecode = next + 3;
3834        }        }
# Line 3025  for (;;) Line 3838  for (;;)
3838        {        {
3839        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3840        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3841        if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3842            return TRUE;
3843        ecode++;        ecode++;
3844        }        }
3845      break;      break;
# Line 3040  for (;;) Line 3854  for (;;)
3854      case OP_KETRMAX:      case OP_KETRMAX:
3855        {        {
3856        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3857          const uschar *saved_eptr = eptrb->saved_eptr;
3858    
3859          eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3860    
3861        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3862            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
# Line 3056  for (;;) Line 3873  for (;;)
3873    
3874        if (*prev != OP_COND)        if (*prev != OP_COND)
3875          {          {
3876            int offset;
3877          int number = *prev - OP_BRA;          int number = *prev - OP_BRA;
         int offset = number << 1;  
3878    
3879          DPRINTF(("end bracket %d\n", number));          /* For extended extraction brackets (large number), we have to fish out
3880            the number from a dummy opcode at the start. */
3881    
3882            if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3883            offset = number << 1;
3884    
3885    #ifdef DEBUG
3886            printf("end bracket %d", number);
3887            printf("\n");
3888    #endif
3889    
3890          if (number > 0)          if (number > 0)
3891            {            {
# Line 3077  for (;;) Line 3903  for (;;)
3903        the group. */        the group. */
3904    
3905        ims = original_ims;        ims = original_ims;
3906        DPRINTF(("ims reset to %02x\n", ims));        DPRINTF(("ims reset to %02lx\n", ims));
3907    
3908        /* For a non-repeating ket, just continue at this level. This also        /* For a non-repeating ket, just continue at this level. This also
3909        happens for a repeating ket if no characters were matched in the group.        happens for a repeating ket if no characters were matched in the group.
# Line 3085  for (;;) Line 3911  for (;;)
3911        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3912        course of events. */        course of events. */
3913    
3914        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3915          {          {
3916          ecode += 3;          ecode += 3;
3917          break;          break;
# Line 3096  for (;;) Line 3922  for (;;)
3922    
3923        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3924          {          {
3925          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3926              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3927                  return TRUE;
3928          }          }
3929        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3930          {          {
3931          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3932              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3933          }          }
3934        }        }
3935      return FALSE;      return FALSE;
# Line 3113  for (;;) Line 3940  for (;;)
3940      if (md->notbol && eptr == md->start_subject) return FALSE;      if (md->notbol && eptr == md->start_subject) return FALSE;
3941      if ((ims & PCRE_MULTILINE) != 0)      if ((ims & PCRE_MULTILINE) != 0)
3942        {        {
3943        if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;        if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3944        ecode++;        ecode++;
3945        break;        break;
3946        }        }
# Line 3132  for (;;) Line 3959  for (;;)
3959      case OP_DOLL:      case OP_DOLL:
3960      if ((ims & PCRE_MULTILINE) != 0)      if ((ims & PCRE_MULTILINE) != 0)
3961        {        {
3962        if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }        if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3963          else { if (md->noteol) return FALSE; }          else { if (md->noteol) return FALSE; }
3964        ecode++;        ecode++;
3965        break;        break;
# Line 3143  for (;;) Line 3970  for (;;)
3970        if (!md->endonly)        if (!md->endonly)
3971          {          {
3972          if (eptr < md->end_subject - 1 ||          if (eptr < md->end_subject - 1 ||
3973             (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;             (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3974    
3975          ecode++;          ecode++;
3976          break;          break;
# Line 3162  for (;;) Line 3989  for (;;)
3989    
3990      case OP_EODN:      case OP_EODN:
3991      if (eptr < md->end_subject - 1 ||      if (eptr < md->end_subject - 1 ||
3992         (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;         (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3993      ecode++;      ecode++;
3994      break;      break;
3995    
# Line 3172  for (;;) Line 3999  for (;;)
3999      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
4000        {        {
4001        BOOL prev_is_word = (eptr != md->start_subject) &&        BOOL prev_is_word = (eptr != md->start_subject) &&
4002          ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);          ((md->ctypes[eptr[-1]] & ctype_word) != 0);
4003        BOOL cur_is_word = (eptr < md->end_subject) &&        BOOL cur_is_word = (eptr < md->end_subject) &&
4004          ((pcre_ctypes[*eptr] & ctype_word) != 0);          ((md->ctypes[*eptr] & ctype_word) != 0);
4005        if ((*ecode++ == OP_WORD_BOUNDARY)?        if ((*ecode++ == OP_WORD_BOUNDARY)?
4006             cur_is_word == prev_is_word : cur_is_word != prev_is_word)             cur_is_word == prev_is_word : cur_is_word != prev_is_word)
4007          return FALSE;          return FALSE;
# Line 3184  for (;;) Line 4011  for (;;)
4011      /* Match a single character type; inline for speed */      /* Match a single character type; inline for speed */
4012    
4013      case OP_ANY:      case OP_ANY:
4014      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4015        return FALSE;        return FALSE;
4016      if (eptr++ >= md->end_subject) return FALSE;      if (eptr++ >= md->end_subject) return FALSE;
4017    #ifdef SUPPORT_UTF8
4018        if (md->utf8)
4019          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4020    #endif
4021      ecode++;      ecode++;
4022      break;      break;
4023    
4024      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
4025      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)      if (eptr >= md->end_subject ||
4026           (md->ctypes[*eptr++] & ctype_digit) != 0)
4027        return FALSE;        return FALSE;
4028      ecode++;      ecode++;
4029      break;      break;
4030    
4031      case OP_DIGIT:      case OP_DIGIT:
4032      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)      if (eptr >= md->end_subject ||
4033           (md->ctypes[*eptr++] & ctype_digit) == 0)
4034        return FALSE;        return FALSE;
4035      ecode++;      ecode++;
4036      break;      break;
4037    
4038      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
4039      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)      if (eptr >= md->end_subject ||
4040           (md->ctypes[*eptr++] & ctype_space) != 0)
4041        return FALSE;        return FALSE;
4042      ecode++;      ecode++;
4043      break;      break;
4044    
4045      case OP_WHITESPACE:      case OP_WHITESPACE:
4046      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)      if (eptr >= md->end_subject ||
4047           (md->ctypes[*eptr++] & ctype_space) == 0)
4048        return FALSE;        return FALSE;
4049      ecode++;      ecode++;
4050      break;      break;
4051    
4052      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
4053      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)      if (eptr >= md->end_subject ||
4054           (md->ctypes[*eptr++] & ctype_word) != 0)
4055        return FALSE;        return FALSE;
4056      ecode++;      ecode++;
4057      break;      break;
4058    
4059      case OP_WORDCHAR:      case OP_WORDCHAR:
4060      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)      if (eptr >= md->end_subject ||
4061           (md->ctypes[*eptr++] & ctype_word) == 0)
4062        return FALSE;        return FALSE;
4063      ecode++;      ecode++;
4064      break;      break;
# Line 3237  for (;;) Line 4074  for (;;)
4074      case OP_REF:      case OP_REF:
4075        {        {
4076        int length;        int length;
4077        int offset = ecode[1] << 1;                /* Doubled reference number */        int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4078        ecode += 2;                                /* Advance past the item */        ecode += 3;                                     /* Advance past item */
4079    
4080        /* If the reference is unset, set the length to be longer than the amount        /* If the reference is unset, set the length to be longer than the amount
4081        of subject left; this ensures that every attempt at a match fails. We        of subject left; this ensures that every attempt at a match fails. We
# Line 3307  for (;;) Line 4144  for (;;)
4144          {          {
4145          for (i = min;; i++)          for (i = min;; i++)
4146            {            {
4147            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4148              return TRUE;              return TRUE;
4149            if (i >= max || !match_ref(offset, eptr, length, md, ims))            if (i >= max || !match_ref(offset, eptr, length, md, ims))
4150              return FALSE;              return FALSE;
# Line 3328  for (;;) Line 4165  for (;;)
4165            }            }
4166          while (eptr >= pp)          while (eptr >= pp)
4167            {            {
4168            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4169              return TRUE;              return TRUE;
4170            eptr -= length;            eptr -= length;
4171            }            }
# Line 3382  for (;;) Line 4219  for (;;)
4219        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
4220          {          {
4221          if (eptr >= md->end_subject) return FALSE;          if (eptr >= md->end_subject) return FALSE;
4222          c = *eptr++;          GETCHARINC(c, eptr)         /* Get character; increment eptr */
4223    
4224    #ifdef SUPPORT_UTF8
4225            /* We do not yet support class members > 255 */
4226            if (c > 255) return FALSE;
4227    #endif
4228    
4229          if ((data[c/8] & (1 << (c&7))) != 0) continue;          if ((data[c/8] & (1 << (c&7))) != 0) continue;
4230          return FALSE;          return FALSE;
4231          }          }
# Line 3399  for (;;) Line 4242  for (;;)
4242          {          {
4243          for (i = min;; i++)          for (i = min;; i++)
4244            {            {
4245            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4246              return TRUE;              return TRUE;
4247            if (i >= max || eptr >= md->end_subject) return FALSE;            if (i >= max || eptr >= md->end_subject) return FALSE;
4248            c = *eptr++;            GETCHARINC(c, eptr)       /* Get character; increment eptr */
4249    
4250    #ifdef SUPPORT_UTF8
4251              /* We do not yet support class members > 255 */
4252              if (c > 255) return FALSE;
4253    #endif
4254            if ((data[c/8] & (1 << (c&7))) != 0) continue;            if ((data[c/8] & (1 << (c&7))) != 0) continue;
4255            return FALSE;            return FALSE;
4256            }            }
# Line 3414  for (;;) Line 4262  for (;;)
4262        else        else
4263          {          {
4264          const uschar *pp = eptr;          const uschar *pp = eptr;
4265          for (i = min; i < max; eptr++, i++)          int len = 1;
4266            for (i = min; i < max; i++)
4267            {            {
4268            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
4269            c = *eptr;            GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
4270            if ((data[c/8] & (1 << (c&7))) != 0) continue;  
4271            break;  #ifdef SUPPORT_UTF8
4272              /* We do not yet support class members > 255 */
4273              if (c > 255) break;
4274    #endif
4275              if ((data[c/8] & (1 << (c&7))) == 0) break;
4276              eptr += len;
4277            }            }
4278    
4279          while (eptr >= pp)          while (eptr >= pp)
4280            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            {
4281              if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4282              return TRUE;              return TRUE;
4283    
4284    #ifdef SUPPORT_UTF8
4285              BACKCHAR(eptr)
4286    #endif
4287              }
4288          return FALSE;          return FALSE;
4289          }          }
4290        }        }
# Line 3453  for (;;) Line 4313  for (;;)
4313        if (length > md->end_subject - eptr) return FALSE;        if (length > md->end_subject - eptr) return FALSE;
4314        if ((ims & PCRE_CASELESS) != 0)        if ((ims & PCRE_CASELESS) != 0)
4315          {          {
4316          while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;          while (length-- > 0)
4317              if (md->lcc[*ecode++] != md->lcc[*eptr++])
4318                return FALSE;
4319          }          }
4320        else        else
4321          {          {
# Line 3510  for (;;) Line 4372  for (;;)
4372    
4373      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
4374        {        {
4375        c = pcre_lcc[c];        c = md->lcc[c];
4376        for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;        for (i = 1; i <= min; i++)
4377            if (c != md->lcc[*eptr++]) return FALSE;
4378        if (min == max) continue;        if (min == max) continue;
4379        if (minimize)        if (minimize)
4380          {          {
4381          for (i = min;; i++)          for (i = min;; i++)
4382            {            {
4383            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4384              return TRUE;              return TRUE;
4385            if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])            if (i >= max || eptr >= md->end_subject ||
4386                  c != md->lcc[*eptr++])
4387              return FALSE;              return FALSE;
4388            }            }
4389          /* Control never gets here */          /* Control never gets here */
# Line 3529  for (;;) Line 4393  for (;;)
4393          const uschar *pp = eptr;          const uschar *pp = eptr;
4394          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4395            {            {
4396            if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;            if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4397            eptr++;            eptr++;
4398            }            }
4399          while (eptr >= pp)          while (eptr >= pp)
4400            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4401              return TRUE;              return TRUE;
4402          return FALSE;          return FALSE;
4403          }          }
# Line 3550  for (;;) Line 4414  for (;;)
4414          {          {
4415          for (i = min;; i++)          for (i = min;; i++)
4416            {            {
4417            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4418              return TRUE;              return TRUE;
4419            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4420            }            }
# Line 3565  for (;;) Line 4429  for (;;)
4429            eptr++;            eptr++;
4430            }            }
4431          while (eptr >= pp)          while (eptr >= pp)
4432           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4433             return TRUE;             return TRUE;
4434          return FALSE;          return FALSE;
4435          }          }
# Line 3579  for (;;) Line 4443  for (;;)
4443      ecode++;      ecode++;
4444      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
4445        {        {
4446        if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;        if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4447        }        }
4448      else      else
4449        {        {
# Line 3639  for (;;) Line 4503  for (;;)
4503    
4504      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
4505        {        {
4506        c = pcre_lcc[c];        c = md->lcc[c];
4507        for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;        for (i = 1; i <= min; i++)
4508            if (c == md->lcc[*eptr++]) return FALSE;
4509        if (min == max) continue;        if (min == max) continue;
4510        if (minimize)        if (minimize)
4511          {          {
4512          for (i = min;; i++)          for (i = min;; i++)
4513            {            {
4514            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4515              return TRUE;              return TRUE;
4516            if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])            if (i >= max || eptr >= md->end_subject ||
4517                  c == md->lcc[*eptr++])
4518              return FALSE;              return FALSE;
4519            }            }
4520          /* Control never gets here */          /* Control never gets here */
# Line 3658  for (;;) Line 4524  for (;;)
4524          const uschar *pp = eptr;          const uschar *pp = eptr;
4525          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4526            {            {
4527            if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;            if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4528            eptr++;            eptr++;
4529            }            }
4530          while (eptr >= pp)          while (eptr >= pp)
4531            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4532              return TRUE;              return TRUE;
4533          return FALSE;          return FALSE;
4534          }          }
# Line 3679  for (;;) Line 4545  for (;;)
4545          {          {
4546          for (i = min;; i++)          for (i = min;; i++)
4547            {            {
4548            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4549              return TRUE;              return TRUE;
4550            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4551            }            }
# Line 3694  for (;;) Line 4560  for (;;)
4560            eptr++;            eptr++;
4561            }            }
4562          while (eptr >= pp)          while (eptr >= pp)
4563           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4564             return TRUE;             return TRUE;
4565          return FALSE;          return FALSE;
4566          }          }
# Line 3738  for (;;) Line 4604  for (;;)
4604    
4605      /* First, ensure the minimum number of matches are present. Use inline      /* First, ensure the minimum number of matches are present. Use inline
4606      code for maximizing the speed, and do the type test once at the start      code for maximizing the speed, and do the type test once at the start
4607      (i.e. keep it out of the loop). Also test that there are at least the      (i.e. keep it out of the loop). Also we can test that there are at least
4608      minimum number of characters before we start. */      the minimum number of bytes before we start, except when doing '.' in
4609        UTF8 mode. Leave the test in in all cases; in the special case we have
4610        to test after each character. */
4611    
4612      if (min > md->end_subject - eptr) return FALSE;      if (min > md->end_subject - eptr) return FALSE;
4613      if (min > 0) switch(ctype)      if (min > 0) switch(ctype)
4614        {        {
4615        case OP_ANY:        case OP_ANY:
4616    #ifdef SUPPORT_UTF8
4617          if (md->utf8)
4618            {
4619            for (i = 1; i <= min; i++)
4620              {
4621              if (eptr >= md->end_subject ||
4622                 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
4623                return FALSE;
4624              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4625              }
4626            break;
4627            }
4628    #endif
4629          /* Non-UTF8 can be faster */
4630        if ((ims & PCRE_DOTALL) == 0)        if ((ims & PCRE_DOTALL) == 0)
4631          { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }          { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
4632        else eptr += min;        else eptr += min;
4633        break;        break;
4634    
4635        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
4636        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
4637          if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4638        break;        break;
4639    
4640        case OP_DIGIT:        case OP_DIGIT:
4641        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
4642          if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4643        break;        break;
4644    
4645        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
4646        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
4647          if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4648        break;        break;
4649    
4650        case OP_WHITESPACE:        case OP_WHITESPACE:
4651        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
4652          if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4653        break;        break;
4654    
4655        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
4656        for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)        for (i = 1; i <= min; i++)
4657          return FALSE;          if ((md->ctypes[*eptr++] & ctype_word) != 0)
4658              return FALSE;
4659        break;        break;
4660    
4661        case OP_WORDCHAR:        case OP_WORDCHAR:
4662        for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)        for (i = 1; i <= min; i++)
4663          return FALSE;          if ((md->ctypes[*eptr++] & ctype_word) == 0)
4664              return FALSE;
4665        break;        break;
4666        }        }
4667    
# Line 3786  for (;;) Line 4670  for (;;)
4670      if (min == max) continue;      if (min == max) continue;
4671    
4672      /* If minimizing, we have to test the rest of the pattern before each      /* If minimizing, we have to test the rest of the pattern before each
4673      subsequent match, so inlining isn't much help; just use the function. */      subsequent match. */
4674    
4675      if (minimize)      if (minimize)
4676        {        {
4677        for (i = min;; i++)        for (i = min;; i++)
4678          {          {
4679          if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;          if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4680          if (i >= max || eptr >= md->end_subject ||          if (i >= max || eptr >= md->end_subject) return FALSE;
4681            !match_type(ctype, *eptr++, (ims & PCRE_DOTALL) != 0))  
4682              return FALSE;          c = *eptr++;
4683            switch(ctype)
4684              {
4685              case OP_ANY:
4686              if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
4687    #ifdef SUPPORT_UTF8
4688              if (md->utf8)
4689                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4690    #endif
4691              break;
4692    
4693              case OP_NOT_DIGIT:
4694              if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4695              break;
4696    
4697              case OP_DIGIT:
4698              if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4699              break;
4700    
4701              case OP_NOT_WHITESPACE:
4702              if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4703              break;
4704    
4705              case OP_WHITESPACE:
4706              if  ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4707              break;
4708    
4709              case OP_NOT_WORDCHAR:
4710              if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4711              break;
4712    
4713              case OP_WORDCHAR:
4714              if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4715              break;
4716              }
4717          }          }
4718        /* Control never gets here */        /* Control never gets here */
4719        }        }
# Line 3809  for (;;) Line 4727  for (;;)
4727        switch(ctype)        switch(ctype)
4728          {          {
4729          case OP_ANY:          case OP_ANY:
4730    
4731            /* Special code is required for UTF8, but when the maximum is unlimited
4732            we don't need it. */
4733    
4734    #ifdef SUPPORT_UTF8
4735            if (md->utf8 && max < INT_MAX)
4736              {
4737              if ((ims & PCRE_DOTALL) == 0)
4738                {
4739                for (i = min; i < max; i++)
4740                  {
4741                  if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
4742                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4743                  }
4744                }
4745              else
4746                {
4747                for (i = min; i < max; i++)
4748                  {
4749                  eptr++;
4750                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4751                  }
4752                }
4753              break;
4754              }
4755    #endif
4756            /* Non-UTF8 can be faster */
4757          if ((ims & PCRE_DOTALL) == 0)          if ((ims & PCRE_DOTALL) == 0)
4758            {            {
4759            for (i = min; i < max; i++)            for (i = min; i < max; i++)
4760              {              {
4761              if (eptr >= md->end_subject || *eptr == '\n') break;              if (eptr >= md->end_subject || *eptr == NEWLINE) break;
4762              eptr++;              eptr++;
4763              }              }
4764            }            }
# Line 3828  for (;;) Line 4773  for (;;)
4773          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
4774          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4775            {            {
4776            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4777              break;              break;
4778            eptr++;            eptr++;
4779            }            }
# Line 3837  for (;;) Line 4782  for (;;)
4782          case OP_DIGIT:          case OP_DIGIT:
4783          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4784            {            {
4785            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4786              break;              break;
4787            eptr++;            eptr++;
4788            }            }
# Line 3846  for (;;) Line 4791  for (;;)
4791          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
4792          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4793            {            {
4794            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4795              break;              break;
4796            eptr++;            eptr++;
4797            }            }
# Line 3855  for (;;) Line 4800  for (;;)
4800          case OP_WHITESPACE:          case OP_WHITESPACE:
4801          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4802            {            {
4803            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4804              break;              break;
4805            eptr++;            eptr++;
4806            }            }
# Line 3864  for (;;) Line 4809  for (;;)
4809          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
4810          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4811            {            {
4812            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4813              break;              break;
4814            eptr++;            eptr++;
4815            }            }
# Line 3873  for (;;) Line 4818  for (;;)
4818          case OP_WORDCHAR:          case OP_WORDCHAR:
4819          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4820            {            {
4821            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4822              break;              break;
4823            eptr++;            eptr++;
4824            }            }
# Line 3881  for (;;) Line 4826  for (;;)
4826          }          }
4827    
4828        while (eptr >= pp)        while (eptr >= pp)
4829          if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))          {
4830            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4831            return TRUE;            return TRUE;
4832    #ifdef SUPPORT_UTF8
4833            if (md->utf8)
4834              while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4835    #endif
4836            }
4837        return FALSE;        return FALSE;
4838        }        }
4839      /* Control never gets here */      /* Control never gets here */
# Line 3919  Arguments: Line 4870  Arguments:
4870    external_extra  points to "hints" from pcre_study() or is NULL    external_extra  points to "hints" from pcre_study() or is NULL
4871    subject         points to the subject string    subject         points to the subject string
4872    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
4873      start_offset    where to start in the subject string
4874    options         option bits    options         option bits
4875    offsets         points to a vector of ints to be filled in with offsets    offsets         points to a vector of ints to be filled in with offsets
4876    offsetcount     the number of elements in the vector    offsetcount     the number of elements in the vector
# Line 3931  Returns:          > 0 => success; value Line 4883  Returns:          > 0 => success; value
4883    
4884  int  int
4885  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4886    const char *subject, int length, int options, int *offsets, int offsetcount)    const char *subject, int length, int start_offset, int options, int *offsets,
4887      int offsetcount)
4888  {  {
4889  int resetcount, ocount;  int resetcount, ocount;
4890  int first_char = -1;  int first_char = -1;
4891  int ims = 0;  int req_char = -1;
4892    int req_char2 = -1;
4893    unsigned long int ims = 0;
4894  match_data match_block;  match_data match_block;
4895  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4896  const uschar *start_match = (const uschar *)subject;  const uschar *start_match = (const uschar *)subject + start_offset;
4897  const uschar *end_subject;  const uschar *end_subject;
4898    const uschar *req_char_ptr = start_match - 1;
4899  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4900  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4901  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
4902  BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;  BOOL anchored;
4903  BOOL startline = (re->options & PCRE_STARTLINE) != 0;  BOOL startline;
4904    
4905  if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;  if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4906    
# Line 3952  if (re == NULL || subject == NULL || Line 4908  if (re == NULL || subject == NULL ||
4908     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4909  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4910    
4911    anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4912    startline = (re->options & PCRE_STARTLINE) != 0;
4913    
4914    match_block.start_pattern = re->code;
4915  match_block.start_subject = (const uschar *)subject;  match_block.start_subject = (const uschar *)subject;
4916  match_block.end_subject = match_block.start_subject + length;  match_block.end_subject = match_block.start_subject + length;
4917  end_subject = match_block.end_subject;  end_subject = match_block.end_subject;
4918    
4919  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4920    match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4921    
4922  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4923  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4924    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4925    
4926  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4927    
4928    match_block.lcc = re->tables + lcc_offset;
4929    match_block.ctypes = re->tables + ctypes_offset;
4930    
4931  /* The ims options can vary during the matching as a result of the presence  /* The ims options can vary during the matching as a result of the presence
4932  of (?ims) items in the pattern. They are kept in a local variable so that  of (?ims) items in the pattern. They are kept in a local variable so that
4933  restoring at the exit of a group is easy. */  restoring at the exit of a group is easy. */
# Line 3997  in the pattern. */ Line 4962  in the pattern. */
4962  resetcount = 2 + re->top_bracket * 2;  resetcount = 2 + re->top_bracket * 2;
4963  if (resetcount > offsetcount) resetcount = ocount;  if (resetcount > offsetcount) resetcount = ocount;
4964    
4965    /* Reset the working variable associated with each extraction. These should
4966    never be used unless previously set, but they get saved and restored, and so we
4967    initialize them to avoid reading uninitialized locations. */
4968    
4969    if (match_block.offset_vector != NULL)
4970      {
4971      register int *iptr = match_block.offset_vector + ocount;
4972      register int *iend = iptr - resetcount/2 + 1;
4973      while (--iptr >= iend) *iptr = -1;
4974      }
4975    
4976  /* Set up the first character to match, if available. The first_char value is  /* Set up the first character to match, if available. The first_char value is
4977  never set for an anchored regular expression, but the anchoring may be forced  never set for an anchored regular expression, but the anchoring may be forced
4978  at run time, so we have to test for anchoring. The first char may be unset for  at run time, so we have to test for anchoring. The first char may be unset for
# Line 4008  if (!anchored) Line 4984  if (!anchored)
4984    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->options & PCRE_FIRSTSET) != 0)
4985      {      {
4986      first_char = re->first_char;      first_char = re->first_char;
4987      if ((ims & PCRE_CASELESS) != 0) first_char = pcre_lcc[first_char];      if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4988      }      }
4989    else    else
4990      if (!startline && extra != NULL &&      if (!startline && extra != NULL &&
# Line 4016  if (!anchored) Line 4992  if (!anchored)
4992          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4993    }    }
4994    
4995  /* Loop for unanchored matches; for anchored regexps the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4996    character" set. If the PCRE_CASELESS is set, implying that the match starts
4997    caselessly, or if there are any changes of this flag within the regex, set up
4998    both cases of the character. Otherwise set the two values the same, which will
4999    avoid duplicate testing (which takes significant time). This covers the vast
5000    majority of cases. It will be suboptimal when the case flag changes in a regex
5001    and the required character in fact is caseful. */
5002    
5003    if ((re->options & PCRE_REQCHSET) != 0)
5004      {
5005      req_char = re->req_char;
5006      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
5007        (re->tables + fcc_offset)[req_char] : req_char;
5008      }
5009    
5010    /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5011    the loop runs just once. */
5012    
5013  do  do
5014    {    {
# Line 4033  do Line 5025  do
5025    if (first_char >= 0)    if (first_char >= 0)
5026      {      {
5027      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
5028        while (start_match < end_subject && pcre_lcc[*start_match] != first_char)        while (start_match < end_subject &&
5029                 match_block.lcc[*start_match] != first_char)
5030          start_match++;          start_match++;
5031      else      else
5032        while (start_match < end_subject && *start_match != first_char)        while (start_match < end_subject && *start_match != first_char)
# Line 4044  do Line 5037  do
5037    
5038    else if (startline)    else if (startline)
5039      {      {
5040      if (start_match > match_block.start_subject)      if (start_match > match_block.start_subject + start_offset)
5041        {        {
5042        while (start_match < end_subject && start_match[-1] != '\n')        while (start_match < end_subject && start_match[-1] != NEWLINE)
5043          start_match++;          start_match++;
5044        }        }
5045      }      }
5046    
5047    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
5048    
5049    else if (start_bits != NULL)    else if (start_bits != NULL)
5050      {      {
# Line 4068  do Line 5061  do
5061    printf("\n");    printf("\n");
5062  #endif  #endif
5063    
5064      /* If req_char is set, we know that that character must appear in the subject
5065      for the match to succeed. If the first character is set, req_char must be
5066      later in the subject; otherwise the test starts at the match point. This
5067      optimization can save a huge amount of backtracking in patterns with nested
5068      unlimited repeats that aren't going to match. We don't know what the state of
5069      case matching may be when this character is hit, so test for it in both its
5070      cases if necessary. However, the different cased versions will not be set up
5071      unless PCRE_CASELESS was given or the casing state changes within the regex.
5072      Writing separate code makes it go faster, as does using an autoincrement and
5073      backing off on a match. */
5074    
5075      if (req_char >= 0)
5076        {
5077        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5078    
5079        /* We don't need to repeat the search if we haven't yet reached the
5080        place we found it at last time. */
5081    
5082        if (p > req_char_ptr)
5083          {
5084          /* Do a single test if no case difference is set up */
5085    
5086          if (req_char == req_char2)
5087            {
5088            while (p < end_subject)
5089              {
5090              if (*p++ == req_char) { p--; break; }
5091              }
5092            }
5093    
5094          /* Otherwise test for either case */
5095    
5096          else
5097            {
5098            while (p < end_subject)
5099              {
5100              register int pp = *p++;
5101              if (pp == req_char || pp == req_char2) { p--; break; }
5102              }
5103            }
5104    
5105          /* If we can't find the required character, break the matching loop */
5106    
5107          if (p >= end_subject) break;
5108    
5109          /* If we have found the required character, save the point where we
5110          found it, so that we don't search again next time round the loop if
5111          the start hasn't passed this character yet. */
5112    
5113          req_char_ptr = p;
5114          }
5115        }
5116    
5117    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
5118    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
5119    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4075  do Line 5121  do
5121    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
5122    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
5123    
5124    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    match_block.start_match = start_match;
5125      if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5126      continue;      continue;
5127    
5128    /* Copy the offset information from temporary store if necessary */    /* Copy the offset information from temporary store if necessary */
# Line 4106  do Line 5153  do
5153    DPRINTF((">>>> returning %d\n", rc));    DPRINTF((">>>> returning %d\n", rc));
5154    return rc;    return rc;
5155    }    }
5156    
5157    /* This "while" is the end of the "do" above */
5158    
5159  while (!anchored &&  while (!anchored &&
5160         match_block.errorcode == PCRE_ERROR_NOMATCH &&         match_block.errorcode == PCRE_ERROR_NOMATCH &&
5161         start_match++ < end_subject);         start_match++ < end_subject);

Legend:
Removed from v.23  
changed lines
  Added in v.53

  ViewVC Help
Powered by ViewVC 1.1.5