/[pcre]/code/tags/pcre-3.2/pcre.c
ViewVC logotype

Diff of /code/tags/pcre-3.2/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 3 by nigel, Sat Feb 24 21:38:01 2007 UTC revision 43 by nigel, Sat Feb 24 21:39:21 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997 University of Cambridge             Copyright (c) 1997-2000 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 33  restrictions: Line 37  restrictions:
37    
38  /* #define DEBUG */  /* #define DEBUG */
39    
40    /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41    inline, and there are *still* stupid compilers about that don't like indented
42    pre-processor statements. I suppose it's only been 10 years... */
43    
44    #ifdef DEBUG
45    #define DPRINTF(p) printf p
46    #else
47    #define DPRINTF(p) /*nothing*/
48    #endif
49    
50  /* Include the internals header, which itself includes Standard C headers plus  /* Include the internals header, which itself includes Standard C headers plus
51  the external pcre header. */  the external pcre header. */
# Line 40  the external pcre header. */ Line 53  the external pcre header. */
53  #include "internal.h"  #include "internal.h"
54    
55    
56    /* Allow compilation as C++ source code, should anybody want to do that. */
57    
58    #ifdef __cplusplus
59    #define class pcre_class
60    #endif
61    
62    
63    /* Number of items on the nested bracket stacks at compile time. This should
64    not be set greater than 200. */
65    
66    #define BRASTACK_SIZE 200
67    
68    
69  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70    
71  static char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72  static char rep_max[] = { 0, 0, 0, 0, 1, 1 };  static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73    
74  /* Text forms of OP_ values and things, for debugging */  /* Text forms of OP_ values and things, for debugging (not all used) */
75    
76  #ifdef DEBUG  #ifdef DEBUG
77  static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d",  static const char *OP_names[] = {
78    "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",    "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79    "not",    "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80      "Opt", "^", "$", "Any", "chars", "not",
81    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
85    "class", "Ref",    "class", "Ref", "Recurse",
86    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87      "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Bra"
89  };  };
90  #endif  #endif
# Line 66  are simple data values; negative values Line 94  are simple data values; negative values
94  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
95  is invalid. */  is invalid. */
96    
97  static short int escapes[] = {  static const short int escapes[] = {
98      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
99      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
100    '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */    '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
# Line 76  static short int escapes[] = { Line 104  static short int escapes[] = {
104    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */
105      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
106      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */
107      0,      0,      0                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
108  };  };
109    
110  /* Definition to allow mutual recursion */  /* Tables of names of POSIX character classes and their lengths. The list is
111    terminated by a zero length entry. The first three must be alpha, upper, lower,
112    as this is assumed for handling case independence. */
113    
114    static const char *posix_names[] = {
115      "alpha", "lower", "upper",
116      "alnum", "ascii", "cntrl", "digit", "graph",
117      "print", "punct", "space", "word",  "xdigit" };
118    
119    static const uschar posix_name_lengths[] = {
120      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
121    
122    /* Table of class bit maps for each POSIX class; up to three may be combined
123    to form the class. */
124    
125    static const int posix_class_maps[] = {
126      cbit_lower, cbit_upper, -1,             /* alpha */
127      cbit_lower, -1,         -1,             /* lower */
128      cbit_upper, -1,         -1,             /* upper */
129      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
130      cbit_print, cbit_cntrl, -1,             /* ascii */
131      cbit_cntrl, -1,         -1,             /* cntrl */
132      cbit_digit, -1,         -1,             /* digit */
133      cbit_graph, -1,         -1,             /* graph */
134      cbit_print, -1,         -1,             /* print */
135      cbit_punct, -1,         -1,             /* punct */
136      cbit_space, -1,         -1,             /* space */
137      cbit_word,  -1,         -1,             /* word */
138      cbit_xdigit,-1,         -1              /* xdigit */
139    };
140    
 static BOOL compile_regex(int, int *,uschar **,uschar **,char **);  
141    
142  /* Structure for passing "static" information around between the functions  /* Definition to allow mutual recursion */
 doing the matching, so that they are thread-safe. */  
143    
144  typedef struct match_data {  static BOOL
145    int    errorcode;             /* As it says */    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
146    int   *offset_vector;         /* Offset vector */      BOOL, int, int *, int *, compile_data *);
   int    offset_end;            /* One past the end */  
   BOOL   offset_overflow;       /* Set if too many extractions */  
   BOOL   caseless;              /* Case-independent flag */  
   BOOL   runtime_caseless;      /* Caseless forced at run time */  
   BOOL   multiline;             /* Multiline flag */  
   BOOL   notbol;                /* NOTBOL flag */  
   BOOL   noteol;                /* NOTEOL flag */  
   BOOL   dotall;                /* Dot matches any char */  
   BOOL   endonly;               /* Dollar not before final \n */  
   uschar *start_subject;        /* Start of the subject string */  
   uschar *end_subject;          /* End of the subject string */  
   jmp_buf fail_env;             /* Environment for longjump() break out */  
   uschar *end_match_ptr;        /* Subject position at end match */  
   int     end_offset_top;       /* Highwater mark at end of match */  
 } match_data;  
147    
148    
149    
# Line 123  void  (*pcre_free)(void *) = free; Line 163  void  (*pcre_free)(void *) = free;
163    
164    
165  /*************************************************  /*************************************************
166    *             Default character tables           *
167    *************************************************/
168    
169    /* A default set of character tables is included in the PCRE binary. Its source
170    is built by the maketables auxiliary program, which uses the default C ctypes
171    functions, and put in the file chartables.c. These tables are used by PCRE
172    whenever the caller of pcre_compile() does not provide an alternate set of
173    tables. */
174    
175    #include "chartables.c"
176    
177    
178    
179    /*************************************************
180  *          Return version string                 *  *          Return version string                 *
181  *************************************************/  *************************************************/
182    
183  char *  #define STRING(a)  # a
184    #define XSTRING(s) STRING(s)
185    
186    const char *
187  pcre_version(void)  pcre_version(void)
188  {  {
189  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
190  }  }
191    
192    
193    
194    
195  /*************************************************  /*************************************************
196  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
197  *************************************************/  *************************************************/
198    
199  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
200  structure.  of the private structure, but its interface was too rigid. It remains for
201    backwards compatibility. The public options are passed back in an int - though
202    the re->options field has been expanded to a long int, all the public options
203    are at the low end of it, and so even on 16-bit systems this will still be OK.
204    Therefore, I haven't changed the API for pcre_info().
205    
206  Arguments:  Arguments:
207    external_re   points to compiled code    external_re   points to compiled code
# Line 149  Arguments: Line 210  Arguments:
210                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
211                  or -2 otherwise                  or -2 otherwise
212    
213  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
214                  or negative values on error                  or negative values on error
215  */  */
216    
217  int  int
218  pcre_info(const pcre *external_re, int *optptr, int *first_char)  pcre_info(const pcre *external_re, int *optptr, int *first_char)
219  {  {
220  real_pcre *re = (real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
221  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
222  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
223  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
224  if (first_char != NULL)  if (first_char != NULL)
225    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
226       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 168  return re->top_bracket; Line 229  return re->top_bracket;
229    
230    
231    
232    /*************************************************
233    *        Return info about compiled pattern      *
234    *************************************************/
235    
236    /* This is a newer "info" function which has an extensible interface so
237    that additional items can be added compatibly.
238    
239    Arguments:
240      external_re      points to compiled code
241      external_study   points to study data, or NULL
242      what             what information is required
243      where            where to put the information
244    
245    Returns:           0 if data returned, negative on error
246    */
247    
248    int
249    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
250      void *where)
251    {
252    const real_pcre *re = (const real_pcre *)external_re;
253    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
254    
255    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
256    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
257    
258    switch (what)
259      {
260      case PCRE_INFO_OPTIONS:
261      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
262      break;
263    
264      case PCRE_INFO_SIZE:
265      *((size_t *)where) = re->size;
266      break;
267    
268      case PCRE_INFO_CAPTURECOUNT:
269      *((int *)where) = re->top_bracket;
270      break;
271    
272      case PCRE_INFO_BACKREFMAX:
273      *((int *)where) = re->top_backref;
274      break;
275    
276      case PCRE_INFO_FIRSTCHAR:
277      *((int *)where) =
278        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
279        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
280      break;
281    
282      case PCRE_INFO_FIRSTTABLE:
283      *((const uschar **)where) =
284        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
285          study->start_bits : NULL;
286      break;
287    
288      case PCRE_INFO_LASTLITERAL:
289      *((int *)where) =
290        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
291      break;
292    
293      default: return PCRE_ERROR_BADOPTION;
294      }
295    
296    return 0;
297    }
298    
299    
300    
301  #ifdef DEBUG  #ifdef DEBUG
302  /*************************************************  /*************************************************
# Line 186  Arguments: Line 315  Arguments:
315  Returns:     nothing  Returns:     nothing
316  */  */
317    
318  static pchars(uschar *p, int length, BOOL is_subject, match_data *md)  static void
319    pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
320  {  {
321  int c;  int c;
322  if (is_subject && length > md->end_subject - p) length = md->end_subject - p;  if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
# Line 199  while (length-- > 0) Line 329  while (length-- > 0)
329    
330    
331  /*************************************************  /*************************************************
 *         Check subpattern for empty operand     *  
 *************************************************/  
   
 /* This function checks a bracketed subpattern to see if any of the paths  
 through it could match an empty string. This is used to diagnose an error if  
 such a subpattern is followed by a quantifier with an unlimited upper bound.  
   
 Argument:  
   code      points to the opening bracket  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 could_be_empty(uschar *code)  
 {  
 do {  
   uschar *cc = code + 3;  
   
   /* Scan along the opcodes for this branch; as soon as we find something  
   that matches a non-empty string, break out and advance to test the next  
   branch. If we get to the end of the branch, return TRUE for the whole  
   sub-expression. */  
   
   for (;;)  
     {  
     /* Test an embedded subpattern; if it could not be empty, break the  
     loop. Otherwise carry on in the branch. */  
   
     if ((int)(*cc) >= OP_BRA)  
       {  
       if (!could_be_empty(cc)) break;  
       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);  
       cc += 3;  
       }  
   
     else switch (*cc)  
       {  
       /* Reached end of a branch: the subpattern may match the empty string */  
   
       case OP_ALT:  
       case OP_KET:  
       case OP_KETRMAX:  
       case OP_KETRMIN:  
       return TRUE;  
   
       /* Skip over assertive subpatterns */  
   
       case OP_ASSERT:  
       case OP_ASSERT_NOT:  
       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);  
       cc += 3;  
       break;  
   
       /* Skip over things that don't match chars */  
   
       case OP_SOD:  
       case OP_EOD:  
       case OP_CIRC:  
       case OP_DOLL:  
       case OP_BRAZERO:  
       case OP_BRAMINZERO:  
       case OP_NOT_WORD_BOUNDARY:  
       case OP_WORD_BOUNDARY:  
       cc++;  
       break;  
   
       /* Skip over simple repeats with zero lower bound */  
   
       case OP_STAR:  
       case OP_MINSTAR:  
       case OP_QUERY:  
       case OP_MINQUERY:  
       case OP_TYPESTAR:  
       case OP_TYPEMINSTAR:  
       case OP_TYPEQUERY:  
       case OP_TYPEMINQUERY:  
       cc += 2;  
       break;  
   
       /* Skip over UPTOs (lower bound is zero) */  
   
       case OP_UPTO:  
       case OP_MINUPTO:  
       case OP_TYPEUPTO:  
       case OP_TYPEMINUPTO:  
       cc += 4;  
       break;  
   
       /* Check a class or a back reference for a zero minimum */  
   
       case OP_CLASS:  
       case OP_REF:  
       cc += (*cc == OP_REF)? 2 : 4 + 2 * cc[2] + cc[3];  
   
       switch (*cc)  
         {  
         case OP_CRSTAR:  
         case OP_CRMINSTAR:  
         case OP_CRQUERY:  
         case OP_CRMINQUERY:  
         cc++;  
         break;  
   
         case OP_CRRANGE:  
         case OP_CRMINRANGE:  
         if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;  
         cc += 3;  
         break;  
   
         default:  
         goto NEXT_BRANCH;  
         }  
       break;  
   
       /* Anything else matches at least one character */  
   
       default:  
       goto NEXT_BRANCH;  
       }  
     }  
   
   NEXT_BRANCH:  
   code += (code[1] << 8) + code[2];  
   }  
 while (*code == OP_ALT);  
   
 /* No branches match the empty string */  
   
 return FALSE;  
 }  
   
   
   
 /*************************************************  
332  *            Handle escapes                      *  *            Handle escapes                      *
333  *************************************************/  *************************************************/
334    
# Line 349  Arguments: Line 344  Arguments:
344    bracount   number of previous extracting brackets    bracount   number of previous extracting brackets
345    options    the options bits    options    the options bits
346    isclass    TRUE if inside a character class    isclass    TRUE if inside a character class
347      cd         pointer to char tables block
348    
349  Returns:     zero or positive => a data character  Returns:     zero or positive => a data character
350               negative => a special escape sequence               negative => a special escape sequence
# Line 356  Returns:     zero or positive => a data Line 352  Returns:     zero or positive => a data
352  */  */
353    
354  static int  static int
355  check_escape(uschar **ptrptr, char **errorptr, int bracount, int options,  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
356    BOOL isclass)    int options, BOOL isclass, compile_data *cd)
357  {  {
358  uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
359  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
360    
361    c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
362  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
363    
364  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 378  else if ((i = escapes[c - '0']) != 0) c Line 374  else if ((i = escapes[c - '0']) != 0) c
374    
375  else  else
376    {    {
377    uschar *oldptr;    const uschar *oldptr;
378    switch (c)    switch (c)
379      {      {
380      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
# Line 400  else Line 396  else
396        {        {
397        oldptr = ptr;        oldptr = ptr;
398        c -= '0';        c -= '0';
399        while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)        while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
400          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
401        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
402          {          {
# Line 426  else Line 422  else
422    
423      case '0':      case '0':
424      c -= '0';      c -= '0';
425      while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
426        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
427          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
428      break;      break;
# Line 435  else Line 431  else
431    
432      case 'x':      case 'x':
433      c = 0;      c = 0;
434      while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
435        {        {
436        ptr++;        ptr++;
437        c = c * 16 + pcre_lcc[*ptr] -        c = c * 16 + cd->lcc[*ptr] -
438          (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');          (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
439        }        }
440      break;      break;
441    
# Line 453  else Line 449  else
449    
450      /* A letter is upper-cased; then the 0x40 bit is flipped */      /* A letter is upper-cased; then the 0x40 bit is flipped */
451    
452      if (c >= 'a' && c <= 'z') c = pcre_fcc[c];      if (c >= 'a' && c <= 'z') c = cd->fcc[c];
453      c ^= 0x40;      c ^= 0x40;
454      break;      break;
455    
456      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
457      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
458      for Perl compatibility, it is a literal. */      for Perl compatibility, it is a literal. This code looks a bit odd, but
459        there used to be some cases other than the default, and there may be again
460        in future, so I haven't "optimized" it. */
461    
462      default:      default:
463      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
464        {        {
       case 'X':  
       c = -ESC_X;      /* This could be a lookup if it ever got into Perl */  
       break;  
   
465        default:        default:
466        *errorptr = ERR3;        *errorptr = ERR3;
467        break;        break;
# Line 493  where the ddds are digits. Line 487  where the ddds are digits.
487    
488  Arguments:  Arguments:
489    p         pointer to the first char after '{'    p         pointer to the first char after '{'
490      cd        pointer to char tables block
491    
492  Returns:    TRUE or FALSE  Returns:    TRUE or FALSE
493  */  */
494    
495  static BOOL  static BOOL
496  is_counted_repeat(uschar *p)  is_counted_repeat(const uschar *p, compile_data *cd)
497  {  {
498  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
499  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
500  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
501    
502  if (*p++ != ',') return FALSE;  if (*p++ != ',') return FALSE;
503  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
504    
505  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
506  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
507  return (*p == '}');  return (*p == '}');
508  }  }
509    
# Line 528  Arguments: Line 523  Arguments:
523    maxp       pointer to int for max    maxp       pointer to int for max
524               returned as -1 if no max               returned as -1 if no max
525    errorptr   points to pointer to error message    errorptr   points to pointer to error message
526      cd         pointer to character tables block
527    
528  Returns:     pointer to '}' on success;  Returns:     pointer to '}' on success;
529               current ptr on error, with errorptr set               current ptr on error, with errorptr set
530  */  */
531    
532  static uschar *  static const uschar *
533  read_repeat_counts(uschar *p, int *minp, int *maxp, char **errorptr)  read_repeat_counts(const uschar *p, int *minp, int *maxp,
534      const char **errorptr, compile_data *cd)
535  {  {
536  int min = 0;  int min = 0;
537  int max = -1;  int max = -1;
538    
539  while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
540    
541  if (*p == '}') max = min; else  if (*p == '}') max = min; else
542    {    {
543    if (*(++p) != '}')    if (*(++p) != '}')
544      {      {
545      max = 0;      max = 0;
546      while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
547      if (max < min)      if (max < min)
548        {        {
549        *errorptr = ERR4;        *errorptr = ERR4;
# Line 571  return p; Line 568  return p;
568    
569    
570  /*************************************************  /*************************************************
571    *        Find the fixed length of a pattern      *
572    *************************************************/
573    
574    /* Scan a pattern and compute the fixed length of subject that will match it,
575    if the length is fixed. This is needed for dealing with backward assertions.
576    
577    Arguments:
578      code     points to the start of the pattern (the bracket)
579    
580    Returns:   the fixed length, or -1 if there is no fixed length
581    */
582    
583    static int
584    find_fixedlength(uschar *code)
585    {
586    int length = -1;
587    
588    register int branchlength = 0;
589    register uschar *cc = code + 3;
590    
591    /* Scan along the opcodes for this branch. If we get to the end of the
592    branch, check the length against that of the other branches. */
593    
594    for (;;)
595      {
596      int d;
597      register int op = *cc;
598      if (op >= OP_BRA) op = OP_BRA;
599    
600      switch (op)
601        {
602        case OP_BRA:
603        case OP_ONCE:
604        case OP_COND:
605        d = find_fixedlength(cc);
606        if (d < 0) return -1;
607        branchlength += d;
608        do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
609        cc += 3;
610        break;
611    
612        /* Reached end of a branch; if it's a ket it is the end of a nested
613        call. If it's ALT it is an alternation in a nested call. If it is
614        END it's the end of the outer call. All can be handled by the same code. */
615    
616        case OP_ALT:
617        case OP_KET:
618        case OP_KETRMAX:
619        case OP_KETRMIN:
620        case OP_END:
621        if (length < 0) length = branchlength;
622          else if (length != branchlength) return -1;
623        if (*cc != OP_ALT) return length;
624        cc += 3;
625        branchlength = 0;
626        break;
627    
628        /* Skip over assertive subpatterns */
629    
630        case OP_ASSERT:
631        case OP_ASSERT_NOT:
632        case OP_ASSERTBACK:
633        case OP_ASSERTBACK_NOT:
634        do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
635        cc += 3;
636        break;
637    
638        /* Skip over things that don't match chars */
639    
640        case OP_REVERSE:
641        cc++;
642        /* Fall through */
643    
644        case OP_CREF:
645        case OP_OPT:
646        cc++;
647        /* Fall through */
648    
649        case OP_SOD:
650        case OP_EOD:
651        case OP_EODN:
652        case OP_CIRC:
653        case OP_DOLL:
654        case OP_NOT_WORD_BOUNDARY:
655        case OP_WORD_BOUNDARY:
656        cc++;
657        break;
658    
659        /* Handle char strings */
660    
661        case OP_CHARS:
662        branchlength += *(++cc);
663        cc += *cc + 1;
664        break;
665    
666        /* Handle exact repetitions */
667    
668        case OP_EXACT:
669        case OP_TYPEEXACT:
670        branchlength += (cc[1] << 8) + cc[2];
671        cc += 4;
672        break;
673    
674        /* Handle single-char matchers */
675    
676        case OP_NOT_DIGIT:
677        case OP_DIGIT:
678        case OP_NOT_WHITESPACE:
679        case OP_WHITESPACE:
680        case OP_NOT_WORDCHAR:
681        case OP_WORDCHAR:
682        case OP_ANY:
683        branchlength++;
684        cc++;
685        break;
686    
687    
688        /* Check a class for variable quantification */
689    
690        case OP_CLASS:
691        cc += (*cc == OP_REF)? 2 : 33;
692    
693        switch (*cc)
694          {
695          case OP_CRSTAR:
696          case OP_CRMINSTAR:
697          case OP_CRQUERY:
698          case OP_CRMINQUERY:
699          return -1;
700    
701          case OP_CRRANGE:
702          case OP_CRMINRANGE:
703          if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
704          branchlength += (cc[1] << 8) + cc[2];
705          cc += 5;
706          break;
707    
708          default:
709          branchlength++;
710          }
711        break;
712    
713        /* Anything else is variable length */
714    
715        default:
716        return -1;
717        }
718      }
719    /* Control never gets here */
720    }
721    
722    
723    
724    
725    /*************************************************
726    *           Check for POSIX class syntax         *
727    *************************************************/
728    
729    /* This function is called when the sequence "[:" or "[." or "[=" is
730    encountered in a character class. It checks whether this is followed by an
731    optional ^ and then a sequence of letters, terminated by a matching ":]" or
732    ".]" or "=]".
733    
734    Arguments:
735      ptr      pointer to the initial [
736      endptr   where to return the end pointer
737      cd       pointer to compile data
738    
739    Returns:   TRUE or FALSE
740    */
741    
742    static BOOL
743    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
744    {
745    int terminator;          /* Don't combine these lines; the Solaris cc */
746    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
747    if (*(++ptr) == '^') ptr++;
748    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
749    if (*ptr == terminator && ptr[1] == ']')
750      {
751      *endptr = ptr;
752      return TRUE;
753      }
754    return FALSE;
755    }
756    
757    
758    
759    
760    /*************************************************
761    *          Check POSIX class name                *
762    *************************************************/
763    
764    /* This function is called to check the name given in a POSIX-style class entry
765    such as [:alnum:].
766    
767    Arguments:
768      ptr        points to the first letter
769      len        the length of the name
770    
771    Returns:     a value representing the name, or -1 if unknown
772    */
773    
774    static int
775    check_posix_name(const uschar *ptr, int len)
776    {
777    register int yield = 0;
778    while (posix_name_lengths[yield] != 0)
779      {
780      if (len == posix_name_lengths[yield] &&
781        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
782      yield++;
783      }
784    return -1;
785    }
786    
787    
788    
789    
790    /*************************************************
791  *           Compile one branch                   *  *           Compile one branch                   *
792  *************************************************/  *************************************************/
793    
794  /* Scan the pattern, compiling it into the code vector.  /* Scan the pattern, compiling it into the code vector.
795    
796  Arguments:  Arguments:
797    options    the option bits    options      the option bits
798    bracket    points to number of brackets used    brackets     points to number of brackets used
799    code       points to the pointer to the current code point    code         points to the pointer to the current code point
800    ptrptr     points to the current pattern pointer    ptrptr       points to the current pattern pointer
801    errorptr   points to pointer to error message    errorptr     points to pointer to error message
802      optchanged   set to the value of the last OP_OPT item compiled
803      reqchar      set to the last literal character required, else -1
804      countlits    set to count of mandatory literal characters
805      cd           contains pointers to tables
806    
807  Returns:     TRUE on success  Returns:       TRUE on success
808               FALSE, with *errorptr set on error                 FALSE, with *errorptr set on error
809  */  */
810    
811  static BOOL  static BOOL
812  compile_branch(int options, int *brackets, uschar **codeptr, uschar **ptrptr,  compile_branch(int options, int *brackets, uschar **codeptr,
813    char **errorptr)    const uschar **ptrptr, const char **errorptr, int *optchanged,
814      int *reqchar, int *countlits, compile_data *cd)
815  {  {
816  int repeat_type, op_type;  int repeat_type, op_type;
817  int repeat_min, repeat_max;  int repeat_min, repeat_max;
818  int bravalue, length;  int bravalue, length;
819    int greedy_default, greedy_non_default;
820    int prevreqchar;
821    int condcount = 0;
822    int subcountlits = 0;
823  register int c;  register int c;
824  register uschar *code = *codeptr;  register uschar *code = *codeptr;
825  uschar *ptr = *ptrptr;  uschar *tempcode;
826    const uschar *ptr = *ptrptr;
827    const uschar *tempptr;
828  uschar *previous = NULL;  uschar *previous = NULL;
 uschar *oldptr;  
829  uschar class[32];  uschar class[32];
830    
831    /* Set up the default and non-default settings for greediness */
832    
833    greedy_default = ((options & PCRE_UNGREEDY) != 0);
834    greedy_non_default = greedy_default ^ 1;
835    
836    /* Initialize no required char, and count of literals */
837    
838    *reqchar = prevreqchar = -1;
839    *countlits = 0;
840    
841  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
842    
843  for (;; ptr++)  for (;; ptr++)
844    {    {
845    BOOL negate_class;    BOOL negate_class;
846    int  class_charcount;    int class_charcount;
847    int  class_lastchar;    int class_lastchar;
848      int newoptions;
849      int condref;
850      int subreqchar;
851    
852    c = *ptr;    c = *ptr;
853    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
854      {      {
855      if ((pcre_ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
856      if (c == '#')      if (c == '#')
857        {        {
858        while ((c = *(++ptr)) != 0 && c != '\n');        while ((c = *(++ptr)) != 0 && c != '\n');
# Line 657  for (;; ptr++) Line 897  for (;; ptr++)
897      previous = code;      previous = code;
898      *code++ = OP_CLASS;      *code++ = OP_CLASS;
899    
900      /* If the first character is '^', set the negation flag */      /* If the first character is '^', set the negation flag and skip it. */
901    
902      if ((c = *(++ptr)) == '^')      if ((c = *(++ptr)) == '^')
903        {        {
# Line 690  for (;; ptr++) Line 930  for (;; ptr++)
930          goto FAILED;          goto FAILED;
931          }          }
932    
933          /* Handle POSIX class names. Perl allows a negation extension of the
934          form [:^name:]. A square bracket that doesn't match the syntax is
935          treated as a literal. We also recognize the POSIX constructions
936          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
937          5.6 does. */
938    
939          if (c == '[' &&
940              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
941              check_posix_syntax(ptr, &tempptr, cd))
942            {
943            BOOL local_negate = FALSE;
944            int posix_class, i;
945            register const uschar *cbits = cd->cbits;
946    
947            if (ptr[1] != ':')
948              {
949              *errorptr = ERR31;
950              goto FAILED;
951              }
952    
953            ptr += 2;
954            if (*ptr == '^')
955              {
956              local_negate = TRUE;
957              ptr++;
958              }
959    
960            posix_class = check_posix_name(ptr, tempptr - ptr);
961            if (posix_class < 0)
962              {
963              *errorptr = ERR30;
964              goto FAILED;
965              }
966    
967            /* If matching is caseless, upper and lower are converted to
968            alpha. This relies on the fact that the class table starts with
969            alpha, lower, upper as the first 3 entries. */
970    
971            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
972              posix_class = 0;
973    
974            /* Or into the map we are building up to 3 of the static class
975            tables, or their negations. */
976    
977            posix_class *= 3;
978            for (i = 0; i < 3; i++)
979              {
980              int taboffset = posix_class_maps[posix_class + i];
981              if (taboffset < 0) break;
982              if (local_negate)
983                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
984              else
985                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
986              }
987    
988            ptr = tempptr + 1;
989            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
990            continue;
991            }
992    
993        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
994        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
995        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
996        Inside a class (and only there) it is treated as backslash. Elsewhere        Inside a class (and only there) it is treated as backspace. Elsewhere
997        it marks a word boundary. Other escapes have preset maps ready to        it marks a word boundary. Other escapes have preset maps ready to
998        or into the one we are building. We assume they have more than one        or into the one we are building. We assume they have more than one
999        character in them, so set class_count bigger than one. */        character in them, so set class_count bigger than one. */
1000    
1001        if (c == '\\')        if (c == '\\')
1002          {          {
1003          c = check_escape(&ptr, errorptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1004          if (-c == ESC_b) c = '\b';          if (-c == ESC_b) c = '\b';
1005          else if (c < 0)          else if (c < 0)
1006            {            {
1007              register const uschar *cbits = cd->cbits;
1008            class_charcount = 10;            class_charcount = 10;
1009            switch (-c)            switch (-c)
1010              {              {
1011              case ESC_d:              case ESC_d:
1012              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1013              continue;              continue;
1014    
1015              case ESC_D:              case ESC_D:
1016              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1017              continue;              continue;
1018    
1019              case ESC_w:              case ESC_w:
1020              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);  
1021              continue;              continue;
1022    
1023              case ESC_W:              case ESC_W:
1024              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);  
1025              continue;              continue;
1026    
1027              case ESC_s:              case ESC_s:
1028              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1029              continue;              continue;
1030    
1031              case ESC_S:              case ESC_S:
1032              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1033              continue;              continue;
1034    
1035              default:              default:
# Line 762  for (;; ptr++) Line 1061  for (;; ptr++)
1061    
1062          if (d == '\\')          if (d == '\\')
1063            {            {
1064            d = check_escape(&ptr, errorptr, *brackets, options, TRUE);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1065            if (d < 0)            if (d < 0)
1066              {              {
1067              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
# Line 784  for (;; ptr++) Line 1083  for (;; ptr++)
1083            class[c/8] |= (1 << (c&7));            class[c/8] |= (1 << (c&7));
1084            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
1085              {              {
1086              int uc = pcre_fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
1087              class[uc/8] |= (1 << (uc&7));              class[uc/8] |= (1 << (uc&7));
1088              }              }
1089            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
# Line 799  for (;; ptr++) Line 1098  for (;; ptr++)
1098        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
1099        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
1100          {          {
1101          c = pcre_fcc[c];   /* flip case */          c = cd->fcc[c];   /* flip case */
1102          class[c/8] |= (1 << (c&7));          class[c/8] |= (1 << (c&7));
1103          }          }
1104        class_charcount++;        class_charcount++;
# Line 846  for (;; ptr++) Line 1145  for (;; ptr++)
1145      /* Various kinds of repeat */      /* Various kinds of repeat */
1146    
1147      case '{':      case '{':
1148      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1149      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1150      if (*errorptr != NULL) goto FAILED;      if (*errorptr != NULL) goto FAILED;
1151      goto REPEAT;      goto REPEAT;
1152    
# Line 872  for (;; ptr++) Line 1171  for (;; ptr++)
1171        goto FAILED;        goto FAILED;
1172        }        }
1173    
1174      /* If the next character is '?' this is a minimizing repeat. Advance to the      /* If the next character is '?' this is a minimizing repeat, by default,
1175        but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1176      next character. */      next character. */
1177    
1178      if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;      if (ptr[1] == '?')
1179          { repeat_type = greedy_non_default; ptr++; }
1180      /* If the maximum is zero then the minimum must also be zero; Perl allows      else repeat_type = greedy_default;
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
1181    
1182      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1183      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1184      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1185        out any reqchar setting, backing up to the previous value. We must also
1186        adjust the countlits value. */
1187    
1188      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1189        {        {
1190        int len = previous[1];        int len = previous[1];
1191    
1192          if (repeat_min == 0) *reqchar = prevreqchar;
1193          *countlits += repeat_min - 1;
1194    
1195        if (len == 1)        if (len == 1)
1196          {          {
1197          c = previous[2];          c = previous[2];
# Line 920  for (;; ptr++) Line 1223  for (;; ptr++)
1223      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
1224      repeats by adding a suitable offset into repeat_type. */      repeats by adding a suitable offset into repeat_type. */
1225    
1226      else if ((int)*previous < OP_EOD || *previous == OP_ANY)      else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1227        {        {
1228        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1229        c = *previous;        c = *previous;
1230        code = previous;        code = previous;
1231    
1232        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1233        repeat_type += op_type;      /* Combine both values for many cases */  
1234          /* If the maximum is zero then the minimum must also be zero; Perl allows
1235          this case, so we do too - by simply omitting the item altogether. */
1236    
1237          if (repeat_max == 0) goto END_REPEAT;
1238    
1239          /* Combine the op_type with the repeat_type */
1240    
1241          repeat_type += op_type;
1242    
1243        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1244        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 964  for (;; ptr++) Line 1275  for (;; ptr++)
1275          /* If the minimum is 1 and the previous item was a character string,          /* If the minimum is 1 and the previous item was a character string,
1276          we either have to put back the item that got cancelled if the string          we either have to put back the item that got cancelled if the string
1277          length was 1, or add the character back onto the end of a longer          length was 1, or add the character back onto the end of a longer
1278          string. For a character type nothing need be done; it will just get put          string. For a character type nothing need be done; it will just get
1279          back naturally. */          put back naturally. Note that the final character is always going to
1280            get added below. */
1281    
1282          else if (*previous == OP_CHARS)          else if (*previous == OP_CHARS)
1283            {            {
1284            if (code == previous) code += 2; else previous[1]++;            if (code == previous) code += 2; else previous[1]++;
1285            }            }
1286    
1287          /* Insert an UPTO if the max is greater than the min. */          /*  For a single negated character we also have to put back the
1288            item that got cancelled. */
1289    
1290            else if (*previous == OP_NOT) code++;
1291    
1292            /* If the maximum is unlimited, insert an OP_STAR. */
1293    
1294            if (repeat_max < 0)
1295              {
1296              *code++ = c;
1297              *code++ = OP_STAR + repeat_type;
1298              }
1299    
1300            /* Else insert an UPTO if the max is greater than the min. */
1301    
1302          if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
1303            {            {
1304            *code++ = c;            *code++ = c;
1305            repeat_max -= repeat_min;            repeat_max -= repeat_min;
# Line 990  for (;; ptr++) Line 1315  for (;; ptr++)
1315        }        }
1316    
1317      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1318      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1319    
1320      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1321        {        {
1322          if (repeat_max == 0)
1323            {
1324            code = previous;
1325            goto END_REPEAT;
1326            }
1327        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1328          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1329        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1012  for (;; ptr++) Line 1342  for (;; ptr++)
1342        }        }
1343    
1344      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
1345      cases. If the maximum repeat count is unlimited, check that the bracket      cases. */
     group cannot match the empty string, and diagnose an error if it can. */  
1346    
1347      else if ((int)*previous >= OP_BRA)      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1348                 (int)*previous == OP_COND)
1349        {        {
1350        int i;        register int i;
1351        int length = code - previous;        int ketoffset = 0;
1352          int len = code - previous;
1353          uschar *bralink = NULL;
1354    
1355          /* If the maximum repeat count is unlimited, find the end of the bracket
1356          by scanning through from the start, and compute the offset back to it
1357          from the current code pointer. There may be an OP_OPT setting following
1358          the final KET, so we can't find the end just by going back from the code
1359          pointer. */
1360    
1361          if (repeat_max == -1)
1362            {
1363            register uschar *ket = previous;
1364            do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1365            ketoffset = code - ket;
1366            }
1367    
1368          /* The case of a zero minimum is special because of the need to stick
1369          OP_BRAZERO in front of it, and because the group appears once in the
1370          data, whereas in other cases it appears the minimum number of times. For
1371          this reason, it is simplest to treat this case separately, as otherwise
1372          the code gets far too messy. There are several special subcases when the
1373          minimum is zero. */
1374    
1375        if (repeat_max == -1 && could_be_empty(previous))        if (repeat_min == 0)
1376          {          {
1377          *errorptr = ERR10;          /* If we set up a required char from the bracket, we must back off
1378          goto FAILED;          to the previous value and reset the countlits value too. */
         }  
1379    
1380        /* If the minimum is greater than zero, and the maximum is unlimited or          if (subcountlits > 0)
1381        equal to the minimum, the first copy remains where it is, and is            {
1382        replicated up to the minimum number of times. This case includes the +            *reqchar = prevreqchar;
1383        repeat, but of course no replication is needed in that case. */            *countlits -= subcountlits;
1384              }
1385    
1386        if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))          /* If the maximum is also zero, we just omit the group from the output
1387          {          altogether. */
1388          for (i = 1; i < repeat_min; i++)  
1389            if (repeat_max == 0)
1390            {            {
1391            memcpy(code, previous, length);            code = previous;
1392            code += length;            goto END_REPEAT;
1393            }            }
         }  
1394    
1395        /* If the minimum is zero, stick BRAZERO in front of the first copy.          /* If the maximum is 1 or unlimited, we just have to stick in the
1396        Then, if there is a fixed upper limit, replicated up to that many times,          BRAZERO and do no more at this point. */
       sticking BRAZERO in front of all the optional ones. */  
1397    
1398        else          if (repeat_max <= 1)
         {  
         if (repeat_min == 0)  
1399            {            {
1400            memmove(previous+1, previous, length);            memmove(previous+1, previous, len);
1401            code++;            code++;
1402            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
1403            }            }
1404    
1405            /* If the maximum is greater than 1 and limited, we have to replicate
1406            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1407            The first one has to be handled carefully because it's the original
1408            copy, which has to be moved up. The remainder can be handled by code
1409            that is common with the non-zero minimum case below. We just have to
1410            adjust the value of repeat_max, since one less copy is required. */
1411    
1412            else
1413              {
1414              int offset;
1415              memmove(previous+4, previous, len);
1416              code += 4;
1417              *previous++ = OP_BRAZERO + repeat_type;
1418              *previous++ = OP_BRA;
1419    
1420              /* We chain together the bracket offset fields that have to be
1421              filled in later when the ends of the brackets are reached. */
1422    
1423              offset = (bralink == NULL)? 0 : previous - bralink;
1424              bralink = previous;
1425              *previous++ = offset >> 8;
1426              *previous++ = offset & 255;
1427              }
1428    
1429            repeat_max--;
1430            }
1431    
1432          /* If the minimum is greater than zero, replicate the group as many
1433          times as necessary, and adjust the maximum to the number of subsequent
1434          copies that we need. */
1435    
1436          else
1437            {
1438          for (i = 1; i < repeat_min; i++)          for (i = 1; i < repeat_min; i++)
1439            {            {
1440            memcpy(code, previous, length);            memcpy(code, previous, len);
1441            code += length;            code += len;
1442            }            }
1443            if (repeat_max > 0) repeat_max -= repeat_min;
1444            }
1445    
1446          /* This code is common to both the zero and non-zero minimum cases. If
1447          the maximum is limited, it replicates the group in a nested fashion,
1448          remembering the bracket starts on a stack. In the case of a zero minimum,
1449          the first one was set up above. In all cases the repeat_max now specifies
1450          the number of additional copies needed. */
1451    
1452          for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)        if (repeat_max >= 0)
1453            {
1454            for (i = repeat_max - 1; i >= 0; i--)
1455            {            {
1456            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
1457            memcpy(code, previous, length);  
1458            code += length;            /* All but the final copy start a new nesting, maintaining the
1459              chain of brackets outstanding. */
1460    
1461              if (i != 0)
1462                {
1463                int offset;
1464                *code++ = OP_BRA;
1465                offset = (bralink == NULL)? 0 : code - bralink;
1466                bralink = code;
1467                *code++ = offset >> 8;
1468                *code++ = offset & 255;
1469                }
1470    
1471              memcpy(code, previous, len);
1472              code += len;
1473              }
1474    
1475            /* Now chain through the pending brackets, and fill in their length
1476            fields (which are holding the chain links pro tem). */
1477    
1478            while (bralink != NULL)
1479              {
1480              int oldlinkoffset;
1481              int offset = code - bralink + 1;
1482              uschar *bra = code - offset;
1483              oldlinkoffset = (bra[1] << 8) + bra[2];
1484              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1485              *code++ = OP_KET;
1486              *code++ = bra[1] = offset >> 8;
1487              *code++ = bra[2] = (offset & 255);
1488            }            }
1489          }          }
1490    
1491        /* If the maximum is unlimited, set a repeater in the final copy. */        /* If the maximum is unlimited, set a repeater in the final copy. We
1492          can't just offset backwards from the current code point, because we
1493          don't know if there's been an options resetting after the ket. The
1494          correct offset was computed above. */
1495    
1496        if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
1497        }        }
1498    
1499      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1082  for (;; ptr++) Line 1506  for (;; ptr++)
1506    
1507      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1508    
1509        END_REPEAT:
1510      previous = NULL;      previous = NULL;
1511      break;      break;
1512    
1513    
1514      /* Start of nested bracket sub-expression, or comment or lookahead.      /* Start of nested bracket sub-expression, or comment or lookahead or
1515      First deal with special things that can come after a bracket; all are      lookbehind or option setting or condition. First deal with special things
1516      introduced by ?, and the appearance of any of them means that this is not a      that can come after a bracket; all are introduced by ?, and the appearance
1517      referencing group. They were checked for validity in the first pass over      of any of them means that this is not a referencing group. They were
1518      the string, so we don't have to check for syntax errors here.  */      checked for validity in the first pass over the string, so we don't have to
1519        check for syntax errors here.  */
1520    
1521      case '(':      case '(':
1522      previous = code;              /* Only real brackets can be repeated */      newoptions = options;
1523        condref = -1;
1524    
1525      if (*(++ptr) == '?')      if (*(++ptr) == '?')
1526        {        {
1527        bravalue = OP_BRA;        int set, unset;
1528          int *optset;
1529    
1530        switch (*(++ptr))        switch (*(++ptr))
1531          {          {
1532          case '#':          case '#':                 /* Comment; skip to ket */
         case 'i':  
         case 'm':  
         case 's':  
         case 'x':  
1533          ptr++;          ptr++;
1534          while (*ptr != ')') ptr++;          while (*ptr != ')') ptr++;
         previous = NULL;  
1535          continue;          continue;
1536    
1537          case ':':                 /* Non-extracting bracket */          case ':':                 /* Non-extracting bracket */
1538          ptr++;          bravalue = OP_BRA;
1539            ptr++;
1540            break;
1541    
1542            case '(':
1543            bravalue = OP_COND;       /* Conditional group */
1544            if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1545              {
1546              condref = *ptr - '0';
1547              while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1548              ptr++;
1549              }
1550            else ptr--;
1551            break;
1552    
1553            case '=':                 /* Positive lookahead */
1554            bravalue = OP_ASSERT;
1555            ptr++;
1556            break;
1557    
1558            case '!':                 /* Negative lookahead */
1559            bravalue = OP_ASSERT_NOT;
1560            ptr++;
1561            break;
1562    
1563            case '<':                 /* Lookbehinds */
1564            switch (*(++ptr))
1565              {
1566              case '=':               /* Positive lookbehind */
1567              bravalue = OP_ASSERTBACK;
1568              ptr++;
1569              break;
1570    
1571              case '!':               /* Negative lookbehind */
1572              bravalue = OP_ASSERTBACK_NOT;
1573              ptr++;
1574              break;
1575    
1576              default:                /* Syntax error */
1577              *errorptr = ERR24;
1578              goto FAILED;
1579              }
1580          break;          break;
1581    
1582          case '=':                 /* Assertions can't be repeated */          case '>':                 /* One-time brackets */
1583          bravalue = OP_ASSERT;          bravalue = OP_ONCE;
1584          ptr++;          ptr++;
         previous = NULL;  
1585          break;          break;
1586    
1587          case '!':          case 'R':                 /* Pattern recursion */
1588          bravalue = OP_ASSERT_NOT;          *code++ = OP_RECURSE;
1589          ptr++;          ptr++;
1590          previous = NULL;          continue;
         break;  
1591    
1592          case '>':                         /* "Match once" brackets */          default:                  /* Option setting */
1593          if ((options & PCRE_EXTRA) != 0)  /* Not yet standard */          set = unset = 0;
1594            optset = &set;
1595    
1596            while (*ptr != ')' && *ptr != ':')
1597            {            {
1598            bravalue = OP_ONCE;            switch (*ptr++)
1599            ptr++;              {
1600            previous = NULL;              case '-': optset = &unset; break;
1601            break;  
1602                case 'i': *optset |= PCRE_CASELESS; break;
1603                case 'm': *optset |= PCRE_MULTILINE; break;
1604                case 's': *optset |= PCRE_DOTALL; break;
1605                case 'x': *optset |= PCRE_EXTENDED; break;
1606                case 'U': *optset |= PCRE_UNGREEDY; break;
1607                case 'X': *optset |= PCRE_EXTRA; break;
1608    
1609                default:
1610                *errorptr = ERR12;
1611                goto FAILED;
1612                }
1613            }            }
         /* Else fall through */  
1614    
1615          default:          /* Set up the changed option bits, but don't change anything yet. */
1616          *errorptr = ERR12;  
1617          goto FAILED;          newoptions = (options | set) & (~unset);
1618    
1619            /* If the options ended with ')' this is not the start of a nested
1620            group with option changes, so the options change at this level. At top
1621            level there is nothing else to be done (the options will in fact have
1622            been set from the start of compiling as a result of the first pass) but
1623            at an inner level we must compile code to change the ims options if
1624            necessary, and pass the new setting back so that it can be put at the
1625            start of any following branches, and when this group ends, a resetting
1626            item can be compiled. */
1627    
1628            if (*ptr == ')')
1629              {
1630              if ((options & PCRE_INGROUP) != 0 &&
1631                  (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1632                {
1633                *code++ = OP_OPT;
1634                *code++ = *optchanged = newoptions & PCRE_IMS;
1635                }
1636              options = newoptions;  /* Change options at this level */
1637              previous = NULL;       /* This item can't be repeated */
1638              continue;              /* It is complete */
1639              }
1640    
1641            /* If the options ended with ':' we are heading into a nested group
1642            with possible change of options. Such groups are non-capturing and are
1643            not assertions of any kind. All we need to do is skip over the ':';
1644            the newoptions value is handled below. */
1645    
1646            bravalue = OP_BRA;
1647            ptr++;
1648          }          }
1649        }        }
1650    
1651      /* Else we have a referencing group */      /* Else we have a referencing group; adjust the opcode. */
1652    
1653      else      else
1654        {        {
# Line 1154  for (;; ptr++) Line 1660  for (;; ptr++)
1660        bravalue = OP_BRA + *brackets;        bravalue = OP_BRA + *brackets;
1661        }        }
1662    
1663      /* Process nested bracketed re; at end pointer is on the bracket. We copy      /* Process nested bracketed re. Assertions may not be repeated, but other
1664      code into a non-register variable in order to be able to pass its address      kinds can be. We copy code into a non-register variable in order to be able
1665      because some compilers complain otherwise. */      to pass its address because some compilers complain otherwise. Pass in a
1666        new setting for the ims options if they have changed. */
1667    
1668        previous = (bravalue >= OP_ONCE)? code : NULL;
1669      *code = bravalue;      *code = bravalue;
1670        tempcode = code;
1671    
1672        if (!compile_regex(
1673             options | PCRE_INGROUP,       /* Set for all nested groups */
1674             ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1675               newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1676             brackets,                     /* Bracket level */
1677             &tempcode,                    /* Where to put code (updated) */
1678             &ptr,                         /* Input pointer (updated) */
1679             errorptr,                     /* Where to put an error message */
1680             (bravalue == OP_ASSERTBACK ||
1681              bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1682             condref,                      /* Condition reference number */
1683             &subreqchar,                  /* For possible last char */
1684             &subcountlits,                /* For literal count */
1685             cd))                          /* Tables block */
1686          goto FAILED;
1687    
1688        /* At the end of compiling, code is still pointing to the start of the
1689        group, while tempcode has been updated to point past the end of the group
1690        and any option resetting that may follow it. The pattern pointer (ptr)
1691        is on the bracket. */
1692    
1693        /* If this is a conditional bracket, check that there are no more than
1694        two branches in the group. */
1695    
1696        if (bravalue == OP_COND)
1697        {        {
1698        uschar *mcode = code;        uschar *tc = code;
1699        if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))        condcount = 0;
1700    
1701          do {
1702             condcount++;
1703             tc += (tc[1] << 8) | tc[2];
1704             }
1705          while (*tc != OP_KET);
1706    
1707          if (condcount > 2)
1708            {
1709            *errorptr = ERR27;
1710          goto FAILED;          goto FAILED;
1711        code = mcode;          }
1712          }
1713    
1714        /* Handle updating of the required character. If the subpattern didn't
1715        set one, leave it as it was. Otherwise, update it for normal brackets of
1716        all kinds, forward assertions, and conditions with two branches. Don't
1717        update the literal count for forward assertions, however. If the bracket
1718        is followed by a quantifier with zero repeat, we have to back off. Hence
1719        the definition of prevreqchar and subcountlits outside the main loop so
1720        that they can be accessed for the back off. */
1721    
1722        if (subreqchar > 0 &&
1723             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1724             (bravalue == OP_COND && condcount == 2)))
1725          {
1726          prevreqchar = *reqchar;
1727          *reqchar = subreqchar;
1728          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1729        }        }
1730    
1731        /* Now update the main code pointer to the end of the group. */
1732    
1733        code = tempcode;
1734    
1735        /* Error if hit end of pattern */
1736    
1737      if (*ptr != ')')      if (*ptr != ')')
1738        {        {
1739        *errorptr = ERR14;        *errorptr = ERR14;
# Line 1178  for (;; ptr++) Line 1746  for (;; ptr++)
1746      for validity in the pre-compiling pass. */      for validity in the pre-compiling pass. */
1747    
1748      case '\\':      case '\\':
1749      oldptr = ptr;      tempptr = ptr;
1750      c = check_escape(&ptr, errorptr, *brackets, options, FALSE);      c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1751    
1752      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1753      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
# Line 1192  for (;; ptr++) Line 1760  for (;; ptr++)
1760        {        {
1761        if (-c >= ESC_REF)        if (-c >= ESC_REF)
1762          {          {
         int refnum = -c - ESC_REF;  
         if (*brackets < refnum)  
           {  
           *errorptr = ERR15;  
           goto FAILED;  
           }  
1763          previous = code;          previous = code;
1764          *code++ = OP_REF;          *code++ = OP_REF;
1765          *code++ = refnum;          *code++ = -c - ESC_REF;
1766          }          }
1767        else        else
1768          {          {
1769          previous = (-c > ESC_b && -c < ESC_X)? code : NULL;          previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1770          *code++ = -c;          *code++ = -c;
1771          }          }
1772        continue;        continue;
1773        }        }
1774    
1775      /* Reset and fall through */      /* Data character: reset and fall through */
1776    
1777      ptr = oldptr;      ptr = tempptr;
1778      c = '\\';      c = '\\';
1779    
1780      /* Handle a run of data characters until a metacharacter is encountered.      /* Handle a run of data characters until a metacharacter is encountered.
# Line 1230  for (;; ptr++) Line 1792  for (;; ptr++)
1792        {        {
1793        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
1794          {          {
1795          if ((pcre_ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
1796          if (c == '#')          if (c == '#')
1797            {            {
1798            while ((c = *(++ptr)) != 0 && c != '\n');            while ((c = *(++ptr)) != 0 && c != '\n');
# Line 1245  for (;; ptr++) Line 1807  for (;; ptr++)
1807    
1808        if (c == '\\')        if (c == '\\')
1809          {          {
1810          oldptr = ptr;          tempptr = ptr;
1811          c = check_escape(&ptr, errorptr, *brackets, options, FALSE);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1812          if (c < 0) { ptr = oldptr; break; }          if (c < 0) { ptr = tempptr; break; }
1813          }          }
1814    
1815        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1258  for (;; ptr++) Line 1820  for (;; ptr++)
1820    
1821      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
1822    
1823      while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1824    
1825        /* Update the last character and the count of literals */
1826    
1827        prevreqchar = (length > 1)? code[-2] : *reqchar;
1828        *reqchar = code[-1];
1829        *countlits += length;
1830    
1831      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
1832      the next state. */      the next state. */
1833    
1834      previous[1] = length;      previous[1] = length;
1835      ptr--;      if (length < 255) ptr--;
1836      break;      break;
1837      }      }
1838    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1288  return FALSE; Line 1856  return FALSE;
1856  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return
1857  it points to the closing bracket, or vertical bar, or end of string.  it points to the closing bracket, or vertical bar, or end of string.
1858  The code variable is pointing at the byte into which the BRA operator has been  The code variable is pointing at the byte into which the BRA operator has been
1859  stored.  stored. If the ims options are changed at the start (for a (?ims: group) or
1860    during any branch, we need to insert an OP_OPT item at the start of every
1861    following branch to ensure they get set correctly at run time, and also pass
1862    the new options into every subsequent branch compile.
1863    
1864  Argument:  Argument:
1865    options   the option bits    options     the option bits
1866    brackets  -> int containing the number of extracting brackets used    optchanged  new ims options to set as if (?ims) were at the start, or -1
1867    codeptr   -> the address of the current code pointer                 for no change
1868    ptrptr    -> the address of the current pattern pointer    brackets    -> int containing the number of extracting brackets used
1869    errorptr  -> pointer to error message    codeptr     -> the address of the current code pointer
1870      ptrptr      -> the address of the current pattern pointer
1871      errorptr    -> pointer to error message
1872      lookbehind  TRUE if this is a lookbehind assertion
1873      condref     > 0 for OP_CREF setting at start of conditional group
1874      reqchar     -> place to put the last required character, or a negative number
1875      countlits   -> place to put the shortest literal count of any branch
1876      cd          points to the data block with tables pointers
1877    
1878  Returns:    TRUE on success  Returns:      TRUE on success
1879  */  */
1880    
1881  static BOOL  static BOOL
1882  compile_regex(int options, int *brackets, uschar **codeptr, uschar **ptrptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1883    char **errorptr)    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1884      int *reqchar, int *countlits, compile_data *cd)
1885  {  {
1886  uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1887  uschar *code = *codeptr;  uschar *code = *codeptr;
1888    uschar *last_branch = code;
1889  uschar *start_bracket = code;  uschar *start_bracket = code;
1890    uschar *reverse_count = NULL;
1891    int oldoptions = options & PCRE_IMS;
1892    int branchreqchar, branchcountlits;
1893    
1894    *reqchar = -1;
1895    *countlits = INT_MAX;
1896    code += 3;
1897    
1898    /* At the start of a reference-based conditional group, insert the reference
1899    number as an OP_CREF item. */
1900    
1901    if (condref > 0)
1902      {
1903      *code++ = OP_CREF;
1904      *code++ = condref;
1905      }
1906    
1907    /* Loop for each alternative branch */
1908    
1909  for (;;)  for (;;)
1910    {    {
1911    int length;    int length;
   uschar *last_branch = code;  
1912    
1913    code += 3;    /* Handle change of options */
1914    if (!compile_branch(options, brackets, &code, &ptr, errorptr))  
1915      if (optchanged >= 0)
1916        {
1917        *code++ = OP_OPT;
1918        *code++ = optchanged;
1919        options = (options & ~PCRE_IMS) | optchanged;
1920        }
1921    
1922      /* Set up dummy OP_REVERSE if lookbehind assertion */
1923    
1924      if (lookbehind)
1925        {
1926        *code++ = OP_REVERSE;
1927        reverse_count = code;
1928        *code++ = 0;
1929        *code++ = 0;
1930        }
1931    
1932      /* Now compile the branch */
1933    
1934      if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1935          &branchreqchar, &branchcountlits, cd))
1936      {      {
1937      *ptrptr = ptr;      *ptrptr = ptr;
1938      return FALSE;      return FALSE;
# Line 1326  for (;;) Line 1944  for (;;)
1944    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
1945    last_branch[2] = length & 255;    last_branch[2] = length & 255;
1946    
1947      /* Save the last required character if all branches have the same; a current
1948      value of -1 means unset, while -2 means "no common last required char" (a
1949      previous branch had none, or two branches had different ones).  */
1950    
1951      if (*reqchar != -2)
1952        {
1953        if (branchreqchar >= 0)
1954          {
1955          if (*reqchar == -1) *reqchar = branchreqchar;
1956          else if (*reqchar != branchreqchar) *reqchar = -2;
1957          }
1958        else *reqchar = -2;
1959        }
1960    
1961      /* Keep the shortest literal count */
1962    
1963      if (branchcountlits < *countlits) *countlits = branchcountlits;
1964      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1965    
1966      /* If lookbehind, check that this branch matches a fixed-length string,
1967      and put the length into the OP_REVERSE item. Temporarily mark the end of
1968      the branch with OP_END. */
1969    
1970      if (lookbehind)
1971        {
1972        *code = OP_END;
1973        length = find_fixedlength(last_branch);
1974        DPRINTF(("fixed length = %d\n", length));
1975        if (length < 0)
1976          {
1977          *errorptr = ERR25;
1978          *ptrptr = ptr;
1979          return FALSE;
1980          }
1981        reverse_count[0] = (length >> 8);
1982        reverse_count[1] = length & 255;
1983        }
1984    
1985    /* Reached end of expression, either ')' or end of pattern. Insert a    /* Reached end of expression, either ')' or end of pattern. Insert a
1986    terminating ket and the length of the whole bracketed item, and return,    terminating ket and the length of the whole bracketed item, and return,
1987    leaving the pointer at the terminating char. */    leaving the pointer at the terminating char. If any of the ims options
1988      were changed inside the group, compile a resetting op-code following. */
1989    
1990    if (*ptr != '|')    if (*ptr != '|')
1991      {      {
# Line 1336  for (;;) Line 1993  for (;;)
1993      *code++ = OP_KET;      *code++ = OP_KET;
1994      *code++ = length >> 8;      *code++ = length >> 8;
1995      *code++ = length & 255;      *code++ = length & 255;
1996        if (optchanged >= 0)
1997          {
1998          *code++ = OP_OPT;
1999          *code++ = oldoptions;
2000          }
2001      *codeptr = code;      *codeptr = code;
2002      *ptrptr = ptr;      *ptrptr = ptr;
2003      return TRUE;      return TRUE;
# Line 1344  for (;;) Line 2006  for (;;)
2006    /* Another branch follows; insert an "or" node and advance the pointer. */    /* Another branch follows; insert an "or" node and advance the pointer. */
2007    
2008    *code = OP_ALT;    *code = OP_ALT;
2009      last_branch = code;
2010      code += 3;
2011    ptr++;    ptr++;
2012    }    }
2013  /* Control never reaches here */  /* Control never reaches here */
# Line 1351  for (;;) Line 2015  for (;;)
2015    
2016    
2017    
2018    
2019    /*************************************************
2020    *      Find first significant op code            *
2021    *************************************************/
2022    
2023    /* This is called by several functions that scan a compiled expression looking
2024    for a fixed first character, or an anchoring op code etc. It skips over things
2025    that do not influence this. For one application, a change of caseless option is
2026    important.
2027    
2028    Arguments:
2029      code       pointer to the start of the group
2030      options    pointer to external options
2031      optbit     the option bit whose changing is significant, or
2032                 zero if none are
2033      optstop    TRUE to return on option change, otherwise change the options
2034                   value and continue
2035    
2036    Returns:     pointer to the first significant opcode
2037    */
2038    
2039    static const uschar*
2040    first_significant_code(const uschar *code, int *options, int optbit,
2041      BOOL optstop)
2042    {
2043    for (;;)
2044      {
2045      switch ((int)*code)
2046        {
2047        case OP_OPT:
2048        if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2049          {
2050          if (optstop) return code;
2051          *options = (int)code[1];
2052          }
2053        code += 2;
2054        break;
2055    
2056        case OP_CREF:
2057        code += 2;
2058        break;
2059    
2060        case OP_WORD_BOUNDARY:
2061        case OP_NOT_WORD_BOUNDARY:
2062        code++;
2063        break;
2064    
2065        case OP_ASSERT_NOT:
2066        case OP_ASSERTBACK:
2067        case OP_ASSERTBACK_NOT:
2068        do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2069        code += 3;
2070        break;
2071    
2072        default:
2073        return code;
2074        }
2075      }
2076    /* Control never reaches here */
2077    }
2078    
2079    
2080    
2081    
2082  /*************************************************  /*************************************************
2083  *          Check for anchored expression         *  *          Check for anchored expression         *
2084  *************************************************/  *************************************************/
# Line 1361  all of whose alternatives start with OP_ Line 2089  all of whose alternatives start with OP_
2089  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
2090  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
2091    
2092  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2093  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
2094  trying them again.  so there is no point trying them again.
2095    
2096  Argument:  points to start of expression (the bracket)  Arguments:
2097  Returns:   TRUE or FALSE    code       points to start of expression (the bracket)
2098      options    points to the options setting
2099    
2100    Returns:     TRUE or FALSE
2101  */  */
2102    
2103  static BOOL  static BOOL
2104  is_anchored(register uschar *code, BOOL multiline)  is_anchored(register const uschar *code, int *options)
2105  {  {
2106  do {  do {
2107     int op = (int)code[3];     const uschar *scode = first_significant_code(code + 3, options,
2108     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)       PCRE_MULTILINE, FALSE);
2109       { if (!is_anchored(code+3, multiline)) return FALSE; }     register int op = *scode;
2110     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2111       { if (code[4] != OP_ANY) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
2112     else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2113                (*options & PCRE_DOTALL) != 0)
2114         { if (scode[1] != OP_ANY) return FALSE; }
2115       else if (op != OP_SOD &&
2116               ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2117         return FALSE;
2118     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2119     }     }
2120  while (*code == OP_ALT);  while (*code == OP_ALT);
# Line 1388  return TRUE; Line 2124  return TRUE;
2124    
2125    
2126  /*************************************************  /*************************************************
2127  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
2128  *************************************************/  *************************************************/
2129    
2130  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
2131  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
2132    matching and for non-DOTALL patterns that start with .* (which must start at
2133    the beginning or after \n).
2134    
2135  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
2136  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
2137  */  */
2138    
2139  static BOOL  static BOOL
2140  is_startline(uschar *code)  is_startline(const uschar *code)
2141  {  {
2142  do {  do {
2143     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)     const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2144       { if (!is_startline(code+3)) return FALSE; }     register int op = *scode;
2145     else if (code[3] != OP_CIRC) return FALSE;     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2146         { if (!is_startline(scode)) return FALSE; }
2147       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2148         { if (scode[1] != OP_ANY) return FALSE; }
2149       else if (op != OP_CIRC) return FALSE;
2150     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2151     }     }
2152  while (*code == OP_ALT);  while (*code == OP_ALT);
# Line 1423  Consider each alternative branch. If the Line 2165  Consider each alternative branch. If the
2165  a bracket all of whose alternatives start with the same char (recurse ad lib),  a bracket all of whose alternatives start with the same char (recurse ad lib),
2166  then we return that char, otherwise -1.  then we return that char, otherwise -1.
2167    
2168  Argument:  points to start of expression (the bracket)  Arguments:
2169  Returns:   -1 or the fixed first char    code       points to start of expression (the bracket)
2170      options    pointer to the options (used to check casing changes)
2171    
2172    Returns:     -1 or the fixed first char
2173  */  */
2174    
2175  static int  static int
2176  find_firstchar(uschar *code)  find_firstchar(const uschar *code, int *options)
2177  {  {
2178  register int c = -1;  register int c = -1;
2179  do  do {
2180    {     int d;
2181    register int charoffset = 4;     const uschar *scode = first_significant_code(code + 3, options,
2182         PCRE_CASELESS, TRUE);
2183    if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)     register int op = *scode;
2184      {  
2185      register int d;     if (op >= OP_BRA) op = OP_BRA;
2186      if ((d = find_firstchar(code+3)) < 0) return -1;  
2187      if (c < 0) c = d; else if (c != d) return -1;     switch(op)
2188      }       {
2189         default:
2190    else switch(code[3])       return -1;
2191      {  
2192      default:       case OP_BRA:
2193      return -1;       case OP_ASSERT:
2194         case OP_ONCE:
2195      case OP_EXACT:       /* Fall through */       case OP_COND:
2196      charoffset++;       if ((d = find_firstchar(scode, options)) < 0) return -1;
2197         if (c < 0) c = d; else if (c != d) return -1;
2198      case OP_CHARS:       /* Fall through */       break;
2199      charoffset++;  
2200         case OP_EXACT:       /* Fall through */
2201         scode++;
2202    
2203         case OP_CHARS:       /* Fall through */
2204         scode++;
2205    
2206         case OP_PLUS:
2207         case OP_MINPLUS:
2208         if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2209         break;
2210         }
2211    
2212      case OP_PLUS:     code += (code[1] << 8) + code[2];
2213      case OP_MINPLUS:     }
     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;  
     break;  
     }  
   code += (code[1] << 8) + code[2];  
   }  
2214  while (*code == OP_ALT);  while (*code == OP_ALT);
2215  return c;  return c;
2216  }  }
2217    
2218    
2219    
2220    
2221    
2222  /*************************************************  /*************************************************
2223  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
2224  *************************************************/  *************************************************/
# Line 1478  Arguments: Line 2231  Arguments:
2231    options      various option bits    options      various option bits
2232    errorptr     pointer to pointer to error text    errorptr     pointer to pointer to error text
2233    erroroffset  ptr offset in pattern where error was detected    erroroffset  ptr offset in pattern where error was detected
2234      tables       pointer to character tables or NULL
2235    
2236  Returns:       pointer to compiled data block, or NULL on error,  Returns:       pointer to compiled data block, or NULL on error,
2237                 with errorptr and erroroffset set                 with errorptr and erroroffset set
2238  */  */
2239    
2240  pcre *  pcre *
2241  pcre_compile(const char *pattern, int options, char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
2242    int *erroroffset)    int *erroroffset, const unsigned char *tables)
2243  {  {
2244  real_pcre *re;  real_pcre *re;
 int spaces = 0;  
2245  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2246  int runlength;  int runlength;
2247  int c, size;  int c, reqchar, countlits;
2248  int bracount = 0;  int bracount = 0;
 int brastack[200];  
 int brastackptr = 0;  
2249  int top_backref = 0;  int top_backref = 0;
2250  uschar *code, *ptr;  int branch_extra = 0;
2251    int branch_newextra;
2252    unsigned int brastackptr = 0;
2253    size_t size;
2254    uschar *code;
2255    const uschar *ptr;
2256    compile_data compile_block;
2257    int brastack[BRASTACK_SIZE];
2258    uschar bralenstack[BRASTACK_SIZE];
2259    
2260  #ifdef DEBUG  #ifdef DEBUG
2261  uschar *code_base, *code_end;  uschar *code_base, *code_end;
# Line 1523  if ((options & ~PUBLIC_OPTIONS) != 0) Line 2282  if ((options & ~PUBLIC_OPTIONS) != 0)
2282    return NULL;    return NULL;
2283    }    }
2284    
2285  #ifdef DEBUG  /* Set up pointers to the individual character tables */
2286  printf("------------------------------------------------------------------\n");  
2287  printf("%s\n", pattern);  if (tables == NULL) tables = pcre_default_tables;
2288  #endif  compile_block.lcc = tables + lcc_offset;
2289    compile_block.fcc = tables + fcc_offset;
2290    compile_block.cbits = tables + cbits_offset;
2291    compile_block.ctypes = tables + ctypes_offset;
2292    
2293    /* Reflect pattern for debugging output */
2294    
2295    DPRINTF(("------------------------------------------------------------------\n"));
2296    DPRINTF(("%s\n", pattern));
2297    
2298  /* The first thing to do is to make a pass over the pattern to compute the  /* The first thing to do is to make a pass over the pattern to compute the
2299  amount of store required to hold the compiled code. This does not have to be  amount of store required to hold the compiled code. This does not have to be
# Line 1535  internal flag settings. Make an attempt Line 2302  internal flag settings. Make an attempt
2302  if an "extended" flag setting appears late in the pattern. We can't be so  if an "extended" flag setting appears late in the pattern. We can't be so
2303  clever for #-comments. */  clever for #-comments. */
2304    
2305  ptr = (uschar *)(pattern - 1);  ptr = (const uschar *)(pattern - 1);
2306  while ((c = *(++ptr)) != 0)  while ((c = *(++ptr)) != 0)
2307    {    {
2308    int min, max;    int min, max;
2309    int class_charcount;    int class_charcount;
2310    
2311    if ((pcre_ctypes[c] & ctype_space) != 0)    if ((options & PCRE_EXTENDED) != 0)
     {  
     if ((options & PCRE_EXTENDED) != 0) continue;  
     spaces++;  
     }  
   
   if (c == '#' && (options & PCRE_EXTENDED) != 0)  
2312      {      {
2313      while ((c = *(++ptr)) != 0 && c != '\n');      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2314      continue;      if (c == '#')
2315          {
2316          while ((c = *(++ptr)) != 0 && c != '\n');
2317          continue;
2318          }
2319      }      }
2320    
2321    switch(c)    switch(c)
# Line 1562  while ((c = *(++ptr)) != 0) Line 2327  while ((c = *(++ptr)) != 0)
2327    
2328      case '\\':      case '\\':
2329        {        {
2330        uschar *save_ptr = ptr;        const uschar *save_ptr = ptr;
2331        c = check_escape(&ptr, errorptr, bracount, options, FALSE);        c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2332        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2333        if (c >= 0)        if (c >= 0)
2334          {          {
# Line 1583  while ((c = *(++ptr)) != 0) Line 2348  while ((c = *(++ptr)) != 0)
2348        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
2349        if (refnum > top_backref) top_backref = refnum;        if (refnum > top_backref) top_backref = refnum;
2350        length++;   /* For single back reference */        length++;   /* For single back reference */
2351        if (ptr[1] == '{' && is_counted_repeat(ptr+2))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2352          {          {
2353          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2354          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2355          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2356            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1609  while ((c = *(++ptr)) != 0) Line 2374  while ((c = *(++ptr)) != 0)
2374      or back reference. */      or back reference. */
2375    
2376      case '{':      case '{':
2377      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2378      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2379      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2380      if ((min == 0 && (max == 1 || max == -1)) ||      if ((min == 0 && (max == 1 || max == -1)) ||
2381        (min == 1 && max == -1))        (min == 1 && max == -1))
# Line 1624  while ((c = *(++ptr)) != 0) Line 2389  while ((c = *(++ptr)) != 0)
2389      if (ptr[1] == '?') ptr++;      if (ptr[1] == '?') ptr++;
2390      continue;      continue;
2391    
2392      /* An alternation contains an offset to the next branch or ket. */      /* An alternation contains an offset to the next branch or ket. If any ims
2393        options changed in the previous branch(es), and/or if we are in a
2394        lookbehind assertion, extra space will be needed at the start of the
2395        branch. This is handled by branch_extra. */
2396    
2397      case '|':      case '|':
2398      length += 3;      length += 3 + branch_extra;
2399      continue;      continue;
2400    
2401      /* A character class uses 33 characters. Don't worry about character types      /* A character class uses 33 characters. Don't worry about character types
# Line 1641  while ((c = *(++ptr)) != 0) Line 2410  while ((c = *(++ptr)) != 0)
2410        {        {
2411        if (*ptr == '\\')        if (*ptr == '\\')
2412          {          {
2413          int c = check_escape(&ptr, errorptr, bracount, options, TRUE);          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2414              &compile_block);
2415          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2416          if (-c == ESC_b) class_charcount++; else class_charcount = 10;          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2417          }          }
2418        else class_charcount++;        else class_charcount++;
2419        ptr++;        ptr++;
# Line 1658  while ((c = *(++ptr)) != 0) Line 2428  while ((c = *(++ptr)) != 0)
2428    
2429        /* A repeat needs either 1 or 5 bytes. */        /* A repeat needs either 1 or 5 bytes. */
2430    
2431        if (ptr[1] == '{' && is_counted_repeat(ptr+2))        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2432          {          {
2433          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2434          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2435          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2436            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1674  while ((c = *(++ptr)) != 0) Line 2444  while ((c = *(++ptr)) != 0)
2444      /* Brackets may be genuine groups or special things */      /* Brackets may be genuine groups or special things */
2445    
2446      case '(':      case '(':
2447        branch_newextra = 0;
2448    
2449      /* Handle special forms of bracket, which all start (? */      /* Handle special forms of bracket, which all start (? */
2450    
2451      if (ptr[1] == '?') switch (c = ptr[2])      if (ptr[1] == '?')
2452        {        {
2453        /* Skip over comments entirely */        int set, unset;
2454        case '#':        int *optset;
       ptr += 3;  
       while (*ptr != 0 && *ptr != ')') ptr++;  
       if (*ptr == 0)  
         {  
         *errorptr = ERR18;  
         goto PCRE_ERROR_RETURN;  
         }  
       continue;  
2455    
2456        /* Non-referencing groups and lookaheads just move the pointer on, and        switch (c = ptr[2])
2457        then behave like a non-special bracket, except that they don't increment          {
2458        the count of extracting brackets. */          /* Skip over comments entirely */
2459            case '#':
2460        case ':':          ptr += 3;
2461        case '=':          while (*ptr != 0 && *ptr != ')') ptr++;
2462        case '!':          if (*ptr == 0)
2463        ptr += 2;            {
2464        break;            *errorptr = ERR18;
2465              goto PCRE_ERROR_RETURN;
2466              }
2467            continue;
2468    
2469        /* Ditto for the "once only" bracket, allowed only if the extra bit          /* Non-referencing groups and lookaheads just move the pointer on, and
2470        is set. */          then behave like a non-special bracket, except that they don't increment
2471            the count of extracting brackets. Ditto for the "once only" bracket,
2472            which is in Perl from version 5.005. */
2473    
2474        case '>':          case ':':
2475        if ((options & PCRE_EXTRA) != 0)          case '=':
2476          {          case '!':
2477            case '>':
2478          ptr += 2;          ptr += 2;
2479          break;          break;
         }  
       /* Else fall through */  
2480    
2481        /* Else loop setting valid options until ) is met. Anything else is an          /* A recursive call to the regex is an extension, to provide the
2482        error. */          facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2483    
2484        default:          case 'R':
2485        ptr += 2;          if (ptr[3] != ')')
       for (;; ptr++)  
         {  
         if ((c = *ptr) == 'i')  
2486            {            {
2487            options |= PCRE_CASELESS;            *errorptr = ERR29;
2488            continue;            goto PCRE_ERROR_RETURN;
2489              }
2490            ptr += 3;
2491            length += 1;
2492            break;
2493    
2494            /* Lookbehinds are in Perl from version 5.005 */
2495    
2496            case '<':
2497            if (ptr[3] == '=' || ptr[3] == '!')
2498              {
2499              ptr += 3;
2500              branch_newextra = 3;
2501              length += 3;         /* For the first branch */
2502              break;
2503              }
2504            *errorptr = ERR24;
2505            goto PCRE_ERROR_RETURN;
2506    
2507            /* Conditionals are in Perl from version 5.005. The bracket must either
2508            be followed by a number (for bracket reference) or by an assertion
2509            group. */
2510    
2511            case '(':
2512            if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2513              {
2514              ptr += 4;
2515              length += 2;
2516              while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2517              if (*ptr != ')')
2518                {
2519                *errorptr = ERR26;
2520                goto PCRE_ERROR_RETURN;
2521                }
2522            }            }
2523          else if ((c = *ptr) == 'm')          else   /* An assertion must follow */
2524            {            {
2525            options |= PCRE_MULTILINE;            ptr++;   /* Can treat like ':' as far as spacing is concerned */
2526            continue;  
2527              if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2528                {
2529                ptr += 2;    /* To get right offset in message */
2530                *errorptr = ERR28;
2531                goto PCRE_ERROR_RETURN;
2532                }
2533            }            }
2534          else if (c == 's')          break;
2535    
2536            /* Else loop checking valid options until ) is met. Anything else is an
2537            error. If we are without any brackets, i.e. at top level, the settings
2538            act as if specified in the options, so massage the options immediately.
2539            This is for backward compatibility with Perl 5.004. */
2540    
2541            default:
2542            set = unset = 0;
2543            optset = &set;
2544            ptr += 2;
2545    
2546            for (;; ptr++)
2547            {            {
2548            options |= PCRE_DOTALL;            c = *ptr;
2549            continue;            switch (c)
2550                {
2551                case 'i':
2552                *optset |= PCRE_CASELESS;
2553                continue;
2554    
2555                case 'm':
2556                *optset |= PCRE_MULTILINE;
2557                continue;
2558    
2559                case 's':
2560                *optset |= PCRE_DOTALL;
2561                continue;
2562    
2563                case 'x':
2564                *optset |= PCRE_EXTENDED;
2565                continue;
2566    
2567                case 'X':
2568                *optset |= PCRE_EXTRA;
2569                continue;
2570    
2571                case 'U':
2572                *optset |= PCRE_UNGREEDY;
2573                continue;
2574    
2575                case '-':
2576                optset = &unset;
2577                continue;
2578    
2579                /* A termination by ')' indicates an options-setting-only item;
2580                this is global at top level; otherwise nothing is done here and
2581                it is handled during the compiling process on a per-bracket-group
2582                basis. */
2583    
2584                case ')':
2585                if (brastackptr == 0)
2586                  {
2587                  options = (options | set) & (~unset);
2588                  set = unset = 0;     /* To save length */
2589                  }
2590                /* Fall through */
2591    
2592                /* A termination by ':' indicates the start of a nested group with
2593                the given options set. This is again handled at compile time, but
2594                we must allow for compiled space if any of the ims options are
2595                set. We also have to allow for resetting space at the end of
2596                the group, which is why 4 is added to the length and not just 2.
2597                If there are several changes of options within the same group, this
2598                will lead to an over-estimate on the length, but this shouldn't
2599                matter very much. We also have to allow for resetting options at
2600                the start of any alternations, which we do by setting
2601                branch_newextra to 2. Finally, we record whether the case-dependent
2602                flag ever changes within the regex. This is used by the "required
2603                character" code. */
2604    
2605                case ':':
2606                if (((set|unset) & PCRE_IMS) != 0)
2607                  {
2608                  length += 4;
2609                  branch_newextra = 2;
2610                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2611                  }
2612                goto END_OPTIONS;
2613    
2614                /* Unrecognized option character */
2615    
2616                default:
2617                *errorptr = ERR12;
2618                goto PCRE_ERROR_RETURN;
2619                }
2620            }            }
2621          else if (c == 'x')  
2622            /* If we hit a closing bracket, that's it - this is a freestanding
2623            option-setting. We need to ensure that branch_extra is updated if
2624            necessary. The only values branch_newextra can have here are 0 or 2.
2625            If the value is 2, then branch_extra must either be 2 or 5, depending
2626            on whether this is a lookbehind group or not. */
2627    
2628            END_OPTIONS:
2629            if (c == ')')
2630            {            {
2631            options |= PCRE_EXTENDED;            if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2632            length -= spaces;          /* Already counted spaces */              branch_extra += branch_newextra;
2633            continue;            continue;
2634            }            }
         else if (c == ')') break;  
2635    
2636          *errorptr = ERR12;          /* If options were terminated by ':' control comes here. Fall through
2637          goto PCRE_ERROR_RETURN;          to handle the group below. */
2638          }          }
       continue;                      /* End of this bracket handling */  
2639        }        }
2640    
2641      /* Extracting brackets must be counted so we can process escapes in a      /* Extracting brackets must be counted so we can process escapes in a
# Line 1753  while ((c = *(++ptr)) != 0) Line 2644  while ((c = *(++ptr)) != 0)
2644      else bracount++;      else bracount++;
2645    
2646      /* Non-special forms of bracket. Save length for computing whole length      /* Non-special forms of bracket. Save length for computing whole length
2647      at end if there's a repeat that requires duplication of the group. */      at end if there's a repeat that requires duplication of the group. Also
2648        save the current value of branch_extra, and start the new group with
2649        the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2650        for a lookbehind assertion. */
2651    
2652      if (brastackptr >= sizeof(brastack)/sizeof(int))      if (brastackptr >= sizeof(brastack)/sizeof(int))
2653        {        {
# Line 1761  while ((c = *(++ptr)) != 0) Line 2655  while ((c = *(++ptr)) != 0)
2655        goto PCRE_ERROR_RETURN;        goto PCRE_ERROR_RETURN;
2656        }        }
2657    
2658        bralenstack[brastackptr] = branch_extra;
2659        branch_extra = branch_newextra;
2660    
2661      brastack[brastackptr++] = length;      brastack[brastackptr++] = length;
2662      length += 3;      length += 3;
2663      continue;      continue;
2664    
2665      /* Handle ket. Look for subsequent max/min; for certain sets of values we      /* Handle ket. Look for subsequent max/min; for certain sets of values we
2666      have to replicate this bracket up to that many times. */      have to replicate this bracket up to that many times. If brastackptr is
2667        0 this is an unmatched bracket which will generate an error, but take care
2668        not to try to access brastack[-1] when computing the length and restoring
2669        the branch_extra value. */
2670    
2671      case ')':      case ')':
2672      length += 3;      length += 3;
2673        {        {
2674        int min = 1;        int minval = 1;
2675        int max = 1;        int maxval = 1;
2676        int duplength = length - brastack[--brastackptr];        int duplength;
2677    
2678          if (brastackptr > 0)
2679            {
2680            duplength = length - brastack[--brastackptr];
2681            branch_extra = bralenstack[brastackptr];
2682            }
2683          else duplength = 0;
2684    
2685          /* Leave ptr at the final char; for read_repeat_counts this happens
2686          automatically; for the others we need an increment. */
2687    
2688          if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2689            {
2690            ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2691              &compile_block);
2692            if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2693            }
2694          else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2695          else if (c == '+') { maxval = -1; ptr++; }
2696          else if (c == '?') { minval = 0; ptr++; }
2697    
2698          /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2699          group, and if the maximum is greater than zero, we have to replicate
2700          maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2701          bracket set - hence the 7. */
2702    
2703          if (minval == 0)
2704            {
2705            length++;
2706            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2707            }
2708    
2709        /* Leave ptr at the final char; for read_repeat_counts this happens        /* When the minimum is greater than zero, we have to replicate up to
2710        automatically; for the others we need an increment. */        minval-1 times, with no additions required in the copies. Then, if
2711          there is a limited maximum we have to replicate up to maxval-1 times
2712          allowing for a BRAZERO item before each optional copy and nesting
2713          brackets for all but one of the optional copies. */
2714    
2715        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))        else
2716          {          {
2717          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          length += (minval - 1) * duplength;
2718          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2719              length += (maxval - minval) * (duplength + 7) - 6;
2720          }          }
       else if (c == '*') { min = 0; max = -1; ptr++; }  
       else if (c == '+') { max = -1; ptr++; }  
       else if (c == '?') { min = 0; ptr++; }  
   
       /* If there is a minimum > 1 we have to replicate up to min-1 times; if  
       there is a limited maximum we have to replicate up to max-1 times and  
       allow for a BRAZERO item before each optional copy, as we also have to  
       do before the first copy if the minimum is zero. */  
   
       if (min == 0) length++;  
         else if (min > 1) length += (min - 1) * duplength;  
       if (max > min) length += (max - min) * (duplength + 1);  
2721        }        }
   
2722      continue;      continue;
2723    
2724      /* Non-special character. For a run of such characters the length required      /* Non-special character. For a run of such characters the length required
# Line 1810  while ((c = *(++ptr)) != 0) Line 2732  while ((c = *(++ptr)) != 0)
2732      runlength = 0;      runlength = 0;
2733      do      do
2734        {        {
2735        if ((pcre_ctypes[c] & ctype_space) != 0)        if ((options & PCRE_EXTENDED) != 0)
         {  
         if ((options & PCRE_EXTENDED) != 0) continue;  
         spaces++;  
         }  
   
       if (c == '#' && (options & PCRE_EXTENDED) != 0)  
2736          {          {
2737          while ((c = *(++ptr)) != 0 && c != '\n');          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2738          continue;          if (c == '#')
2739              {
2740              while ((c = *(++ptr)) != 0 && c != '\n');
2741              continue;
2742              }
2743          }          }
2744    
2745        /* Backslash may introduce a data char or a metacharacter; stop the        /* Backslash may introduce a data char or a metacharacter; stop the
# Line 1827  while ((c = *(++ptr)) != 0) Line 2747  while ((c = *(++ptr)) != 0)
2747    
2748        if (c == '\\')        if (c == '\\')
2749          {          {
2750          uschar *saveptr = ptr;          const uschar *saveptr = ptr;
2751          c = check_escape(&ptr, errorptr, bracount, options, FALSE);          c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2752              &compile_block);
2753          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2754          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
2755          }          }
# Line 1840  while ((c = *(++ptr)) != 0) Line 2761  while ((c = *(++ptr)) != 0)
2761    
2762      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2763    
2764      while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (runlength < 255 &&
2765          (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2766    
2767      ptr--;      ptr--;
2768      length += runlength;      length += runlength;
# Line 1857  if (length > 65539) Line 2779  if (length > 65539)
2779    }    }
2780    
2781  /* Compute the size of data block needed and get it, either from malloc or  /* Compute the size of data block needed and get it, either from malloc or
2782  externally provided function. Put in the magic number and the options. */  externally provided function. We specify "code[0]" in the offsetof() expression
2783    rather than just "code", because it has been reported that one broken compiler
2784    fails on "code" because it is also an independent variable. It should make no
2785    difference to the value of the offsetof(). */
2786    
2787  size = length + offsetof(real_pcre, code);  size = length + offsetof(real_pcre, code[0]);
2788  re = (real_pcre *)(pcre_malloc)(size);  re = (real_pcre *)(pcre_malloc)(size);
2789    
2790  if (re == NULL)  if (re == NULL)
# Line 1868  if (re == NULL) Line 2793  if (re == NULL)
2793    return NULL;    return NULL;
2794    }    }
2795    
2796    /* Put in the magic number, and save the size, options, and table pointer */
2797    
2798  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
2799    re->size = size;
2800  re->options = options;  re->options = options;
2801    re->tables = tables;
2802    
2803  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
2804  error, *errorptr will be set non-NULL, so we don't need to look at the result  error, *errorptr will be set non-NULL, so we don't need to look at the result
2805  of the function here. */  of the function here. */
2806    
2807  ptr = (uschar *)pattern;  ptr = (const uschar *)pattern;
2808  code = re->code;  code = re->code;
2809  *code = OP_BRA;  *code = OP_BRA;
2810  bracount = 0;  bracount = 0;
2811  (void)compile_regex(options, &bracount, &code, &ptr, errorptr);  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2812      &reqchar, &countlits, &compile_block);
2813  re->top_bracket = bracount;  re->top_bracket = bracount;
2814  re->top_backref = top_backref;  re->top_backref = top_backref;
2815    
# Line 1896  if debugging, leave the test till after Line 2826  if debugging, leave the test till after
2826  if (code - re->code > length) *errorptr = ERR23;  if (code - re->code > length) *errorptr = ERR23;
2827  #endif  #endif
2828    
2829    /* Give an error if there's a back reference to a non-existent capturing
2830    subpattern. */
2831    
2832    if (top_backref > re->top_bracket) *errorptr = ERR15;
2833    
2834  /* Failed to compile */  /* Failed to compile */
2835    
2836  if (*errorptr != NULL)  if (*errorptr != NULL)
2837    {    {
2838    (pcre_free)(re);    (pcre_free)(re);
2839    PCRE_ERROR_RETURN:    PCRE_ERROR_RETURN:
2840    *erroroffset = ptr - (uschar *)pattern;    *erroroffset = ptr - (const uschar *)pattern;
2841    return NULL;    return NULL;
2842    }    }
2843    
2844  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
2845  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
2846  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
2847  unanchored matches no end. In the case of multiline matches, an alternative is  
2848  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
2849    that speeds up unanchored matches no end. If not, see if we can set the
2850    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2851    start with ^, and also when all branches start with .* for non-DOTALL matches.
2852    */
2853    
2854  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
2855    {    {
2856    if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))    int temp_options = options;
2857      if (is_anchored(re->code, &temp_options))
2858      re->options |= PCRE_ANCHORED;      re->options |= PCRE_ANCHORED;
2859    else    else
2860      {      {
2861      int c = find_firstchar(re->code);      int ch = find_firstchar(re->code, &temp_options);
2862      if (c >= 0)      if (ch >= 0)
2863        {        {
2864        re->first_char = c;        re->first_char = ch;
2865        re->options |= PCRE_FIRSTSET;        re->options |= PCRE_FIRSTSET;
2866        }        }
2867      else if (is_startline(re->code))      else if (is_startline(re->code))
# Line 1929  if ((options & PCRE_ANCHORED) == 0) Line 2869  if ((options & PCRE_ANCHORED) == 0)
2869      }      }
2870    }    }
2871    
2872    /* Save the last required character if there are at least two literal
2873    characters on all paths, or if there is no first character setting. */
2874    
2875    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2876      {
2877      re->req_char = reqchar;
2878      re->options |= PCRE_REQCHSET;
2879      }
2880    
2881  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
2882    
2883  #ifdef DEBUG  #ifdef DEBUG
2884    
2885  printf("Length = %d top_bracket = %d top_backref=%d\n",  printf("Length = %d top_bracket = %d top_backref = %d\n",
2886    length, re->top_bracket, re->top_backref);    length, re->top_bracket, re->top_backref);
2887    
2888  if (re->options != 0)  if (re->options != 0)
2889    {    {
2890    printf("%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
2891      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2892      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2893        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2894      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2895      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2896      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2897      ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",      ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2898      ((re->options & PCRE_EXTRA) != 0)? "extra " : "");      ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2899        ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2900    }    }
2901    
2902  if ((re->options & PCRE_FIRSTSET) != 0)  if ((re->options & PCRE_FIRSTSET) != 0)
# Line 1954  if ((re->options & PCRE_FIRSTSET) != 0) Line 2905  if ((re->options & PCRE_FIRSTSET) != 0)
2905      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
2906    }    }
2907    
2908    if ((re->options & PCRE_REQCHSET) != 0)
2909      {
2910      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2911        else printf("Req char = \\x%02x\n", re->req_char);
2912      }
2913    
2914  code_end = code;  code_end = code;
2915  code_base = code = re->code;  code_base = code = re->code;
2916    
# Line 1971  while (code < code_end) Line 2928  while (code < code_end)
2928    
2929    else switch(*code)    else switch(*code)
2930      {      {
2931        case OP_OPT:
2932        printf(" %.2x %s", code[1], OP_names[*code]);
2933        code++;
2934        break;
2935    
2936        case OP_COND:
2937        printf("%3d Cond", (code[1] << 8) + code[2]);
2938        code += 2;
2939        break;
2940    
2941        case OP_CREF:
2942        printf(" %.2d %s", code[1], OP_names[*code]);
2943        code++;
2944        break;
2945    
2946      case OP_CHARS:      case OP_CHARS:
2947      charlength = *(++code);      charlength = *(++code);
2948      printf("%3d ", charlength);      printf("%3d ", charlength);
# Line 1984  while (code < code_end) Line 2956  while (code < code_end)
2956      case OP_KET:      case OP_KET:
2957      case OP_ASSERT:      case OP_ASSERT:
2958      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
2959        case OP_ASSERTBACK:
2960        case OP_ASSERTBACK_NOT:
2961      case OP_ONCE:      case OP_ONCE:
2962      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2963      code += 2;      code += 2;
2964      break;      break;
2965    
2966        case OP_REVERSE:
2967        printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2968        code += 2;
2969        break;
2970    
2971      case OP_STAR:      case OP_STAR:
2972      case OP_MINSTAR:      case OP_MINSTAR:
2973      case OP_PLUS:      case OP_PLUS:
# Line 2013  while (code < code_end) Line 2992  while (code < code_end)
2992      case OP_MINUPTO:      case OP_MINUPTO:
2993      if (isprint(c = code[3])) printf("    %c{", c);      if (isprint(c = code[3])) printf("    %c{", c);
2994        else printf("    \\x%02x{", c);        else printf("    \\x%02x{", c);
2995      if (*code != OP_EXACT) printf(",");      if (*code != OP_EXACT) printf("0,");
2996      printf("%d}", (code[1] << 8) + code[2]);      printf("%d}", (code[1] << 8) + code[2]);
2997      if (*code == OP_MINUPTO) printf("?");      if (*code == OP_MINUPTO) printf("?");
2998      code += 3;      code += 3;
# Line 2058  while (code < code_end) Line 3037  while (code < code_end)
3037    
3038      case OP_REF:      case OP_REF:
3039      printf("    \\%d", *(++code));      printf("    \\%d", *(++code));
3040      break;      code ++;
3041        goto CLASS_REF_REPEAT;
3042    
3043      case OP_CLASS:      case OP_CLASS:
3044        {        {
3045        int i, min, max;        int i, min, max;
   
3046        code++;        code++;
3047        printf("    [");        printf("    [");
3048    
# Line 2088  while (code < code_end) Line 3067  while (code < code_end)
3067        printf("]");        printf("]");
3068        code += 32;        code += 32;
3069    
3070          CLASS_REF_REPEAT:
3071    
3072        switch(*code)        switch(*code)
3073          {          {
3074          case OP_CRSTAR:          case OP_CRSTAR:
# Line 2145  return (pcre *)re; Line 3126  return (pcre *)re;
3126    
3127    
3128  /*************************************************  /*************************************************
 *        Match a character type                  *  
 *************************************************/  
   
 /* Not used in all the places it might be as it's sometimes faster  
 to put the code inline.  
   
 Arguments:  
   type        the character type  
   c           the character  
   dotall      the dotall flag  
   
 Returns:      TRUE if character is of the type  
 */  
   
 static BOOL  
 match_type(int type, int c, BOOL dotall)  
 {  
   
 #ifdef DEBUG  
 if (isprint(c)) printf("matching subject %c against ", c);  
   else printf("matching subject \\x%02x against ", c);  
 printf("%s\n", OP_names[type]);  
 #endif  
   
 switch(type)  
   {  
   case OP_ANY:            return dotall || c != '\n';  
   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;  
   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;  
   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;  
   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;  
   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;  
   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;  
   }  
 return FALSE;  
 }  
   
   
   
 /*************************************************  
3129  *          Match a back-reference                *  *          Match a back-reference                *
3130  *************************************************/  *************************************************/
3131    
3132  /* If a back reference hasn't been set, the match fails.  /* If a back reference hasn't been set, the length that is passed is greater
3133    than the number of characters left in the string, so the match fails.
3134    
3135  Arguments:  Arguments:
3136    number      reference number    offset      index into the offset vector
3137    eptr        points into the subject    eptr        points into the subject
3138    length      length to be matched    length      length to be matched
3139    md          points to match data block    md          points to match data block
3140      ims         the ims flags
3141    
3142  Returns:      TRUE if matched  Returns:      TRUE if matched
3143  */  */
3144    
3145  static BOOL  static BOOL
3146  match_ref(int number, register uschar *eptr, int length, match_data *md)  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3147      unsigned long int ims)
3148  {  {
3149  uschar *p = md->start_subject + md->offset_vector[number];  const uschar *p = md->start_subject + md->offset_vector[offset];
3150    
3151  #ifdef DEBUG  #ifdef DEBUG
3152  if (eptr >= md->end_subject)  if (eptr >= md->end_subject)
# Line 2219  printf("\n"); Line 3163  printf("\n");
3163    
3164  /* Always fail if not enough characters left */  /* Always fail if not enough characters left */
3165    
3166  if (length > md->end_subject - p) return FALSE;  if (length > md->end_subject - eptr) return FALSE;
3167    
3168  /* Separate the caseless case for speed */  /* Separate the caseless case for speed */
3169    
3170  if (md->caseless)  if ((ims & PCRE_CASELESS) != 0)
3171    { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }    {
3172      while (length-- > 0)
3173        if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3174      }
3175  else  else
3176    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3177    
# Line 2237  return TRUE; Line 3184  return TRUE;
3184  *         Match from current position            *  *         Match from current position            *
3185  *************************************************/  *************************************************/
3186    
3187  /* On entry ecode points to the first opcode, and eptr to the first character.  /* On entry ecode points to the first opcode, and eptr to the first character
3188    in the subject string, while eptrb holds the value of eptr at the start of the
3189    last bracketed group - used for breaking infinite loops matching zero-length
3190    strings.
3191    
3192  Arguments:  Arguments:
3193     eptr        pointer in subject     eptr        pointer in subject
3194     ecode       position in code     ecode       position in code
3195     offset_top  current top pointer     offset_top  current top pointer
3196     md          pointer to "static" info for the match     md          pointer to "static" info for the match
3197       ims         current /i, /m, and /s options
3198       condassert  TRUE if called to check a condition assertion
3199       eptrb       eptr at start of last bracket
3200    
3201  Returns:       TRUE if matched  Returns:       TRUE if matched
3202  */  */
3203    
3204  static BOOL  static BOOL
3205  match(register uschar *eptr, register uschar *ecode, int offset_top,  match(register const uschar *eptr, register const uschar *ecode,
3206    match_data *md)    int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
3207      const uschar *eptrb)
3208  {  {
3209    unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3210    
3211  for (;;)  for (;;)
3212    {    {
3213      int op = (int)*ecode;
3214    int min, max, ctype;    int min, max, ctype;
3215    register int i;    register int i;
3216    register int c;    register int c;
3217    BOOL minimize;    BOOL minimize = FALSE;
3218    
3219    /* Opening bracket. Check the alternative branches in turn, failing if none    /* Opening capturing bracket. If there is space in the offset vector, save
3220    match. We have to set the start offset if required and there is space    the current subject position in the working slot at the top of the vector. We
3221    in the offset vector so that it is available for subsequent back references    mustn't change the current values of the data slot, because they may be set
3222    if the bracket matches. However, if the bracket fails, we must put back the    from a previous iteration of this group, and be referred to by a reference
3223    previous value of both offsets in case they were set by a previous copy of    inside the group.
3224    the same bracket. Don't worry about setting the flag for the error case here;  
3225    that is handled in the code for KET. */    If the bracket fails to match, we need to restore this value and also the
3226      values of the final offsets, in case they were set by a previous iteration of
3227      the same bracket.
3228    
3229      If there isn't enough space in the offset vector, treat this as if it were a
3230      non-capturing bracket. Don't worry about setting the flag for the error case
3231      here; that is handled in the code for KET. */
3232    
3233    if ((int)*ecode >= OP_BRA)    if (op > OP_BRA)
3234      {      {
3235      int number = (*ecode - OP_BRA) << 1;      int number = op - OP_BRA;
3236      int save_offset1, save_offset2;      int offset = number << 1;
3237    
3238      #ifdef DEBUG  #ifdef DEBUG
3239      printf("start bracket %d\n", number/2);      printf("start bracket %d subject=", number);
3240      #endif      pchars(eptr, 16, TRUE, md);
3241        printf("\n");
3242    #endif
3243    
3244      if (number > 0 && number < md->offset_end)      if (offset < md->offset_max)
3245        {        {
3246        save_offset1 = md->offset_vector[number];        int save_offset1 = md->offset_vector[offset];
3247        save_offset2 = md->offset_vector[number+1];        int save_offset2 = md->offset_vector[offset+1];
3248        md->offset_vector[number] = eptr - md->start_subject;        int save_offset3 = md->offset_vector[md->offset_end - number];
3249    
3250          DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3251          md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3252    
3253          do
3254            {
3255            if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3256            ecode += (ecode[1] << 8) + ecode[2];
3257            }
3258          while (*ecode == OP_ALT);
3259    
3260          DPRINTF(("bracket %d failed\n", number));
3261    
3262        #ifdef DEBUG        md->offset_vector[offset] = save_offset1;
3263        printf("saving %d %d\n", save_offset1, save_offset2);        md->offset_vector[offset+1] = save_offset2;
3264        #endif        md->offset_vector[md->offset_end - number] = save_offset3;
3265          return FALSE;
3266        }        }
3267    
3268      /* Recurse for all the alternatives. */      /* Insufficient room for saving captured contents */
3269    
3270        else op = OP_BRA;
3271        }
3272    
3273      /* Other types of node can be handled by a switch */
3274    
3275      switch(op)
3276        {
3277        case OP_BRA:     /* Non-capturing bracket: optimized */
3278        DPRINTF(("start bracket 0\n"));
3279      do      do
3280        {        {
3281        if (match(eptr, ecode+3, offset_top, md)) return TRUE;        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3282        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3283        }        }
3284      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3285        DPRINTF(("bracket 0 failed\n"));
3286        return FALSE;
3287    
3288        /* Conditional group: compilation checked that there are no more than
3289        two branches. If the condition is false, skipping the first branch takes us
3290        past the end if there is only one branch, but that's OK because that is
3291        exactly what going to the ket would do. */
3292    
3293        case OP_COND:
3294        if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3295          {
3296          int offset = ecode[4] << 1;    /* Doubled reference number */
3297          return match(eptr,
3298            ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3299              5 : 3 + (ecode[1] << 8) + ecode[2]),
3300            offset_top, md, ims, FALSE, eptr);
3301          }
3302    
3303      #ifdef DEBUG      /* The condition is an assertion. Call match() to evaluate it - setting
3304      printf("bracket %d failed\n", number/2);      the condassert argument TRUE causes it to stop at the end of an assertion. */
     #endif  
3305    
3306      if (number > 0 && number < md->offset_end)      else
3307        {        {
3308        md->offset_vector[number] = save_offset1;        if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
3309        md->offset_vector[number+1] = save_offset2;          {
3310            ecode += 3 + (ecode[4] << 8) + ecode[5];
3311            while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3312            }
3313          else ecode += (ecode[1] << 8) + ecode[2];
3314          return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
3315        }        }
3316        /* Control never reaches here */
3317    
3318      return FALSE;      /* Skip over conditional reference data if encountered (should not be) */
     }  
3319    
3320    /* Other types of node can be handled by a switch */      case OP_CREF:
3321        ecode += 2;
3322        break;
3323    
3324        /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3325        an empty string - recursion will then try other alternatives, if any. */
3326    
   switch(*ecode)  
     {  
3327      case OP_END:      case OP_END:
3328        if (md->notempty && eptr == md->start_match) return FALSE;
3329      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3330      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3331      return TRUE;      return TRUE;
3332    
3333      /* The equivalent of Prolog's "cut" - if the rest doesn't match, the      /* Change option settings */
     whole thing doesn't match, so we have to get out via a longjmp(). */  
3334    
3335      case OP_CUT:      case OP_OPT:
3336      if (match(eptr, ecode+1, offset_top, md)) return TRUE;      ims = ecode[1];
3337      longjmp(md->fail_env, 1);      ecode += 2;
3338        DPRINTF(("ims set to %02lx\n", ims));
3339        break;
3340    
3341      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
3342      matching won't pass the KET for an assertion. If any one branch matches,      matching won't pass the KET for an assertion. If any one branch matches,
3343      the assertion is true. */      the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3344        start of each branch to move the current point backwards, so the code at
3345        this level is identical to the lookahead case. */
3346    
3347      case OP_ASSERT:      case OP_ASSERT:
3348        case OP_ASSERTBACK:
3349      do      do
3350        {        {
3351        if (match(eptr, ecode+3, offset_top, md)) break;        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3352        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3353        }        }
3354      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3355      if (*ecode == OP_KET) return FALSE;      if (*ecode == OP_KET) return FALSE;
3356    
3357        /* If checking an assertion for a condition, return TRUE. */
3358    
3359        if (condassert) return TRUE;
3360    
3361      /* Continue from after the assertion, updating the offsets high water      /* Continue from after the assertion, updating the offsets high water
3362      mark, since extracts may have been taken during the assertion. */      mark, since extracts may have been taken during the assertion. */
3363    
# Line 2349  for (;;) Line 3369  for (;;)
3369      /* Negative assertion: all branches must fail to match */      /* Negative assertion: all branches must fail to match */
3370    
3371      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
3372        case OP_ASSERTBACK_NOT:
3373      do      do
3374        {        {
3375        if (match(eptr, ecode+3, offset_top, md)) return FALSE;        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3376        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3377        }        }
3378      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3379    
3380        if (condassert) return TRUE;
3381      ecode += 3;      ecode += 3;
3382      continue;      continue;
3383    
3384        /* Move the subject pointer back. This occurs only at the start of
3385        each branch of a lookbehind assertion. If we are too close to the start to
3386        move back, this match function fails. */
3387    
3388        case OP_REVERSE:
3389        eptr -= (ecode[1] << 8) + ecode[2];
3390        if (eptr < md->start_subject) return FALSE;
3391        ecode += 3;
3392        break;
3393    
3394        /* Recursion matches the current regex, nested. If there are any capturing
3395        brackets started but not finished, we have to save their starting points
3396        and reinstate them after the recursion. However, we don't know how many
3397        such there are (offset_top records the completed total) so we just have
3398        to save all the potential data. There may be up to 99 such values, which
3399        is a bit large to put on the stack, but using malloc for small numbers
3400        seems expensive. As a compromise, the stack is used when there are fewer
3401        than 16 values to store; otherwise malloc is used. A problem is what to do
3402        if the malloc fails ... there is no way of returning to the top level with
3403        an error. Save the top 15 values on the stack, and accept that the rest
3404        may be wrong. */
3405    
3406        case OP_RECURSE:
3407          {
3408          BOOL rc;
3409          int *save;
3410          int stacksave[15];
3411    
3412          c = md->offset_max;
3413    
3414          if (c < 16) save = stacksave; else
3415            {
3416            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3417            if (save == NULL)
3418              {
3419              save = stacksave;
3420              c = 15;
3421              }
3422            }
3423    
3424          for (i = 1; i <= c; i++)
3425            save[i] = md->offset_vector[md->offset_end - i];
3426          rc = match(eptr, md->start_pattern, offset_top, md, ims, FALSE, eptrb);
3427          for (i = 1; i <= c; i++)
3428            md->offset_vector[md->offset_end - i] = save[i];
3429          if (save != stacksave) (pcre_free)(save);
3430          if (!rc) return FALSE;
3431    
3432          /* In case the recursion has set more capturing values, save the final
3433          number, then move along the subject till after the recursive match,
3434          and advance one byte in the pattern code. */
3435    
3436          offset_top = md->end_offset_top;
3437          eptr = md->end_match_ptr;
3438          ecode++;
3439          }
3440        break;
3441    
3442      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3443      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
3444      a back into the brackets. Check the alternative branches in turn - the      a move back into the brackets. Check the alternative branches in turn - the
3445      matching won't pass the KET for this kind of subpattern. If any one branch      matching won't pass the KET for this kind of subpattern. If any one branch
3446      matches, we carry on, leaving the subject pointer. */      matches, we carry on as at the end of a normal bracket, leaving the subject
3447        pointer. */
3448    
3449      case OP_ONCE:      case OP_ONCE:
     do  
3450        {        {
3451        if (match(eptr, ecode+3, offset_top, md)) break;        const uschar *prev = ecode;
3452        ecode += (ecode[1] << 8) + ecode[2];  
3453        }        do
3454      while (*ecode == OP_ALT);          {
3455      if (*ecode == OP_KET) return FALSE;          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3456            ecode += (ecode[1] << 8) + ecode[2];
3457            }
3458          while (*ecode == OP_ALT);
3459    
3460      /* Continue as from after the assertion, updating the offsets high water        /* If hit the end of the group (which could be repeated), fail */
     mark, since extracts may have been taken. */  
3461    
3462      do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);        if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3463      ecode += 3;  
3464      offset_top = md->end_offset_top;        /* Continue as from after the assertion, updating the offsets high water
3465      eptr = md->end_match_ptr;        mark, since extracts may have been taken. */
3466      continue;  
3467          do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3468    
3469          offset_top = md->end_offset_top;
3470          eptr = md->end_match_ptr;
3471    
3472          /* For a non-repeating ket, just continue at this level. This also
3473          happens for a repeating ket if no characters were matched in the group.
3474          This is the forcible breaking of infinite loops as implemented in Perl
3475          5.005. If there is an options reset, it will get obeyed in the normal
3476          course of events. */
3477    
3478          if (*ecode == OP_KET || eptr == eptrb)
3479            {
3480            ecode += 3;
3481            break;
3482            }
3483    
3484          /* The repeating kets try the rest of the pattern or restart from the
3485          preceding bracket, in the appropriate order. We need to reset any options
3486          that changed within the bracket before re-running it, so check the next
3487          opcode. */
3488    
3489          if (ecode[3] == OP_OPT)
3490            {
3491            ims = (ims & ~PCRE_IMS) | ecode[4];
3492            DPRINTF(("ims set to %02lx at group repeat\n", ims));
3493            }
3494    
3495          if (*ecode == OP_KETRMIN)
3496            {
3497            if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3498                match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3499            }
3500          else  /* OP_KETRMAX */
3501            {
3502            if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3503                match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3504            }
3505          }
3506        return FALSE;
3507    
3508      /* An alternation is the end of a branch; scan along to find the end of the      /* An alternation is the end of a branch; scan along to find the end of the
3509      bracketed group and go to there. */      bracketed group and go to there. */
# Line 2397  for (;;) Line 3520  for (;;)
3520    
3521      case OP_BRAZERO:      case OP_BRAZERO:
3522        {        {
3523        uschar *next = ecode+1;        const uschar *next = ecode+1;
3524        if (match(eptr, next, offset_top, md)) return TRUE;        if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3525        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3526        ecode = next + 3;        ecode = next + 3;
3527        }        }
# Line 2406  for (;;) Line 3529  for (;;)
3529    
3530      case OP_BRAMINZERO:      case OP_BRAMINZERO:
3531        {        {
3532        uschar *next = ecode+1;        const uschar *next = ecode+1;
3533        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3534        if (match(eptr, next+3, offset_top, md)) return TRUE;        if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3535        ecode++;        ecode++;
3536        }        }
3537      break;;      break;
3538    
3539      /* End of a group, repeated or non-repeating. If we are at the end of      /* End of a group, repeated or non-repeating. If we are at the end of
3540      an assertion "group", stop matching and return TRUE, but record the      an assertion "group", stop matching and return TRUE, but record the
3541      current high water mark for use by positive assertions. */      current high water mark for use by positive assertions. Do this also
3542        for the "once" (non-backing-up) groups. */
3543    
3544      case OP_KET:      case OP_KET:
3545      case OP_KETRMIN:      case OP_KETRMIN:
3546      case OP_KETRMAX:      case OP_KETRMAX:
3547        {        {
3548        int number;        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
       uschar *prev = ecode - (ecode[1] << 8) - ecode[2];  
3549    
3550        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3551              *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3552              *prev == OP_ONCE)
3553          {          {
3554          md->end_match_ptr = eptr;      /* For ONCE */          md->end_match_ptr = eptr;      /* For ONCE */
3555          md->end_offset_top = offset_top;          md->end_offset_top = offset_top;
3556          return TRUE;          return TRUE;
3557          }          }
3558    
3559        /* In all other cases we have to check the group number back at the        /* In all other cases except a conditional group we have to check the
3560        start and if necessary complete handling an extraction by setting the        group number back at the start and if necessary complete handling an
3561        final offset and bumping the high water mark. */        extraction by setting the offsets and bumping the high water mark. */
3562    
3563        number = (*prev - OP_BRA) << 1;        if (*prev != OP_COND)
3564            {
3565            int number = *prev - OP_BRA;
3566            int offset = number << 1;
3567    
3568        #ifdef DEBUG          DPRINTF(("end bracket %d\n", number));
       printf("end bracket %d\n", number/2);  
       #endif  
3569    
3570        if (number > 0)          if (number > 0)
         {  
         if (number >= md->offset_end) md->offset_overflow = TRUE; else  
3571            {            {
3572            md->offset_vector[number+1] = eptr - md->start_subject;            if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3573            if (offset_top <= number) offset_top = number + 2;              {
3574                md->offset_vector[offset] =
3575                  md->offset_vector[md->offset_end - number];
3576                md->offset_vector[offset+1] = eptr - md->start_subject;
3577                if (offset_top <= offset) offset_top = offset + 2;
3578                }
3579            }            }
3580          }          }
3581    
3582        /* For a non-repeating ket, just advance to the next node and continue at        /* Reset the value of the ims flags, in case they got changed during
3583        this level. */        the group. */
3584    
3585          ims = original_ims;
3586          DPRINTF(("ims reset to %02lx\n", ims));
3587    
3588        if (*ecode == OP_KET)        /* For a non-repeating ket, just continue at this level. This also
3589          happens for a repeating ket if no characters were matched in the group.
3590          This is the forcible breaking of infinite loops as implemented in Perl
3591          5.005. If there is an options reset, it will get obeyed in the normal
3592          course of events. */
3593    
3594          if (*ecode == OP_KET || eptr == eptrb)
3595          {          {
3596          ecode += 3;          ecode += 3;
3597          break;          break;
# Line 2464  for (;;) Line 3602  for (;;)
3602    
3603        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3604          {          {
3605          if (match(eptr, ecode+3, offset_top, md) ||          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3606              match(eptr, prev, offset_top, md)) return TRUE;              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3607          }          }
3608        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3609          {          {
3610          if (match(eptr, prev, offset_top, md) ||          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3611              match(eptr, ecode+3, offset_top, md)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3612          }          }
3613        }        }
3614      return FALSE;      return FALSE;
# Line 2479  for (;;) Line 3617  for (;;)
3617    
3618      case OP_CIRC:      case OP_CIRC:
3619      if (md->notbol && eptr == md->start_subject) return FALSE;      if (md->notbol && eptr == md->start_subject) return FALSE;
3620      if (md->multiline)      if ((ims & PCRE_MULTILINE) != 0)
3621        {        {
3622        if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;        if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3623        ecode++;        ecode++;
# Line 2494  for (;;) Line 3632  for (;;)
3632      ecode++;      ecode++;
3633      break;      break;
3634    
3635      /* Assert before internal newline if multiline, or before      /* Assert before internal newline if multiline, or before a terminating
3636      a terminating newline unless endonly is set, else end of subject unless      newline unless endonly is set, else end of subject unless noteol is set. */
     noteol is set. */  
3637    
3638      case OP_DOLL:      case OP_DOLL:
3639      if (md->noteol && eptr >= md->end_subject) return FALSE;      if ((ims & PCRE_MULTILINE) != 0)
     if (md->multiline)  
3640        {        {
3641        if (eptr < md->end_subject && *eptr != '\n') return FALSE;        if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3642            else { if (md->noteol) return FALSE; }
3643        ecode++;        ecode++;
3644        break;        break;
3645        }        }
3646      else if (!md->endonly)      else
3647        {        {
3648        if (eptr < md->end_subject - 1 ||        if (md->noteol) return FALSE;
3649           (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;        if (!md->endonly)
3650        ecode++;          {
3651        break;          if (eptr < md->end_subject - 1 ||
3652               (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3653    
3654            ecode++;
3655            break;
3656            }
3657        }        }
3658      /* ... else fall through */      /* ... else fall through */
3659    
3660      /* End of subject assertion */      /* End of subject assertion (\z) */
3661    
3662      case OP_EOD:      case OP_EOD:
3663      if (eptr < md->end_subject) return FALSE;      if (eptr < md->end_subject) return FALSE;
3664      ecode++;      ecode++;
3665      break;      break;
3666    
3667        /* End of subject or ending \n assertion (\Z) */
3668    
3669        case OP_EODN:
3670        if (eptr < md->end_subject - 1 ||
3671           (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3672        ecode++;
3673        break;
3674    
3675      /* Word boundary assertions */      /* Word boundary assertions */
3676    
3677      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
3678      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
3679        {        {
3680        BOOL prev_is_word = (eptr != md->start_subject) &&        BOOL prev_is_word = (eptr != md->start_subject) &&
3681          ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);          ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3682        BOOL cur_is_word = (eptr < md->end_subject) &&        BOOL cur_is_word = (eptr < md->end_subject) &&
3683          ((pcre_ctypes[*eptr] & ctype_word) != 0);          ((md->ctypes[*eptr] & ctype_word) != 0);
3684        if ((*ecode++ == OP_WORD_BOUNDARY)?        if ((*ecode++ == OP_WORD_BOUNDARY)?
3685             cur_is_word == prev_is_word : cur_is_word != prev_is_word)             cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3686          return FALSE;          return FALSE;
# Line 2540  for (;;) Line 3690  for (;;)
3690      /* Match a single character type; inline for speed */      /* Match a single character type; inline for speed */
3691    
3692      case OP_ANY:      case OP_ANY:
3693      if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3694          return FALSE;
3695      if (eptr++ >= md->end_subject) return FALSE;      if (eptr++ >= md->end_subject) return FALSE;
3696      ecode++;      ecode++;
3697      break;      break;
3698    
3699      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
3700      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)      if (eptr >= md->end_subject ||
3701           (md->ctypes[*eptr++] & ctype_digit) != 0)
3702        return FALSE;        return FALSE;
3703      ecode++;      ecode++;
3704      break;      break;
3705    
3706      case OP_DIGIT:      case OP_DIGIT:
3707      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)      if (eptr >= md->end_subject ||
3708           (md->ctypes[*eptr++] & ctype_digit) == 0)
3709        return FALSE;        return FALSE;
3710      ecode++;      ecode++;
3711      break;      break;
3712    
3713      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
3714      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)      if (eptr >= md->end_subject ||
3715           (md->ctypes[*eptr++] & ctype_space) != 0)
3716        return FALSE;        return FALSE;
3717      ecode++;      ecode++;
3718      break;      break;
3719    
3720      case OP_WHITESPACE:      case OP_WHITESPACE:
3721      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)      if (eptr >= md->end_subject ||
3722           (md->ctypes[*eptr++] & ctype_space) == 0)
3723        return FALSE;        return FALSE;
3724      ecode++;      ecode++;
3725      break;      break;
3726    
3727      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
3728      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)      if (eptr >= md->end_subject ||
3729           (md->ctypes[*eptr++] & ctype_word) != 0)
3730        return FALSE;        return FALSE;
3731      ecode++;      ecode++;
3732      break;      break;
3733    
3734      case OP_WORDCHAR:      case OP_WORDCHAR:
3735      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)      if (eptr >= md->end_subject ||
3736           (md->ctypes[*eptr++] & ctype_word) == 0)
3737        return FALSE;        return FALSE;
3738      ecode++;      ecode++;
3739      break;      break;
# Line 2592  for (;;) Line 3749  for (;;)
3749      case OP_REF:      case OP_REF:
3750        {        {
3751        int length;        int length;
3752        int number = ecode[1] << 1;                /* Doubled reference number */        int offset = ecode[1] << 1;                /* Doubled reference number */
3753        ecode += 2;                                /* Advance past the item */        ecode += 2;                                /* Advance past the item */
3754    
3755        if (number >= offset_top || md->offset_vector[number] < 0)        /* If the reference is unset, set the length to be longer than the amount
3756          {        of subject left; this ensures that every attempt at a match fails. We
3757          md->errorcode = PCRE_ERROR_BADREF;        can't just fail here, because of the possibility of quantifiers with zero
3758          return FALSE;        minima. */
3759          }  
3760          length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3761            md->end_subject - eptr + 1 :
3762            md->offset_vector[offset+1] - md->offset_vector[offset];
3763    
3764        length = md->offset_vector[number+1] - md->offset_vector[number];        /* Set up for repetition, or handle the non-repeated case */
3765    
3766        switch (*ecode)        switch (*ecode)
3767          {          {
# Line 2628  for (;;) Line 3788  for (;;)
3788          break;          break;
3789    
3790          default:               /* No repeat follows */          default:               /* No repeat follows */
3791          if (!match_ref(number, eptr, length, md)) return FALSE;          if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3792          eptr += length;          eptr += length;
3793          continue;              /* With the main loop */          continue;              /* With the main loop */
3794          }          }
# Line 2644  for (;;) Line 3804  for (;;)
3804    
3805        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
3806          {          {
3807          if (!match_ref(number, eptr, length, md)) return FALSE;          if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3808          eptr += length;          eptr += length;
3809          }          }
3810    
# Line 2659  for (;;) Line 3819  for (;;)
3819          {          {
3820          for (i = min;; i++)          for (i = min;; i++)
3821            {            {
3822            if (match(eptr, ecode, offset_top, md)) return TRUE;            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3823            if (i >= max || !match_ref(number, eptr, length, md))              return TRUE;
3824              if (i >= max || !match_ref(offset, eptr, length, md, ims))
3825              return FALSE;              return FALSE;
3826            eptr += length;            eptr += length;
3827            }            }
# Line 2671  for (;;) Line 3832  for (;;)
3832    
3833        else        else
3834          {          {
3835          uschar *pp = eptr;          const uschar *pp = eptr;
3836          for (i = min; i < max; i++)          for (i = min; i < max; i++)
3837            {            {
3838            if (!match_ref(number, eptr, length, md)) break;            if (!match_ref(offset, eptr, length, md, ims)) break;
3839            eptr += length;            eptr += length;
3840            }            }
3841          while (eptr >= pp)          while (eptr >= pp)
3842            {            {
3843            if (match(eptr, ecode, offset_top, md)) return TRUE;            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3844                return TRUE;
3845            eptr -= length;            eptr -= length;
3846            }            }
3847          return FALSE;          return FALSE;
# Line 2687  for (;;) Line 3849  for (;;)
3849        }        }
3850      /* Control never gets here */      /* Control never gets here */
3851    
3852    
3853    
3854      /* Match a character class, possibly repeatedly. Look past the end of the      /* Match a character class, possibly repeatedly. Look past the end of the
3855      item to see if there is repeat information following. Then obey similar      item to see if there is repeat information following. Then obey similar
3856      code to character type repeats - written out again for speed. If caseless      code to character type repeats - written out again for speed. */
     matching was set at runtime but not at compile time, we have to check both  
     versions of a character. */  
3857    
3858      case OP_CLASS:      case OP_CLASS:
3859        {        {
3860        uschar *data = ecode + 1;  /* Save for matching */        const uschar *data = ecode + 1;  /* Save for matching */
3861        ecode += 33;               /* Advance past the item */        ecode += 33;                     /* Advance past the item */
3862    
3863        switch (*ecode)        switch (*ecode)
3864          {          {
# Line 2723  for (;;) Line 3885  for (;;)
3885          break;          break;
3886    
3887          default:               /* No repeat follows */          default:               /* No repeat follows */
3888          if (eptr >= md->end_subject) return FALSE;          min = max = 1;
3889          c = *eptr++;          break;
         if ((data[c/8] & (1 << (c&7))) != 0) continue;    /* With main loop */  
         if (md->runtime_caseless)  
           {  
           c = pcre_fcc[c];  
           if ((data[c/8] & (1 << (c&7))) != 0) continue;  /* With main loop */  
           }  
         return FALSE;  
3890          }          }
3891    
3892        /* First, ensure the minimum number of matches are present. */        /* First, ensure the minimum number of matches are present. */
# Line 2741  for (;;) Line 3896  for (;;)
3896          if (eptr >= md->end_subject) return FALSE;          if (eptr >= md->end_subject) return FALSE;
3897          c = *eptr++;          c = *eptr++;
3898          if ((data[c/8] & (1 << (c&7))) != 0) continue;          if ((data[c/8] & (1 << (c&7))) != 0) continue;
         if (md->runtime_caseless)  
           {  
           c = pcre_fcc[c];  
           if ((data[c/8] & (1 << (c&7))) != 0) continue;  
           }  
3899          return FALSE;          return FALSE;
3900          }          }
3901    
# Line 2761  for (;;) Line 3911  for (;;)
3911          {          {
3912          for (i = min;; i++)          for (i = min;; i++)
3913            {            {
3914            if (match(eptr, ecode, offset_top, md)) return TRUE;            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3915                return TRUE;
3916            if (i >= max || eptr >= md->end_subject) return FALSE;            if (i >= max || eptr >= md->end_subject) return FALSE;
3917            c = *eptr++;            c = *eptr++;
3918            if ((data[c/8] & (1 << (c&7))) != 0) continue;            if ((data[c/8] & (1 << (c&7))) != 0) continue;
           if (md->runtime_caseless)  
             {  
             c = pcre_fcc[c];  
             if ((data[c/8] & (1 << (c&7))) != 0) continue;  
             }  
3919            return FALSE;            return FALSE;
3920            }            }
3921          /* Control never gets here */          /* Control never gets here */
# Line 2779  for (;;) Line 3925  for (;;)
3925    
3926        else        else
3927          {          {
3928          uschar *pp = eptr;          const uschar *pp = eptr;
3929          for (i = min; i < max; eptr++, i++)          for (i = min; i < max; eptr++, i++)
3930            {            {
3931            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
3932            c = *eptr;            c = *eptr;
3933            if ((data[c/8] & (1 << (c&7))) != 0) continue;            if ((data[c/8] & (1 << (c&7))) != 0) continue;
           if (md->runtime_caseless)  
             {  
             c = pcre_fcc[c];  
             if ((data[c/8] & (1 << (c&7))) != 0) continue;  
             }  
3934            break;            break;
3935            }            }
3936    
3937          while (eptr >= pp)          while (eptr >= pp)
3938            if (match(eptr--, ecode, offset_top, md)) return TRUE;            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3939                return TRUE;
3940          return FALSE;          return FALSE;
3941          }          }
3942        }        }
# Line 2807  for (;;) Line 3949  for (;;)
3949        register int length = ecode[1];        register int length = ecode[1];
3950        ecode += 2;        ecode += 2;
3951    
3952        #ifdef DEBUG  #ifdef DEBUG    /* Sigh. Some compilers never learn. */
3953        if (eptr >= md->end_subject)        if (eptr >= md->end_subject)
3954          printf("matching subject <null> against pattern ");          printf("matching subject <null> against pattern ");
3955        else        else
# Line 2818  for (;;) Line 3960  for (;;)
3960          }          }
3961        pchars(ecode, length, FALSE, md);        pchars(ecode, length, FALSE, md);
3962        printf("\n");        printf("\n");
3963        #endif  #endif
3964    
3965        if (length > md->end_subject - eptr) return FALSE;        if (length > md->end_subject - eptr) return FALSE;
3966        if (md->caseless)        if ((ims & PCRE_CASELESS) != 0)
3967          {          {
3968          while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;          while (length-- > 0)
3969              if (md->lcc[*ecode++] != md->lcc[*eptr++])
3970                return FALSE;
3971          }          }
3972        else        else
3973          {          {
# Line 2875  for (;;) Line 4019  for (;;)
4019      maximum. Alternatively, if maximizing, find the maximum number of      maximum. Alternatively, if maximizing, find the maximum number of
4020      characters and work backwards. */      characters and work backwards. */
4021    
4022      #ifdef DEBUG      DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4023      printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,        max, eptr));
       max, eptr);  
     #endif  
4024    
4025      if (md->caseless)      if ((ims & PCRE_CASELESS) != 0)
4026        {        {
4027        c = pcre_lcc[c];        c = md->lcc[c];
4028        for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;        for (i = 1; i <= min; i++)
4029            if (c != md->lcc[*eptr++]) return FALSE;
4030        if (min == max) continue;        if (min == max) continue;
4031        if (minimize)        if (minimize)
4032          {          {
4033          for (i = min;; i++)          for (i = min;; i++)
4034            {            {
4035            if (match(eptr, ecode, offset_top, md)) return TRUE;            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4036            if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])              return TRUE;
4037              if (i >= max || eptr >= md->end_subject ||
4038                  c != md->lcc[*eptr++])
4039              return FALSE;              return FALSE;
4040            }            }
4041          /* Control never gets here */          /* Control never gets here */
4042          }          }
4043        else        else
4044          {          {
4045          uschar *pp = eptr;          const uschar *pp = eptr;
4046          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4047            {            {
4048            if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;            if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4049            eptr++;            eptr++;
4050            }            }
4051          while (eptr >= pp)          while (eptr >= pp)
4052            if (match(eptr--, ecode, offset_top, md)) return TRUE;            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4053                return TRUE;
4054          return FALSE;          return FALSE;
4055          }          }
4056        /* Control never gets here */        /* Control never gets here */
# Line 2920  for (;;) Line 4066  for (;;)
4066          {          {
4067          for (i = min;; i++)          for (i = min;; i++)
4068            {            {
4069            if (match(eptr, ecode, offset_top, md)) return TRUE;            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4070                return TRUE;
4071            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4072            }            }
4073          /* Control never gets here */          /* Control never gets here */
4074          }          }
4075        else        else
4076          {          {
4077          uschar *pp = eptr;          const uschar *pp = eptr;
4078          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4079            {            {
4080            if (eptr >= md->end_subject || c != *eptr) break;            if (eptr >= md->end_subject || c != *eptr) break;
4081            eptr++;            eptr++;
4082            }            }
4083          while (eptr >= pp)          while (eptr >= pp)
4084           if (match(eptr--, ecode, offset_top, md)) return TRUE;           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4085               return TRUE;
4086          return FALSE;          return FALSE;
4087          }          }
4088        }        }
# Line 2943  for (;;) Line 4091  for (;;)
4091      /* Match a negated single character */      /* Match a negated single character */
4092    
4093      case OP_NOT:      case OP_NOT:
4094      if (eptr > md->end_subject) return FALSE;      if (eptr >= md->end_subject) return FALSE;
4095      ecode++;      ecode++;
4096      if (md->caseless)      if ((ims & PCRE_CASELESS) != 0)
4097        {        {
4098        if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;        if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4099        }        }
4100      else      else
4101        {        {
# Line 3002  for (;;) Line 4150  for (;;)
4150      maximum. Alternatively, if maximizing, find the maximum number of      maximum. Alternatively, if maximizing, find the maximum number of
4151      characters and work backwards. */      characters and work backwards. */
4152    
4153      #ifdef DEBUG      DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4154      printf("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,        max, eptr));
       max, eptr);  
     #endif  
4155    
4156      if (md->caseless)      if ((ims & PCRE_CASELESS) != 0)
4157        {        {
4158        c = pcre_lcc[c];        c = md->lcc[c];
4159        for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;        for (i = 1; i <= min; i++)
4160            if (c == md->lcc[*eptr++]) return FALSE;
4161        if (min == max) continue;        if (min == max) continue;
4162        if (minimize)        if (minimize)
4163          {          {
4164          for (i = min;; i++)          for (i = min;; i++)
4165